trajscore 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trajscore-1.0.0/PKG-INFO +277 -0
- trajscore-1.0.0/README.md +242 -0
- trajscore-1.0.0/agenteval/__init__.py +78 -0
- trajscore-1.0.0/agenteval/advanced.py +614 -0
- trajscore-1.0.0/agenteval/evaluator.py +113 -0
- trajscore-1.0.0/agenteval/exceptions.py +25 -0
- trajscore-1.0.0/agenteval/metrics/__init__.py +16 -0
- trajscore-1.0.0/agenteval/metrics/answer_faithfulness.py +51 -0
- trajscore-1.0.0/agenteval/metrics/base.py +34 -0
- trajscore-1.0.0/agenteval/metrics/goal_completion.py +52 -0
- trajscore-1.0.0/agenteval/metrics/loop_detection.py +52 -0
- trajscore-1.0.0/agenteval/metrics/reasoning_coherence.py +53 -0
- trajscore-1.0.0/agenteval/metrics/step_efficiency.py +47 -0
- trajscore-1.0.0/agenteval/metrics/tool_accuracy.py +61 -0
- trajscore-1.0.0/agenteval/models.py +71 -0
- trajscore-1.0.0/agenteval/watcher.py +63 -0
- trajscore-1.0.0/setup.cfg +4 -0
- trajscore-1.0.0/setup.py +42 -0
- trajscore-1.0.0/tests/test_agenteval.py +276 -0
- trajscore-1.0.0/trajscore.egg-info/PKG-INFO +277 -0
- trajscore-1.0.0/trajscore.egg-info/SOURCES.txt +22 -0
- trajscore-1.0.0/trajscore.egg-info/dependency_links.txt +1 -0
- trajscore-1.0.0/trajscore.egg-info/requires.txt +5 -0
- trajscore-1.0.0/trajscore.egg-info/top_level.txt +1 -0
trajscore-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: trajscore
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Production-grade agentic trajectory evaluation — score multi-step AI agent runs on goal completion, tool accuracy, step efficiency, reasoning coherence, loop detection, and faithfulness
|
|
5
|
+
Home-page: https://github.com/maheshmakvana/trajscore
|
|
6
|
+
Keywords: agent evaluation,trajectory evaluation,llm agent,agentic ai,ai evaluation,tool use,multi-step reasoning,agent testing,ai agent metrics,goal completion,step efficiency,loop detection,reasoning coherence,answer faithfulness,agent benchmark,ai observability,agentic benchmark,agent trajectory,llm testing,agent quality,production ai,ai quality assurance
|
|
7
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Classifier: Topic :: Software Development :: Testing
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: pydantic>=2.0
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
25
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
26
|
+
Dynamic: classifier
|
|
27
|
+
Dynamic: description
|
|
28
|
+
Dynamic: description-content-type
|
|
29
|
+
Dynamic: home-page
|
|
30
|
+
Dynamic: keywords
|
|
31
|
+
Dynamic: provides-extra
|
|
32
|
+
Dynamic: requires-dist
|
|
33
|
+
Dynamic: requires-python
|
|
34
|
+
Dynamic: summary
|
|
35
|
+
|
|
36
|
+
# agenteval
|
|
37
|
+
|
|
38
|
+
**Production-grade agentic trajectory evaluation for multi-step AI agents.**
|
|
39
|
+
|
|
40
|
+
Score any AI agent run on 6 built-in metrics, detect regressions, stream results, and integrate into CI/CD — with zero vendor lock-in.
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install trajscore  # distribution name; the import package is "agenteval"
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Why agenteval?
|
|
49
|
+
|
|
50
|
+
In 2026, every team building agentic AI faces the same problem: **you can't improve what you can't measure.** Agents fail in subtle ways — they loop, misuse tools, hallucinate answers unsupported by observations, or take twice as many steps as needed. Until now, no single library has evaluated full multi-step trajectories with structured, auditable metrics.
|
|
51
|
+
|
|
52
|
+
`agenteval` fixes this.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Quickstart
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from agenteval import (
|
|
60
|
+
Trajectory, TrajectoryStep, StepType,
|
|
61
|
+
TrajectoryEvaluator,
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
trajectory = Trajectory(
|
|
65
|
+
trajectory_id="run-001",
|
|
66
|
+
task="What is the capital of France?",
|
|
67
|
+
steps=[
|
|
68
|
+
TrajectoryStep(step_index=0, step_type=StepType.THOUGHT,
|
|
69
|
+
content="I should look this up."),
|
|
70
|
+
TrajectoryStep(step_index=1, step_type=StepType.TOOL_CALL,
|
|
71
|
+
content="search", tool_name="search",
|
|
72
|
+
tool_args={"query": "capital of France"}),
|
|
73
|
+
TrajectoryStep(step_index=2, step_type=StepType.OBSERVATION,
|
|
74
|
+
content="Paris is the capital of France."),
|
|
75
|
+
TrajectoryStep(step_index=3, step_type=StepType.FINAL_ANSWER,
|
|
76
|
+
content="The capital of France is Paris."),
|
|
77
|
+
],
|
|
78
|
+
final_answer="The capital of France is Paris.",
|
|
79
|
+
expected_tools=["search"],
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
evaluator = TrajectoryEvaluator()
|
|
83
|
+
score = evaluator.evaluate(trajectory)
|
|
84
|
+
|
|
85
|
+
print(f"Overall: {score.overall_score:.3f} Passed: {score.passed}")
|
|
86
|
+
print(score.metric_scores)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Built-in Metrics
|
|
92
|
+
|
|
93
|
+
| Metric | Description |
|
|
94
|
+
|--------|-------------|
|
|
95
|
+
| `goal_completion` | Did the agent produce a relevant final answer? |
|
|
96
|
+
| `tool_accuracy` | Did it use the right tools? (F1 vs expected_tools) |
|
|
97
|
+
| `step_efficiency` | Did it reach the goal without unnecessary steps? |
|
|
98
|
+
| `reasoning_coherence` | Do thoughts lead logically to actions? |
|
|
99
|
+
| `loop_detection` | Did the agent repeat actions or thoughts? |
|
|
100
|
+
| `answer_faithfulness` | Is the final answer grounded in observations? |
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Batch & Async Evaluation
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from agenteval import TrajectoryEvaluator
|
|
108
|
+
|
|
109
|
+
evaluator = TrajectoryEvaluator()
|
|
110
|
+
|
|
111
|
+
# Synchronous batch
|
|
112
|
+
result = evaluator.evaluate_batch(trajectories, max_workers=8)
|
|
113
|
+
|
|
114
|
+
# Async batch
|
|
115
|
+
import asyncio
|
|
116
|
+
result = asyncio.run(evaluator.aevaluate_batch(trajectories))
|
|
117
|
+
|
|
118
|
+
print(f"Pass rate: {result.pass_rate:.1%}")
|
|
119
|
+
print(f"Mean score: {result.mean_overall:.3f}")
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Advanced Features
|
|
125
|
+
|
|
126
|
+
### Caching (LRU + TTL + SHA-256)
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from agenteval.advanced import TrajectoryCache
|
|
130
|
+
|
|
131
|
+
cache = TrajectoryCache(max_size=512, ttl=600)
|
|
132
|
+
memoized_eval = cache.memoize(evaluator.evaluate)
|
|
133
|
+
score = memoized_eval(trajectory) # cached on second call
|
|
134
|
+
print(cache.stats())
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Evaluation Pipeline
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from agenteval.advanced import EvalPipeline
|
|
141
|
+
|
|
142
|
+
pipeline = (
|
|
143
|
+
EvalPipeline()
|
|
144
|
+
.filter("non_empty", lambda t: len(t.steps) > 0)
|
|
145
|
+
.map("tag_metadata", lambda t: t)
|
|
146
|
+
.with_retry("tag_metadata", retries=2)
|
|
147
|
+
)
|
|
148
|
+
cleaned = pipeline.run(trajectories)
|
|
149
|
+
print(pipeline.audit_log)
|
|
150
|
+
|
|
151
|
+
# Async
|
|
152
|
+
import asyncio
|
|
153
|
+
cleaned = asyncio.run(pipeline.arun(trajectories))
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### Declarative Validation
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
from agenteval.advanced import TrajectoryValidator, TrajectoryRule
|
|
160
|
+
|
|
161
|
+
validator = (
|
|
162
|
+
TrajectoryValidator()
|
|
163
|
+
.add_rule(TrajectoryRule("has_steps", lambda t: len(t.steps) > 0, "Need steps"))
|
|
164
|
+
.add_rule(TrajectoryRule("has_task", lambda t: bool(t.task), "Need task"))
|
|
165
|
+
)
|
|
166
|
+
violations = validator.validate(trajectory)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Rate Limiter (sync + async)
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from agenteval.advanced import RateLimiter
|
|
173
|
+
|
|
174
|
+
limiter = RateLimiter(rate=10, capacity=10) # 10 evals/s
|
|
175
|
+
if limiter.acquire():
|
|
176
|
+
    score = evaluator.evaluate(trajectory)
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Budget-Controlled Evaluation
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
from agenteval.advanced import evaluate_with_budget
|
|
183
|
+
scores = evaluate_with_budget(trajectories, evaluator.evaluate, budget_seconds=5.0)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Streaming Results
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from agenteval.advanced import stream_scores, scores_to_ndjson
|
|
190
|
+
|
|
191
|
+
for score in stream_scores(trajectories, evaluator.evaluate):
|
|
192
|
+
    print(score.trajectory_id, score.overall_score)
|
|
193
|
+
|
|
194
|
+
# NDJSON stream
|
|
195
|
+
for line in scores_to_ndjson(trajectories, evaluator.evaluate):
|
|
196
|
+
    print(line)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Diff & Regression Tracking
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
from agenteval.advanced import diff_results, RegressionTracker
|
|
203
|
+
|
|
204
|
+
tracker = RegressionTracker(window=10)
|
|
205
|
+
tracker.record(result_v1)
|
|
206
|
+
tracker.record(result_v2)
|
|
207
|
+
print(tracker.trend()) # "improving" / "declining" / "stable"
|
|
208
|
+
diff = tracker.latest_regression()
|
|
209
|
+
print(diff.summary())
|
|
210
|
+
print(diff.to_json())
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
### Observability
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
from agenteval.advanced import EvaluationProfiler, DriftDetector, EvaluationReport
|
|
217
|
+
|
|
218
|
+
profiler = EvaluationProfiler()
|
|
219
|
+
scored = profiler.profile(evaluator.evaluate)(trajectory)
|
|
220
|
+
print(profiler.report())
|
|
221
|
+
|
|
222
|
+
detector = DriftDetector(threshold=0.05)
|
|
223
|
+
detector.set_baseline(result_v1)
|
|
224
|
+
print(detector.detect(result_v2))
|
|
225
|
+
|
|
226
|
+
report = EvaluationReport(result)
|
|
227
|
+
print(report.to_json())
|
|
228
|
+
print(report.to_csv())
|
|
229
|
+
print(report.to_markdown())
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### Audit Log & Cost Ledger
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
from agenteval.advanced import AuditLog, CostLedger
|
|
236
|
+
|
|
237
|
+
log = AuditLog()
|
|
238
|
+
log.log("eval_start", {"run_id": "ci-42"})
|
|
239
|
+
|
|
240
|
+
ledger = CostLedger()
|
|
241
|
+
ledger.record("t1", tokens=1200, cost_usd=0.024)
|
|
242
|
+
print(ledger.summary())
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
247
|
+
## Live Trajectory Watcher
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
from agenteval import TrajectoryWatcher, TrajectoryStep, StepType
|
|
251
|
+
|
|
252
|
+
watcher = TrajectoryWatcher(
|
|
253
|
+
trajectory_id="live-001",
|
|
254
|
+
task="Summarize the paper",
|
|
255
|
+
on_step=lambda step, idx: print(f"Step {idx}: {step.step_type}"),
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
watcher.add_step(TrajectoryStep(step_index=0, step_type=StepType.THOUGHT, content="Reading..."))
|
|
259
|
+
trajectory = watcher.finish("Summary complete.")
|
|
260
|
+
score = evaluator.evaluate(trajectory)
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
## Installation
|
|
266
|
+
|
|
267
|
+
```bash
|
|
268
|
+
pip install trajscore  # distribution name; the import package is "agenteval"
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
Python 3.8+ · One runtime dependency: pydantic>=2.0 (everything else is stdlib)
|
|
272
|
+
|
|
273
|
+
---
|
|
274
|
+
|
|
275
|
+
## License
|
|
276
|
+
|
|
277
|
+
MIT
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
# agenteval
|
|
2
|
+
|
|
3
|
+
**Production-grade agentic trajectory evaluation for multi-step AI agents.**
|
|
4
|
+
|
|
5
|
+
Score any AI agent run on 6 built-in metrics, detect regressions, stream results, and integrate into CI/CD — with zero vendor lock-in.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install trajscore  # distribution name; the import package is "agenteval"
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Why agenteval?
|
|
14
|
+
|
|
15
|
+
In 2026, every team building agentic AI faces the same problem: **you can't improve what you can't measure.** Agents fail in subtle ways — they loop, misuse tools, hallucinate answers unsupported by observations, or take twice as many steps as needed. Until now, no single library has evaluated full multi-step trajectories with structured, auditable metrics.
|
|
16
|
+
|
|
17
|
+
`agenteval` fixes this.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Quickstart
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from agenteval import (
|
|
25
|
+
Trajectory, TrajectoryStep, StepType,
|
|
26
|
+
TrajectoryEvaluator,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
trajectory = Trajectory(
|
|
30
|
+
trajectory_id="run-001",
|
|
31
|
+
task="What is the capital of France?",
|
|
32
|
+
steps=[
|
|
33
|
+
TrajectoryStep(step_index=0, step_type=StepType.THOUGHT,
|
|
34
|
+
content="I should look this up."),
|
|
35
|
+
TrajectoryStep(step_index=1, step_type=StepType.TOOL_CALL,
|
|
36
|
+
content="search", tool_name="search",
|
|
37
|
+
tool_args={"query": "capital of France"}),
|
|
38
|
+
TrajectoryStep(step_index=2, step_type=StepType.OBSERVATION,
|
|
39
|
+
content="Paris is the capital of France."),
|
|
40
|
+
TrajectoryStep(step_index=3, step_type=StepType.FINAL_ANSWER,
|
|
41
|
+
content="The capital of France is Paris."),
|
|
42
|
+
],
|
|
43
|
+
final_answer="The capital of France is Paris.",
|
|
44
|
+
expected_tools=["search"],
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
evaluator = TrajectoryEvaluator()
|
|
48
|
+
score = evaluator.evaluate(trajectory)
|
|
49
|
+
|
|
50
|
+
print(f"Overall: {score.overall_score:.3f} Passed: {score.passed}")
|
|
51
|
+
print(score.metric_scores)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Built-in Metrics
|
|
57
|
+
|
|
58
|
+
| Metric | Description |
|
|
59
|
+
|--------|-------------|
|
|
60
|
+
| `goal_completion` | Did the agent produce a relevant final answer? |
|
|
61
|
+
| `tool_accuracy` | Did it use the right tools? (F1 vs expected_tools) |
|
|
62
|
+
| `step_efficiency` | Did it reach the goal without unnecessary steps? |
|
|
63
|
+
| `reasoning_coherence` | Do thoughts lead logically to actions? |
|
|
64
|
+
| `loop_detection` | Did the agent repeat actions or thoughts? |
|
|
65
|
+
| `answer_faithfulness` | Is the final answer grounded in observations? |
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Batch & Async Evaluation
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from agenteval import TrajectoryEvaluator
|
|
73
|
+
|
|
74
|
+
evaluator = TrajectoryEvaluator()
|
|
75
|
+
|
|
76
|
+
# Synchronous batch
|
|
77
|
+
result = evaluator.evaluate_batch(trajectories, max_workers=8)
|
|
78
|
+
|
|
79
|
+
# Async batch
|
|
80
|
+
import asyncio
|
|
81
|
+
result = asyncio.run(evaluator.aevaluate_batch(trajectories))
|
|
82
|
+
|
|
83
|
+
print(f"Pass rate: {result.pass_rate:.1%}")
|
|
84
|
+
print(f"Mean score: {result.mean_overall:.3f}")
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Advanced Features
|
|
90
|
+
|
|
91
|
+
### Caching (LRU + TTL + SHA-256)
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from agenteval.advanced import TrajectoryCache
|
|
95
|
+
|
|
96
|
+
cache = TrajectoryCache(max_size=512, ttl=600)
|
|
97
|
+
memoized_eval = cache.memoize(evaluator.evaluate)
|
|
98
|
+
score = memoized_eval(trajectory) # cached on second call
|
|
99
|
+
print(cache.stats())
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Evaluation Pipeline
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from agenteval.advanced import EvalPipeline
|
|
106
|
+
|
|
107
|
+
pipeline = (
|
|
108
|
+
EvalPipeline()
|
|
109
|
+
.filter("non_empty", lambda t: len(t.steps) > 0)
|
|
110
|
+
.map("tag_metadata", lambda t: t)
|
|
111
|
+
.with_retry("tag_metadata", retries=2)
|
|
112
|
+
)
|
|
113
|
+
cleaned = pipeline.run(trajectories)
|
|
114
|
+
print(pipeline.audit_log)
|
|
115
|
+
|
|
116
|
+
# Async
|
|
117
|
+
import asyncio
|
|
118
|
+
cleaned = asyncio.run(pipeline.arun(trajectories))
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Declarative Validation
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from agenteval.advanced import TrajectoryValidator, TrajectoryRule
|
|
125
|
+
|
|
126
|
+
validator = (
|
|
127
|
+
TrajectoryValidator()
|
|
128
|
+
.add_rule(TrajectoryRule("has_steps", lambda t: len(t.steps) > 0, "Need steps"))
|
|
129
|
+
.add_rule(TrajectoryRule("has_task", lambda t: bool(t.task), "Need task"))
|
|
130
|
+
)
|
|
131
|
+
violations = validator.validate(trajectory)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Rate Limiter (sync + async)
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from agenteval.advanced import RateLimiter
|
|
138
|
+
|
|
139
|
+
limiter = RateLimiter(rate=10, capacity=10) # 10 evals/s
|
|
140
|
+
if limiter.acquire():
|
|
141
|
+
    score = evaluator.evaluate(trajectory)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Budget-Controlled Evaluation
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from agenteval.advanced import evaluate_with_budget
|
|
148
|
+
scores = evaluate_with_budget(trajectories, evaluator.evaluate, budget_seconds=5.0)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Streaming Results
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
from agenteval.advanced import stream_scores, scores_to_ndjson
|
|
155
|
+
|
|
156
|
+
for score in stream_scores(trajectories, evaluator.evaluate):
|
|
157
|
+
    print(score.trajectory_id, score.overall_score)
|
|
158
|
+
|
|
159
|
+
# NDJSON stream
|
|
160
|
+
for line in scores_to_ndjson(trajectories, evaluator.evaluate):
|
|
161
|
+
    print(line)
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### Diff & Regression Tracking
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
from agenteval.advanced import diff_results, RegressionTracker
|
|
168
|
+
|
|
169
|
+
tracker = RegressionTracker(window=10)
|
|
170
|
+
tracker.record(result_v1)
|
|
171
|
+
tracker.record(result_v2)
|
|
172
|
+
print(tracker.trend()) # "improving" / "declining" / "stable"
|
|
173
|
+
diff = tracker.latest_regression()
|
|
174
|
+
print(diff.summary())
|
|
175
|
+
print(diff.to_json())
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Observability
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
from agenteval.advanced import EvaluationProfiler, DriftDetector, EvaluationReport
|
|
182
|
+
|
|
183
|
+
profiler = EvaluationProfiler()
|
|
184
|
+
scored = profiler.profile(evaluator.evaluate)(trajectory)
|
|
185
|
+
print(profiler.report())
|
|
186
|
+
|
|
187
|
+
detector = DriftDetector(threshold=0.05)
|
|
188
|
+
detector.set_baseline(result_v1)
|
|
189
|
+
print(detector.detect(result_v2))
|
|
190
|
+
|
|
191
|
+
report = EvaluationReport(result)
|
|
192
|
+
print(report.to_json())
|
|
193
|
+
print(report.to_csv())
|
|
194
|
+
print(report.to_markdown())
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Audit Log & Cost Ledger
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
from agenteval.advanced import AuditLog, CostLedger
|
|
201
|
+
|
|
202
|
+
log = AuditLog()
|
|
203
|
+
log.log("eval_start", {"run_id": "ci-42"})
|
|
204
|
+
|
|
205
|
+
ledger = CostLedger()
|
|
206
|
+
ledger.record("t1", tokens=1200, cost_usd=0.024)
|
|
207
|
+
print(ledger.summary())
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## Live Trajectory Watcher
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
from agenteval import TrajectoryWatcher, TrajectoryStep, StepType
|
|
216
|
+
|
|
217
|
+
watcher = TrajectoryWatcher(
|
|
218
|
+
trajectory_id="live-001",
|
|
219
|
+
task="Summarize the paper",
|
|
220
|
+
on_step=lambda step, idx: print(f"Step {idx}: {step.step_type}"),
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
watcher.add_step(TrajectoryStep(step_index=0, step_type=StepType.THOUGHT, content="Reading..."))
|
|
224
|
+
trajectory = watcher.finish("Summary complete.")
|
|
225
|
+
score = evaluator.evaluate(trajectory)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## Installation
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
pip install trajscore  # distribution name; the import package is "agenteval"
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
Python 3.8+ · One runtime dependency: pydantic>=2.0 (everything else is stdlib)
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## License
|
|
241
|
+
|
|
242
|
+
MIT
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""
|
|
2
|
+
agenteval — Agentic Multi-Step Trajectory Evaluation.
|
|
3
|
+
|
|
4
|
+
Evaluate any AI agent trajectory with production-grade metrics:
|
|
5
|
+
goal completion, tool accuracy, step efficiency, reasoning coherence,
|
|
6
|
+
loop detection, and answer faithfulness.
|
|
7
|
+
"""
|
|
8
|
+
from agenteval.models import (
|
|
9
|
+
StepType,
|
|
10
|
+
TrajectoryStep,
|
|
11
|
+
Trajectory,
|
|
12
|
+
StepScore,
|
|
13
|
+
TrajectoryScore,
|
|
14
|
+
EvaluationResult,
|
|
15
|
+
)
|
|
16
|
+
from agenteval.evaluator import TrajectoryEvaluator
|
|
17
|
+
from agenteval.watcher import TrajectoryWatcher
|
|
18
|
+
from agenteval.exceptions import (
|
|
19
|
+
AgentEvalError,
|
|
20
|
+
TrajectoryValidationError,
|
|
21
|
+
EvaluationError,
|
|
22
|
+
MetricNotFoundError,
|
|
23
|
+
BudgetExceededError,
|
|
24
|
+
SchemaViolationError,
|
|
25
|
+
)
|
|
26
|
+
from agenteval.metrics import (
|
|
27
|
+
GoalCompletionMetric,
|
|
28
|
+
ToolAccuracyMetric,
|
|
29
|
+
StepEfficiencyMetric,
|
|
30
|
+
ReasoningCoherenceMetric,
|
|
31
|
+
LoopDetectionMetric,
|
|
32
|
+
AnswerFaithfulnessMetric,
|
|
33
|
+
)
|
|
34
|
+
from agenteval.advanced import (
|
|
35
|
+
TrajectoryCache,
|
|
36
|
+
EvalPipeline,
|
|
37
|
+
TrajectoryRule,
|
|
38
|
+
TrajectoryValidator,
|
|
39
|
+
ConfidenceScorer,
|
|
40
|
+
RateLimiter,
|
|
41
|
+
CancellationToken,
|
|
42
|
+
abatch_evaluate,
|
|
43
|
+
batch_evaluate,
|
|
44
|
+
evaluate_with_budget,
|
|
45
|
+
EvaluationProfiler,
|
|
46
|
+
DriftDetector,
|
|
47
|
+
EvaluationReport,
|
|
48
|
+
stream_scores,
|
|
49
|
+
scores_to_ndjson,
|
|
50
|
+
ScoreDiff,
|
|
51
|
+
diff_results,
|
|
52
|
+
RegressionTracker,
|
|
53
|
+
AuditLog,
|
|
54
|
+
CostLedger,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
__version__ = "1.0.0"
|
|
58
|
+
|
|
59
|
+
__all__ = [
|
|
60
|
+
# Models
|
|
61
|
+
"StepType", "TrajectoryStep", "Trajectory", "StepScore",
|
|
62
|
+
"TrajectoryScore", "EvaluationResult",
|
|
63
|
+
# Core
|
|
64
|
+
"TrajectoryEvaluator", "TrajectoryWatcher",
|
|
65
|
+
# Exceptions
|
|
66
|
+
"AgentEvalError", "TrajectoryValidationError", "EvaluationError",
|
|
67
|
+
"MetricNotFoundError", "BudgetExceededError", "SchemaViolationError",
|
|
68
|
+
# Metrics
|
|
69
|
+
"GoalCompletionMetric", "ToolAccuracyMetric", "StepEfficiencyMetric",
|
|
70
|
+
"ReasoningCoherenceMetric", "LoopDetectionMetric", "AnswerFaithfulnessMetric",
|
|
71
|
+
# Advanced
|
|
72
|
+
"TrajectoryCache", "EvalPipeline", "TrajectoryRule", "TrajectoryValidator",
|
|
73
|
+
"ConfidenceScorer", "RateLimiter", "CancellationToken",
|
|
74
|
+
"abatch_evaluate", "batch_evaluate", "evaluate_with_budget",
|
|
75
|
+
"EvaluationProfiler", "DriftDetector", "EvaluationReport",
|
|
76
|
+
"stream_scores", "scores_to_ndjson", "ScoreDiff", "diff_results",
|
|
77
|
+
"RegressionTracker", "AuditLog", "CostLedger",
|
|
78
|
+
]
|