trajscore 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,277 @@
1
+ Metadata-Version: 2.4
2
+ Name: trajscore
3
+ Version: 1.0.0
4
+ Summary: Production-grade agentic trajectory evaluation — score multi-step AI agent runs on goal completion, tool accuracy, step efficiency, reasoning coherence, loop detection, and faithfulness
5
+ Home-page: https://github.com/maheshmakvana/trajscore
6
+ Keywords: agent evaluation,trajectory evaluation,llm agent,agentic ai,ai evaluation,tool use,multi-step reasoning,agent testing,ai agent metrics,goal completion,step efficiency,loop detection,reasoning coherence,answer faithfulness,agent benchmark,ai observability,agentic benchmark,agent trajectory,llm testing,agent quality,production ai,ai quality assurance
7
+ Classifier: Development Status :: 5 - Production/Stable
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Classifier: Topic :: Software Development :: Testing
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: pydantic>=2.0
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=7.0; extra == "dev"
25
+ Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
26
+ Dynamic: classifier
27
+ Dynamic: description
28
+ Dynamic: description-content-type
29
+ Dynamic: home-page
30
+ Dynamic: keywords
31
+ Dynamic: provides-extra
32
+ Dynamic: requires-dist
33
+ Dynamic: requires-python
34
+ Dynamic: summary
35
+
36
+ # agenteval
37
+
38
+ **Production-grade agentic trajectory evaluation for multi-step AI agents.**
39
+
40
+ Score any AI agent run on 6 built-in metrics, detect regressions, stream results, and integrate into CI/CD — with zero vendor lock-in.
41
+
42
+ ```bash
43
+ pip install trajscore  # distribution name; the import package is `agenteval`
44
+ ```
45
+
46
+ ---
47
+
48
+ ## Why agenteval?
49
+
50
+ In 2026, every team building agentic AI faces the same problem: **you can't improve what you can't measure.** Agents fail in subtle ways — they loop, misuse tools, hallucinate answers unsupported by observations, or take twice as many steps as needed. Until now, no single library has evaluated full multi-step trajectories with structured, auditable metrics.
51
+
52
+ `agenteval` fixes this.
53
+
54
+ ---
55
+
56
+ ## Quickstart
57
+
58
+ ```python
59
+ from agenteval import (
60
+ Trajectory, TrajectoryStep, StepType,
61
+ TrajectoryEvaluator,
62
+ )
63
+
64
+ trajectory = Trajectory(
65
+ trajectory_id="run-001",
66
+ task="What is the capital of France?",
67
+ steps=[
68
+ TrajectoryStep(step_index=0, step_type=StepType.THOUGHT,
69
+ content="I should look this up."),
70
+ TrajectoryStep(step_index=1, step_type=StepType.TOOL_CALL,
71
+ content="search", tool_name="search",
72
+ tool_args={"query": "capital of France"}),
73
+ TrajectoryStep(step_index=2, step_type=StepType.OBSERVATION,
74
+ content="Paris is the capital of France."),
75
+ TrajectoryStep(step_index=3, step_type=StepType.FINAL_ANSWER,
76
+ content="The capital of France is Paris."),
77
+ ],
78
+ final_answer="The capital of France is Paris.",
79
+ expected_tools=["search"],
80
+ )
81
+
82
+ evaluator = TrajectoryEvaluator()
83
+ score = evaluator.evaluate(trajectory)
84
+
85
+ print(f"Overall: {score.overall_score:.3f} Passed: {score.passed}")
86
+ print(score.metric_scores)
87
+ ```
88
+
89
+ ---
90
+
91
+ ## Built-in Metrics
92
+
93
+ | Metric | Description |
94
+ |--------|-------------|
95
+ | `goal_completion` | Did the agent produce a relevant final answer? |
96
+ | `tool_accuracy` | Did it use the right tools? (F1 vs expected_tools) |
97
+ | `step_efficiency` | Did it reach the goal without unnecessary steps? |
98
+ | `reasoning_coherence` | Do thoughts lead logically to actions? |
99
+ | `loop_detection` | Did the agent repeat actions or thoughts? |
100
+ | `answer_faithfulness` | Is the final answer grounded in observations? |
101
+
102
+ ---
103
+
104
+ ## Batch & Async Evaluation
105
+
106
+ ```python
107
+ from agenteval import TrajectoryEvaluator
108
+
109
+ evaluator = TrajectoryEvaluator()
110
+
111
+ # Synchronous batch
112
+ result = evaluator.evaluate_batch(trajectories, max_workers=8)
113
+
114
+ # Async batch
115
+ import asyncio
116
+ result = asyncio.run(evaluator.aevaluate_batch(trajectories))
117
+
118
+ print(f"Pass rate: {result.pass_rate:.1%}")
119
+ print(f"Mean score: {result.mean_overall:.3f}")
120
+ ```
121
+
122
+ ---
123
+
124
+ ## Advanced Features
125
+
126
+ ### Caching (LRU + TTL + SHA-256)
127
+
128
+ ```python
129
+ from agenteval.advanced import TrajectoryCache
130
+
131
+ cache = TrajectoryCache(max_size=512, ttl=600)
132
+ memoized_eval = cache.memoize(evaluator.evaluate)
133
+ score = memoized_eval(trajectory) # cached on second call
134
+ print(cache.stats())
135
+ ```
136
+
137
+ ### Evaluation Pipeline
138
+
139
+ ```python
140
+ from agenteval.advanced import EvalPipeline
141
+
142
+ pipeline = (
143
+ EvalPipeline()
144
+ .filter("non_empty", lambda t: len(t.steps) > 0)
145
+ .map("tag_metadata", lambda t: t)
146
+ .with_retry("tag_metadata", retries=2)
147
+ )
148
+ cleaned = pipeline.run(trajectories)
149
+ print(pipeline.audit_log)
150
+
151
+ # Async
152
+ import asyncio
153
+ cleaned = asyncio.run(pipeline.arun(trajectories))
154
+ ```
155
+
156
+ ### Declarative Validation
157
+
158
+ ```python
159
+ from agenteval.advanced import TrajectoryValidator, TrajectoryRule
160
+
161
+ validator = (
162
+ TrajectoryValidator()
163
+ .add_rule(TrajectoryRule("has_steps", lambda t: len(t.steps) > 0, "Need steps"))
164
+ .add_rule(TrajectoryRule("has_task", lambda t: bool(t.task), "Need task"))
165
+ )
166
+ violations = validator.validate(trajectory)
167
+ ```
168
+
169
+ ### Rate Limiter (sync + async)
170
+
171
+ ```python
172
+ from agenteval.advanced import RateLimiter
173
+
174
+ limiter = RateLimiter(rate=10, capacity=10) # 10 evals/s
175
+ if limiter.acquire():
176
+ score = evaluator.evaluate(trajectory)
177
+ ```
178
+
179
+ ### Budget-Controlled Evaluation
180
+
181
+ ```python
182
+ from agenteval.advanced import evaluate_with_budget
183
+ scores = evaluate_with_budget(trajectories, evaluator.evaluate, budget_seconds=5.0)
184
+ ```
185
+
186
+ ### Streaming Results
187
+
188
+ ```python
189
+ from agenteval.advanced import stream_scores, scores_to_ndjson
190
+
191
+ for score in stream_scores(trajectories, evaluator.evaluate):
192
+ print(score.trajectory_id, score.overall_score)
193
+
194
+ # NDJSON stream
195
+ for line in scores_to_ndjson(trajectories, evaluator.evaluate):
196
+ print(line)
197
+ ```
198
+
199
+ ### Diff & Regression Tracking
200
+
201
+ ```python
202
+ from agenteval.advanced import diff_results, RegressionTracker
203
+
204
+ tracker = RegressionTracker(window=10)
205
+ tracker.record(result_v1)
206
+ tracker.record(result_v2)
207
+ print(tracker.trend()) # "improving" / "declining" / "stable"
208
+ diff = tracker.latest_regression()
209
+ print(diff.summary())
210
+ print(diff.to_json())
211
+ ```
212
+
213
+ ### Observability
214
+
215
+ ```python
216
+ from agenteval.advanced import EvaluationProfiler, DriftDetector, EvaluationReport
217
+
218
+ profiler = EvaluationProfiler()
219
+ scored = profiler.profile(evaluator.evaluate)(trajectory)
220
+ print(profiler.report())
221
+
222
+ detector = DriftDetector(threshold=0.05)
223
+ detector.set_baseline(result_v1)
224
+ print(detector.detect(result_v2))
225
+
226
+ report = EvaluationReport(result)
227
+ print(report.to_json())
228
+ print(report.to_csv())
229
+ print(report.to_markdown())
230
+ ```
231
+
232
+ ### Audit Log & Cost Ledger
233
+
234
+ ```python
235
+ from agenteval.advanced import AuditLog, CostLedger
236
+
237
+ log = AuditLog()
238
+ log.log("eval_start", {"run_id": "ci-42"})
239
+
240
+ ledger = CostLedger()
241
+ ledger.record("t1", tokens=1200, cost_usd=0.024)
242
+ print(ledger.summary())
243
+ ```
244
+
245
+ ---
246
+
247
+ ## Live Trajectory Watcher
248
+
249
+ ```python
250
+ from agenteval import TrajectoryWatcher, TrajectoryStep, StepType
251
+
252
+ watcher = TrajectoryWatcher(
253
+ trajectory_id="live-001",
254
+ task="Summarize the paper",
255
+ on_step=lambda step, idx: print(f"Step {idx}: {step.step_type}"),
256
+ )
257
+
258
+ watcher.add_step(TrajectoryStep(step_index=0, step_type=StepType.THOUGHT, content="Reading..."))
259
+ trajectory = watcher.finish("Summary complete.")
260
+ score = evaluator.evaluate(trajectory)
261
+ ```
262
+
263
+ ---
264
+
265
+ ## Installation
266
+
267
+ ```bash
268
+ pip install trajscore  # distribution name; the import package is `agenteval`
269
+ ```
270
+
271
+ Python 3.8+ · One runtime dependency: `pydantic>=2.0` (everything else is stdlib)
272
+
273
+ ---
274
+
275
+ ## License
276
+
277
+ MIT
@@ -0,0 +1,242 @@
1
+ # agenteval
2
+
3
+ **Production-grade agentic trajectory evaluation for multi-step AI agents.**
4
+
5
+ Score any AI agent run on 6 built-in metrics, detect regressions, stream results, and integrate into CI/CD — with zero vendor lock-in.
6
+
7
+ ```bash
8
+ pip install trajscore  # distribution name; the import package is `agenteval`
9
+ ```
10
+
11
+ ---
12
+
13
+ ## Why agenteval?
14
+
15
+ In 2026, every team building agentic AI faces the same problem: **you can't improve what you can't measure.** Agents fail in subtle ways — they loop, misuse tools, hallucinate answers unsupported by observations, or take twice as many steps as needed. Until now, no single library has evaluated full multi-step trajectories with structured, auditable metrics.
16
+
17
+ `agenteval` fixes this.
18
+
19
+ ---
20
+
21
+ ## Quickstart
22
+
23
+ ```python
24
+ from agenteval import (
25
+ Trajectory, TrajectoryStep, StepType,
26
+ TrajectoryEvaluator,
27
+ )
28
+
29
+ trajectory = Trajectory(
30
+ trajectory_id="run-001",
31
+ task="What is the capital of France?",
32
+ steps=[
33
+ TrajectoryStep(step_index=0, step_type=StepType.THOUGHT,
34
+ content="I should look this up."),
35
+ TrajectoryStep(step_index=1, step_type=StepType.TOOL_CALL,
36
+ content="search", tool_name="search",
37
+ tool_args={"query": "capital of France"}),
38
+ TrajectoryStep(step_index=2, step_type=StepType.OBSERVATION,
39
+ content="Paris is the capital of France."),
40
+ TrajectoryStep(step_index=3, step_type=StepType.FINAL_ANSWER,
41
+ content="The capital of France is Paris."),
42
+ ],
43
+ final_answer="The capital of France is Paris.",
44
+ expected_tools=["search"],
45
+ )
46
+
47
+ evaluator = TrajectoryEvaluator()
48
+ score = evaluator.evaluate(trajectory)
49
+
50
+ print(f"Overall: {score.overall_score:.3f} Passed: {score.passed}")
51
+ print(score.metric_scores)
52
+ ```
53
+
54
+ ---
55
+
56
+ ## Built-in Metrics
57
+
58
+ | Metric | Description |
59
+ |--------|-------------|
60
+ | `goal_completion` | Did the agent produce a relevant final answer? |
61
+ | `tool_accuracy` | Did it use the right tools? (F1 vs expected_tools) |
62
+ | `step_efficiency` | Did it reach the goal without unnecessary steps? |
63
+ | `reasoning_coherence` | Do thoughts lead logically to actions? |
64
+ | `loop_detection` | Did the agent repeat actions or thoughts? |
65
+ | `answer_faithfulness` | Is the final answer grounded in observations? |
66
+
67
+ ---
68
+
69
+ ## Batch & Async Evaluation
70
+
71
+ ```python
72
+ from agenteval import TrajectoryEvaluator
73
+
74
+ evaluator = TrajectoryEvaluator()
75
+
76
+ # Synchronous batch
77
+ result = evaluator.evaluate_batch(trajectories, max_workers=8)
78
+
79
+ # Async batch
80
+ import asyncio
81
+ result = asyncio.run(evaluator.aevaluate_batch(trajectories))
82
+
83
+ print(f"Pass rate: {result.pass_rate:.1%}")
84
+ print(f"Mean score: {result.mean_overall:.3f}")
85
+ ```
86
+
87
+ ---
88
+
89
+ ## Advanced Features
90
+
91
+ ### Caching (LRU + TTL + SHA-256)
92
+
93
+ ```python
94
+ from agenteval.advanced import TrajectoryCache
95
+
96
+ cache = TrajectoryCache(max_size=512, ttl=600)
97
+ memoized_eval = cache.memoize(evaluator.evaluate)
98
+ score = memoized_eval(trajectory) # cached on second call
99
+ print(cache.stats())
100
+ ```
101
+
102
+ ### Evaluation Pipeline
103
+
104
+ ```python
105
+ from agenteval.advanced import EvalPipeline
106
+
107
+ pipeline = (
108
+ EvalPipeline()
109
+ .filter("non_empty", lambda t: len(t.steps) > 0)
110
+ .map("tag_metadata", lambda t: t)
111
+ .with_retry("tag_metadata", retries=2)
112
+ )
113
+ cleaned = pipeline.run(trajectories)
114
+ print(pipeline.audit_log)
115
+
116
+ # Async
117
+ import asyncio
118
+ cleaned = asyncio.run(pipeline.arun(trajectories))
119
+ ```
120
+
121
+ ### Declarative Validation
122
+
123
+ ```python
124
+ from agenteval.advanced import TrajectoryValidator, TrajectoryRule
125
+
126
+ validator = (
127
+ TrajectoryValidator()
128
+ .add_rule(TrajectoryRule("has_steps", lambda t: len(t.steps) > 0, "Need steps"))
129
+ .add_rule(TrajectoryRule("has_task", lambda t: bool(t.task), "Need task"))
130
+ )
131
+ violations = validator.validate(trajectory)
132
+ ```
133
+
134
+ ### Rate Limiter (sync + async)
135
+
136
+ ```python
137
+ from agenteval.advanced import RateLimiter
138
+
139
+ limiter = RateLimiter(rate=10, capacity=10) # 10 evals/s
140
+ if limiter.acquire():
141
+ score = evaluator.evaluate(trajectory)
142
+ ```
143
+
144
+ ### Budget-Controlled Evaluation
145
+
146
+ ```python
147
+ from agenteval.advanced import evaluate_with_budget
148
+ scores = evaluate_with_budget(trajectories, evaluator.evaluate, budget_seconds=5.0)
149
+ ```
150
+
151
+ ### Streaming Results
152
+
153
+ ```python
154
+ from agenteval.advanced import stream_scores, scores_to_ndjson
155
+
156
+ for score in stream_scores(trajectories, evaluator.evaluate):
157
+ print(score.trajectory_id, score.overall_score)
158
+
159
+ # NDJSON stream
160
+ for line in scores_to_ndjson(trajectories, evaluator.evaluate):
161
+ print(line)
162
+ ```
163
+
164
+ ### Diff & Regression Tracking
165
+
166
+ ```python
167
+ from agenteval.advanced import diff_results, RegressionTracker
168
+
169
+ tracker = RegressionTracker(window=10)
170
+ tracker.record(result_v1)
171
+ tracker.record(result_v2)
172
+ print(tracker.trend()) # "improving" / "declining" / "stable"
173
+ diff = tracker.latest_regression()
174
+ print(diff.summary())
175
+ print(diff.to_json())
176
+ ```
177
+
178
+ ### Observability
179
+
180
+ ```python
181
+ from agenteval.advanced import EvaluationProfiler, DriftDetector, EvaluationReport
182
+
183
+ profiler = EvaluationProfiler()
184
+ scored = profiler.profile(evaluator.evaluate)(trajectory)
185
+ print(profiler.report())
186
+
187
+ detector = DriftDetector(threshold=0.05)
188
+ detector.set_baseline(result_v1)
189
+ print(detector.detect(result_v2))
190
+
191
+ report = EvaluationReport(result)
192
+ print(report.to_json())
193
+ print(report.to_csv())
194
+ print(report.to_markdown())
195
+ ```
196
+
197
+ ### Audit Log & Cost Ledger
198
+
199
+ ```python
200
+ from agenteval.advanced import AuditLog, CostLedger
201
+
202
+ log = AuditLog()
203
+ log.log("eval_start", {"run_id": "ci-42"})
204
+
205
+ ledger = CostLedger()
206
+ ledger.record("t1", tokens=1200, cost_usd=0.024)
207
+ print(ledger.summary())
208
+ ```
209
+
210
+ ---
211
+
212
+ ## Live Trajectory Watcher
213
+
214
+ ```python
215
+ from agenteval import TrajectoryWatcher, TrajectoryStep, StepType
216
+
217
+ watcher = TrajectoryWatcher(
218
+ trajectory_id="live-001",
219
+ task="Summarize the paper",
220
+ on_step=lambda step, idx: print(f"Step {idx}: {step.step_type}"),
221
+ )
222
+
223
+ watcher.add_step(TrajectoryStep(step_index=0, step_type=StepType.THOUGHT, content="Reading..."))
224
+ trajectory = watcher.finish("Summary complete.")
225
+ score = evaluator.evaluate(trajectory)
226
+ ```
227
+
228
+ ---
229
+
230
+ ## Installation
231
+
232
+ ```bash
233
+ pip install trajscore  # distribution name; the import package is `agenteval`
234
+ ```
235
+
236
+ Python 3.8+ · One runtime dependency: `pydantic>=2.0` (everything else is stdlib)
237
+
238
+ ---
239
+
240
+ ## License
241
+
242
+ MIT
@@ -0,0 +1,78 @@
1
+ """
2
+ agenteval — Agentic Multi-Step Trajectory Evaluation.
3
+
4
+ Evaluate any AI agent trajectory with production-grade metrics:
5
+ goal completion, tool accuracy, step efficiency, reasoning coherence,
6
+ loop detection, and answer faithfulness.
7
+ """
8
+ from agenteval.models import (
9
+ StepType,
10
+ TrajectoryStep,
11
+ Trajectory,
12
+ StepScore,
13
+ TrajectoryScore,
14
+ EvaluationResult,
15
+ )
16
+ from agenteval.evaluator import TrajectoryEvaluator
17
+ from agenteval.watcher import TrajectoryWatcher
18
+ from agenteval.exceptions import (
19
+ AgentEvalError,
20
+ TrajectoryValidationError,
21
+ EvaluationError,
22
+ MetricNotFoundError,
23
+ BudgetExceededError,
24
+ SchemaViolationError,
25
+ )
26
+ from agenteval.metrics import (
27
+ GoalCompletionMetric,
28
+ ToolAccuracyMetric,
29
+ StepEfficiencyMetric,
30
+ ReasoningCoherenceMetric,
31
+ LoopDetectionMetric,
32
+ AnswerFaithfulnessMetric,
33
+ )
34
+ from agenteval.advanced import (
35
+ TrajectoryCache,
36
+ EvalPipeline,
37
+ TrajectoryRule,
38
+ TrajectoryValidator,
39
+ ConfidenceScorer,
40
+ RateLimiter,
41
+ CancellationToken,
42
+ abatch_evaluate,
43
+ batch_evaluate,
44
+ evaluate_with_budget,
45
+ EvaluationProfiler,
46
+ DriftDetector,
47
+ EvaluationReport,
48
+ stream_scores,
49
+ scores_to_ndjson,
50
+ ScoreDiff,
51
+ diff_results,
52
+ RegressionTracker,
53
+ AuditLog,
54
+ CostLedger,
55
+ )
56
+
57
+ __version__ = "1.0.0"
58
+
59
+ __all__ = [
60
+ # Models
61
+ "StepType", "TrajectoryStep", "Trajectory", "StepScore",
62
+ "TrajectoryScore", "EvaluationResult",
63
+ # Core
64
+ "TrajectoryEvaluator", "TrajectoryWatcher",
65
+ # Exceptions
66
+ "AgentEvalError", "TrajectoryValidationError", "EvaluationError",
67
+ "MetricNotFoundError", "BudgetExceededError", "SchemaViolationError",
68
+ # Metrics
69
+ "GoalCompletionMetric", "ToolAccuracyMetric", "StepEfficiencyMetric",
70
+ "ReasoningCoherenceMetric", "LoopDetectionMetric", "AnswerFaithfulnessMetric",
71
+ # Advanced
72
+ "TrajectoryCache", "EvalPipeline", "TrajectoryRule", "TrajectoryValidator",
73
+ "ConfidenceScorer", "RateLimiter", "CancellationToken",
74
+ "abatch_evaluate", "batch_evaluate", "evaluate_with_budget",
75
+ "EvaluationProfiler", "DriftDetector", "EvaluationReport",
76
+ "stream_scores", "scores_to_ndjson", "ScoreDiff", "diff_results",
77
+ "RegressionTracker", "AuditLog", "CostLedger",
78
+ ]