agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
agentevals/output.py ADDED
@@ -0,0 +1,204 @@
1
+ """Output formatting for evaluation results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ from .runner import MetricResult, RunResult
9
+
10
+
11
def format_results(run_result: RunResult, fmt: str = "table") -> str:
    """Render ``run_result`` as text in the requested format.

    ``"json"`` and ``"summary"`` select their dedicated formatters. Any other
    value, including the default ``"table"``, is rendered as a table.
    """
    if fmt == "json":
        return _format_json(run_result)
    if fmt == "summary":
        return _format_summary(run_result)
    # Unrecognised format names deliberately fall through to the table view.
    return _format_table(run_result)
18
+
19
+
20
def _percentile_text(stats: dict[str, Any], unit: str = "") -> str:
    """Render the p50/p95/p99 entries of ``stats`` as ``p50=…, p95=…, p99=…``."""
    return ", ".join(f"{key}={stats[key]:.0f}{unit}" for key in ("p50", "p95", "p99"))


def _metric_table_row(mr: MetricResult) -> list[str]:
    """Build the table row (icon, name, score, status, per-invocation, error) for one metric."""
    score_str = f"{mr.score:.4f}" if mr.score is not None else "N/A"
    per_inv = ", ".join(f"{s:.4f}" if s is not None else "N/A" for s in mr.per_invocation_scores or [])
    return [
        _status_icon(mr.eval_status),
        mr.metric_name,
        score_str,
        mr.eval_status,
        per_inv,
        mr.error or "",
    ]


def _trace_performance_lines(perf: dict[str, Any]) -> list[str]:
    """Render the latency and token figures of a single trace as text lines."""
    lat = perf["latency"]
    tok = perf["tokens"]
    return [
        "\n Performance Metrics:",
        f" Overall Latency: {_percentile_text(lat['overall'], 'ms')}",
        f" LLM Latency: {_percentile_text(lat['llm_calls'], 'ms')}",
        f" Tool Latency: {_percentile_text(lat['tool_executions'], 'ms')}",
        f" Tokens: {tok['total']} total ({tok['total_prompt']} prompt + {tok['total_output']} output)",
        f" Per LLM Call: {_percentile_text(tok['per_llm_call'])}",
    ]


def _format_table(run_result: RunResult) -> str:
    """Render the run as one table per trace, followed by performance figures.

    The output lists run-level errors first, then for every trace its id,
    invocation count, conversion warnings, a metric table, the detailed
    breakdown of failed metrics, and per-trace performance metrics. Run-wide
    token totals are appended at the end when available.

    Falls back to the summary format when ``tabulate`` cannot be imported.

    Raises:
        KeyError: if a performance-metrics dict lacks one of the keys read here.
    """
    try:
        from tabulate import tabulate
    except ImportError:
        return _format_summary(run_result)

    lines: list[str] = []

    if run_result.errors:
        lines.append("Errors:")
        lines.extend(f" - {err}" for err in run_result.errors)
        lines.append("")

    for trace_result in run_result.trace_results:
        lines.append(f"Trace: {trace_result.trace_id}")
        lines.append(f"Invocations: {trace_result.num_invocations}")
        lines.extend(f" Warning: {w}" for w in trace_result.conversion_warnings or [])

        rows = [_metric_table_row(mr) for mr in trace_result.metric_results]
        if rows:
            lines.append(
                tabulate(
                    rows,
                    headers=["", "Metric", "Score", "Status", "Per-Invocation", "Error"],
                    tablefmt="simple",
                )
            )

        for mr in trace_result.metric_results:
            if mr.details and mr.eval_status == "FAILED":
                detail_text = _format_metric_details(mr)
                # The detail formatter returns "" for metrics it has nothing to
                # say about; skip those so they do not leave stray blank lines.
                if detail_text:
                    lines.append(detail_text)
                    lines.append("")

        if trace_result.performance_metrics:
            lines.extend(_trace_performance_lines(trace_result.performance_metrics))

        lines.append("")

    if run_result.performance_metrics:
        tokens = run_result.performance_metrics["tokens"]
        avg = tokens["avg_per_trace"]
        lines.append("Overall Performance:")
        lines.append(
            f" Total Tokens: {tokens['total']} ({tokens['total_prompt']} prompt + {tokens['total_output']} output)"
        )
        lines.append(f" Avg per Trace: {avg['prompt']:.0f} prompt, {avg['output']:.0f} output")
        lines.append("")

    return "\n".join(lines)
113
+
114
+
115
def _format_json(run_result: RunResult) -> str:
    """Serialize the run result to a JSON document indented by two spaces.

    The top-level object holds ``traces`` and ``errors``. ``details`` (per
    metric) and ``performance_metrics`` (per trace and for the whole run) are
    emitted only when they are truthy.
    """
    serialized_traces: list[dict[str, Any]] = []

    for tr in run_result.trace_results:
        serialized_metrics: list[dict[str, Any]] = []
        for mr in tr.metric_results:
            entry: dict[str, Any] = {
                "metric_name": mr.metric_name,
                "score": mr.score,
                "eval_status": mr.eval_status,
                "per_invocation_scores": mr.per_invocation_scores,
                "error": mr.error,
            }
            if mr.details:
                entry["details"] = mr.details
            serialized_metrics.append(entry)

        trace_entry: dict[str, Any] = {
            "trace_id": tr.trace_id,
            "num_invocations": tr.num_invocations,
            "conversion_warnings": tr.conversion_warnings,
            "metrics": serialized_metrics,
        }
        if tr.performance_metrics:
            trace_entry["performance_metrics"] = tr.performance_metrics
        serialized_traces.append(trace_entry)

    payload: dict[str, Any] = {
        "traces": serialized_traces,
        "errors": run_result.errors,
    }
    if run_result.performance_metrics:
        payload["performance_metrics"] = run_result.performance_metrics

    return json.dumps(payload, indent=2)
147
+
148
+
149
def _format_summary(run_result: RunResult) -> str:
    """Render a compact plain-text summary: run errors, then one line per metric.

    Each trace gets a heading followed by its metrics. A metric line shows the
    error when one is set, otherwise the score to four decimals, or ``N/A``
    when there is no score. A blank line closes every trace section.
    """
    out: list[str] = []

    if run_result.errors:
        out.append("Errors:")
        out.extend(f" - {err}" for err in run_result.errors)
        out.append("")

    for tr in run_result.trace_results:
        out.append(f"Trace {tr.trace_id} ({tr.num_invocations} invocations):")
        for mr in tr.metric_results:
            icon = _status_icon(mr.eval_status)
            # An error takes precedence over any score the metric may carry.
            if mr.error:
                outcome = f"ERROR - {mr.error}"
            elif mr.score is not None:
                outcome = f"{mr.score:.4f} ({mr.eval_status})"
            else:
                outcome = f"N/A ({mr.eval_status})"
            out.append(f" {icon} {mr.metric_name}: {outcome}")
        out.append("")

    return "\n".join(out)
171
+
172
+
173
def _tool_call_lines(tools: list[dict[str, Any]] | None) -> list[str]:
    """Render one side (expected or actual) of a trajectory comparison."""
    if not tools:
        return [" (none)"]
    rendered: list[str] = []
    for tool in tools:
        # Tolerate partially populated tool records instead of raising KeyError
        # while formatting a failure report.
        args = tool.get("args")
        args_str = json.dumps(args) if args else "{}"
        rendered.append(f" - {tool.get('name', '<unknown>')}({args_str})")
    return rendered


def _format_metric_details(mr: MetricResult) -> str:
    """Format the expected-vs-actual tool calls of mismatched invocations.

    Only the ``tool_trajectory_avg_score`` metric is handled. Its ``details``
    are read for a ``comparisons`` list, and every comparison whose ``matched``
    flag is false is rendered with its 1-based position.

    Returns:
        The rendered lines joined with newlines, or ``""`` when the metric is
        of another kind, has no details, or every comparison matched.
    """
    if mr.metric_name != "tool_trajectory_avg_score" or not mr.details:
        return ""

    lines: list[str] = []
    # ``or []`` also covers a key that is present but holds None.
    for i, comp in enumerate(mr.details.get("comparisons") or [], 1):
        # A comparison without a "matched" flag is treated as a match.
        if comp.get("matched", True):
            continue
        lines.append(f" Invocation {i} trajectory mismatch:")
        lines.append(" Expected:")
        lines.extend(_tool_call_lines(comp.get("expected")))
        lines.append(" Actual:")
        lines.extend(_tool_call_lines(comp.get("actual")))

    return "\n".join(lines)
197
+
198
+
199
def _status_icon(status: str) -> str:
    """Return the bracketed marker for an evaluation status.

    Statuses other than ``PASSED``, ``FAILED`` and ``NOT_EVALUATED`` map to
    ``[????]``.
    """
    icons = {
        "PASSED": "[PASS]",
        "FAILED": "[FAIL]",
        "NOT_EVALUATED": "[----]",
    }
    try:
        return icons[status]
    except KeyError:
        return "[????]"
agentevals/runner.py ADDED
@@ -0,0 +1,310 @@
1
+ """Evaluation runner — orchestrates trace loading, conversion, and scoring."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import logging
8
+ from collections.abc import Awaitable, Callable
9
+ from typing import Any
10
+
11
+ from google.adk.evaluation.eval_case import Invocation
12
+ from google.adk.evaluation.eval_set import EvalSet
13
+ from pydantic import BaseModel, ConfigDict, Field
14
+ from pydantic.alias_generators import to_camel
15
+
16
+ from .builtin_metrics import evaluate_builtin_metric
17
+ from .config import (
18
+ CustomEvaluatorDef,
19
+ EvalRunConfig,
20
+ )
21
+ from .converter import ConversionResult, convert_traces
22
+ from .loader.base import TraceLoader
23
+ from .loader.jaeger import JaegerJsonLoader
24
+ from .loader.otlp import OtlpJsonLoader
25
+ from .trace_metrics import extract_performance_metrics
26
+
27
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Async hook that receives a human-readable progress message.
ProgressCallback = Callable[[str], Awaitable[None]]
# Async hook that receives a TraceResult; _evaluate_trace awaits it each time
# a metric result has been appended to that trace.
TraceProgressCallback = Callable[["TraceResult"], Awaitable[None]]
31
+
32
+
33
class MetricResult(BaseModel):
    """Outcome of a single metric evaluated for one trace."""

    # Fields get camelCase aliases; populate_by_name keeps the snake_case
    # field names usable when constructing the model.
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    # Name of the metric or custom evaluator that produced this result.
    metric_name: str
    # Score for the metric; None when no score is available.
    score: float | None = None
    # Status string; stays "NOT_EVALUATED" unless the evaluator sets it.
    eval_status: str = "NOT_EVALUATED"
    # One score per invocation; individual entries may be None.
    per_invocation_scores: list[float | None] = Field(default_factory=list)
    # Error message, when the metric could not be evaluated.
    error: str | None = None
    # Optional metric-specific payload (free-form dict).
    details: dict[str, Any] | None = None
42
+
43
+
44
class TraceResult(BaseModel):
    """All evaluation output collected for one trace."""

    # Fields get camelCase aliases; populate_by_name keeps the snake_case
    # field names usable when constructing the model.
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    # Identifier of the trace these results belong to.
    trace_id: str
    # Number of invocations extracted from the trace.
    num_invocations: int = 0
    # One entry per evaluated metric, appended as evaluations finish.
    metric_results: list[MetricResult] = Field(default_factory=list)
    # Warnings reported while converting the trace.
    conversion_warnings: list[str] = Field(default_factory=list)
    # Performance figures for the trace; None when not provided.
    performance_metrics: dict[str, Any] | None = None
52
+
53
+
54
class RunResult(BaseModel):
    """Aggregate result of one evaluation run across all traces."""

    # Fields get camelCase aliases; populate_by_name keeps the snake_case
    # field names usable when constructing the model.
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    # Results of the traces that were evaluated successfully.
    trace_results: list[TraceResult] = Field(default_factory=list)
    # Run-level error messages (load failures, unexpected trace errors).
    errors: list[str] = Field(default_factory=list)
    # Token totals aggregated over the traces; None when nothing was aggregated.
    performance_metrics: dict[str, Any] | None = None
60
+
61
+
62
def get_loader(format_name: str) -> TraceLoader:
    """Instantiate the trace loader registered for ``format_name``.

    Raises:
        ValueError: if ``format_name`` is not one of the known trace formats.
    """
    registry: dict[str, type[TraceLoader]] = {
        "jaeger-json": JaegerJsonLoader,
        "otlp-json": OtlpJsonLoader,
    }
    loader_cls = registry.get(format_name)
    if loader_cls is None:
        raise ValueError(f"Unknown trace format '{format_name}'. Available: {list(registry)}")
    return loader_cls()
70
+
71
+
72
def load_eval_set(path: str) -> EvalSet:
    """Load and validate an eval set from the JSON file at ``path``.

    The file is read as UTF-8 rather than the platform's locale encoding, so
    eval sets containing non-ASCII text load the same way on every system.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    # Validation errors from the model are propagated to the caller.
    return EvalSet.model_validate(data)
76
+
77
+
78
def _aggregate_token_metrics(trace_results: list[TraceResult]) -> dict[str, Any] | None:
    """Sum token usage over the traces that carry performance metrics.

    Returns None when no trace has performance metrics.
    """
    prompt: list[Any] = []
    output: list[Any] = []
    total: list[Any] = []
    for tr in trace_results:
        if tr.performance_metrics:
            tokens = tr.performance_metrics["tokens"]
            prompt.append(tokens["total_prompt"])
            output.append(tokens["total_output"])
            total.append(tokens["total"])

    if not total:
        return None

    return {
        "tokens": {
            "total_prompt": sum(prompt),
            "total_output": sum(output),
            "total": sum(total),
            # Averages are taken over traces that reported metrics only.
            "avg_per_trace": {
                "prompt": sum(prompt) / len(prompt),
                "output": sum(output) / len(output),
            },
        },
        "trace_count": len(trace_results),
    }


async def run_evaluation(
    config: EvalRunConfig,
    progress_callback: ProgressCallback | None = None,
    trace_progress_callback: TraceProgressCallback | None = None,
) -> RunResult:
    """Load traces, convert them, and evaluate every trace concurrently.

    Args:
        config: Run configuration (trace files and format, metrics, custom
            evaluators, optional eval set, judge model, threshold and the
            two concurrency limits).
        progress_callback: Optional async hook awaited with progress messages.
        trace_progress_callback: Optional async hook forwarded to the
            per-trace evaluation.

    Returns:
        A ``RunResult``. Failures to load a trace file or the eval set, and
        unexpected per-trace failures, are recorded in ``errors`` instead of
        being raised.

    Raises:
        ValueError: if ``config.trace_format`` names an unknown format.
    """
    result = RunResult()

    loader = get_loader(config.trace_format)
    all_traces = []
    for trace_file in config.trace_files:
        try:
            all_traces.extend(loader.load(trace_file))
        except Exception as exc:
            # A broken file must not prevent the remaining files from loading.
            msg = f"Failed to load trace file '{trace_file}': {exc}"
            logger.error(msg)
            result.errors.append(msg)

    if not all_traces:
        result.errors.append("No traces loaded.")
        return result

    conversion_results = convert_traces(all_traces)

    trace_map = {t.trace_id: t for t in all_traces}
    perf_metrics_map: dict[str, dict[str, Any]] = {
        trace.trace_id: extract_performance_metrics(trace) for trace in all_traces
    }

    eval_set: EvalSet | None = None
    if config.eval_set_file:
        try:
            eval_set = load_eval_set(config.eval_set_file)
        except Exception as exc:
            # Evaluation continues without expected invocations.
            msg = f"Failed to load eval set '{config.eval_set_file}': {exc}"
            logger.error(msg)
            result.errors.append(msg)

    total_traces = len(conversion_results)
    if progress_callback:
        await progress_callback(f"Evaluating {total_traces} trace{'s' if total_traces != 1 else ''}...")

    trace_semaphore = asyncio.Semaphore(config.max_concurrent_traces)
    eval_semaphore = asyncio.Semaphore(config.max_concurrent_evals)

    async def _evaluate_trace_bounded(idx: int, conv_result: ConversionResult) -> TraceResult:
        async with trace_semaphore:
            if progress_callback:
                trace_id_short = (
                    conv_result.trace_id[:12] + "..." if len(conv_result.trace_id) > 12 else conv_result.trace_id
                )
                await progress_callback(f"Trace {idx + 1}/{total_traces}: {trace_id_short}")

            return await _evaluate_trace(
                conv_result=conv_result,
                metrics=config.metrics,
                custom_evaluators=config.custom_evaluators,
                eval_set=eval_set,
                judge_model=config.judge_model,
                threshold=config.threshold,
                eval_semaphore=eval_semaphore,
                progress_callback=progress_callback,
                trace_progress_callback=trace_progress_callback,
                trace=trace_map.get(conv_result.trace_id),
                performance_metrics=perf_metrics_map.get(conv_result.trace_id),
            )

    trace_results = await asyncio.gather(
        *[_evaluate_trace_bounded(idx, conv_result) for idx, conv_result in enumerate(conversion_results)],
        return_exceptions=True,
    )

    for tr in trace_results:
        # gather(return_exceptions=True) may hand back BaseException instances
        # (e.g. asyncio.CancelledError); checking only for Exception would let
        # those slip into trace_results and break the aggregation below.
        if isinstance(tr, BaseException):
            logger.error("Unexpected error evaluating trace: %s", tr, exc_info=tr)
            # Some exceptions stringify to "", so fall back to the type name.
            result.errors.append(str(tr) or type(tr).__name__)
        else:
            result.trace_results.append(tr)

    if progress_callback:
        await progress_callback("Evaluation complete")

    if result.trace_results:
        result.performance_metrics = _aggregate_token_metrics(result.trace_results)

    return result
188
+
189
+
190
async def _evaluate_trace(
    conv_result: ConversionResult,
    metrics: list[str],
    custom_evaluators: list[CustomEvaluatorDef],
    eval_set: EvalSet | None,
    judge_model: str | None,
    threshold: float | None,
    eval_semaphore: asyncio.Semaphore,
    progress_callback: ProgressCallback | None = None,
    trace_progress_callback: TraceProgressCallback | None = None,
    trace=None,
    performance_metrics: dict[str, Any] | None = None,
) -> TraceResult:
    """Evaluate all built-in metrics and custom evaluators for one trace.

    Evaluations run concurrently, each one holding ``eval_semaphore`` while it
    executes. Results are appended to the returned ``TraceResult`` in
    completion order, and ``trace_progress_callback`` is awaited after each
    append.

    An evaluator that raises is recorded as a ``MetricResult`` carrying the
    error, so one failing metric does not discard the others' results.

    Args:
        conv_result: Converted trace (id, invocations, warnings).
        metrics: Names of the built-in metrics to run.
        custom_evaluators: Custom evaluator definitions to run.
        eval_set: Optional eval set used to look up expected invocations.
        judge_model: Forwarded to built-in metric evaluation.
        threshold: Forwarded to built-in metric evaluation.
        eval_semaphore: Limits how many evaluations run at once.
        progress_callback: Optional async hook for progress messages.
        trace_progress_callback: Optional async hook awaited with the
            in-progress ``TraceResult``.
        trace: Accepted for interface compatibility; not used here.
        performance_metrics: Attached to the result when truthy.

    Returns:
        The populated ``TraceResult``. When the trace has no invocations it
        contains a single ``"(all)"`` error entry and nothing is evaluated.
    """
    trace_result = TraceResult(
        trace_id=conv_result.trace_id,
        num_invocations=len(conv_result.invocations),
        conversion_warnings=conv_result.warnings,
    )

    if performance_metrics:
        trace_result.performance_metrics = performance_metrics

    if not conv_result.invocations:
        trace_result.metric_results.append(
            MetricResult(
                metric_name="(all)",
                error="No invocations extracted from trace.",
            )
        )
        return trace_result

    actual_invocations = conv_result.invocations

    expected_invocations: list[Invocation] | None = None
    if eval_set:
        expected_invocations = _find_expected_invocations(actual_invocations, eval_set)

    async def _append_result(result: MetricResult) -> MetricResult:
        trace_result.metric_results.append(result)
        if trace_progress_callback:
            await trace_progress_callback(trace_result)
        return result

    async def _run_with_semaphore(
        label: str,
        evaluate: Callable[[], Awaitable[MetricResult]],
    ) -> MetricResult:
        async with eval_semaphore:
            if progress_callback:
                await progress_callback(f"Running {label}...")
            try:
                result = await evaluate()
            except Exception as exc:
                # Isolate the failure: the other metrics of this trace keep
                # their results and the error stays visible next to the metric.
                logger.exception("Evaluator '%s' raised for trace %s", label, conv_result.trace_id)
                result = MetricResult(metric_name=label, error=f"{type(exc).__name__}: {exc}")
            return await _append_result(result)

    async def _eval_builtin_with_semaphore(metric_name: str) -> MetricResult:
        async def _call() -> MetricResult:
            return await evaluate_builtin_metric(
                metric_name=metric_name,
                actual_invocations=actual_invocations,
                expected_invocations=expected_invocations,
                judge_model=judge_model,
                threshold=threshold,
            )

        return await _run_with_semaphore(metric_name, _call)

    async def _eval_custom_with_semaphore(evaluator_def: CustomEvaluatorDef) -> MetricResult:
        async def _call() -> MetricResult:
            # Imported lazily, only when a custom evaluator actually runs.
            from .custom_evaluators import evaluate_custom_evaluator

            return await evaluate_custom_evaluator(
                evaluator_def=evaluator_def,
                actual_invocations=actual_invocations,
                expected_invocations=expected_invocations,
            )

        return await _run_with_semaphore(evaluator_def.name, _call)

    tasks = [_eval_builtin_with_semaphore(m) for m in metrics]
    tasks.extend(_eval_custom_with_semaphore(g) for g in custom_evaluators)

    await asyncio.gather(*tasks)

    return trace_result
265
+
266
+
267
def _find_expected_invocations(
    actual_invocations: list[Invocation],
    eval_set: EvalSet,
) -> list[Invocation] | None:
    """Pick the eval-case conversation that the actual invocations belong to.

    With a single eval case its conversation is used directly. With several,
    the first case whose opening user text equals the actual opening user text
    (ignoring case and surrounding whitespace) wins; if the actual text is
    unavailable or nothing matches, the first eval case is used. An empty
    conversation is reported as None.
    """
    cases = eval_set.eval_cases
    if not cases:
        return None

    # Every fallback path below resolves to the first case's conversation.
    first_conversation = cases[0].conversation or None
    if len(cases) == 1:
        return first_conversation

    actual_user_text = None
    if actual_invocations:
        actual_user_text = _get_user_text(actual_invocations[0])
    if not actual_user_text:
        return first_conversation

    for candidate in cases:
        conversation = candidate.conversation
        if not conversation:
            continue
        expected_user_text = _get_user_text(conversation[0])
        if not expected_user_text:
            continue
        if _text_matches(actual_user_text, expected_user_text):
            return conversation

    logger.warning(
        "No matching eval case found for user text: '%s'. Using first eval case.",
        actual_user_text[:100],
    )
    return first_conversation
300
+
301
+
302
def _get_user_text(invocation: Invocation) -> str | None:
    """Join the non-empty text parts of the invocation's user content.

    Returns None when there is no user content, no parts, or no part with text.
    """
    content = invocation.user_content
    if not content or not content.parts:
        return None
    joined = " ".join(part.text for part in content.parts if part.text)
    # No usable text parts leaves an empty string; report that as None.
    return joined or None
307
+
308
+
309
def _text_matches(a: str, b: str) -> bool:
    """Compare two strings, ignoring letter case and surrounding whitespace."""
    normalized_a = a.strip().lower()
    normalized_b = b.strip().lower()
    return normalized_a == normalized_b