agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,554 @@
1
+ """API routes for agentevals."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import logging
8
+ import os
9
+ import shutil
10
+ import tempfile
11
+ from typing import Any
12
+
13
+ from fastapi import APIRouter, File, Form, HTTPException, UploadFile
14
+ from fastapi.responses import StreamingResponse
15
+ from pydantic.alias_generators import to_camel
16
+
17
+ from agentevals import __version__
18
+
19
+ from ..builtin_metrics import METRICS_NEEDING_EXPECTED, METRICS_NEEDING_GCP, METRICS_NEEDING_LLM
20
+ from ..config import (
21
+ BuiltinMetricDef,
22
+ CodeEvaluatorDef,
23
+ CustomEvaluatorDef,
24
+ EvalRunConfig,
25
+ )
26
+ from ..extraction import get_extractor
27
+ from ..runner import RunResult, get_loader, load_eval_set, run_evaluation
28
+ from ..trace_metrics import extract_performance_metrics, extract_trace_metadata
29
+ from .models import (
30
+ ApiKeyStatus,
31
+ ConfigData,
32
+ EvalSetValidation,
33
+ HealthData,
34
+ MetricInfo,
35
+ SSEDoneEvent,
36
+ SSEErrorEvent,
37
+ SSEPerformanceMetricsEvent,
38
+ SSEProgressEvent,
39
+ SSETraceProgress,
40
+ SSETraceProgressEvent,
41
+ StandardResponse,
42
+ )
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
+ def _camel_keys(obj: Any) -> Any:
48
+ """Recursively convert dict keys from snake_case to camelCase."""
49
+ if isinstance(obj, dict):
50
+ return {to_camel(k): _camel_keys(v) for k, v in obj.items()}
51
+ if isinstance(obj, list):
52
+ return [_camel_keys(item) for item in obj]
53
+ return obj
54
+
55
+
56
# Router that the endpoint functions in this module attach themselves to.
router = APIRouter()

# Custom-evaluator "type" discriminator -> config model used to validate the entry.
_TYPE_TO_MODEL = dict(
    builtin=BuiltinMetricDef,
    code=CodeEvaluatorDef,
)
62
+
63
+
64
+ def _parse_custom_evaluators(raw: list[dict]) -> list[CustomEvaluatorDef]:
65
+ """Parse a list of custom evaluator dicts from the API config JSON."""
66
+ defs: list[CustomEvaluatorDef] = []
67
+ for entry in raw:
68
+ evaluator_type = entry.get("type", "builtin")
69
+ model_cls = _TYPE_TO_MODEL.get(evaluator_type)
70
+ if not model_cls:
71
+ raise ValueError(f"Unknown custom evaluator type: {evaluator_type}")
72
+ defs.append(model_cls.model_validate(entry))
73
+ return defs
74
+
75
+
76
@router.get("/health", response_model=StandardResponse[HealthData])
async def health_check():
    """Report an ``ok`` status together with the package version."""
    payload = HealthData(status="ok", version=__version__)
    return StandardResponse(data=payload)
79
+
80
+
81
@router.get("/config", response_model=StandardResponse[ConfigData])
async def get_config():
    """Report which provider API keys are set in the server environment.

    Only presence flags are returned, never the key values themselves.
    """

    def is_set(*names: str) -> bool:
        # True when at least one of the variables holds a non-empty value.
        return any(os.environ.get(name) for name in names)

    key_status = ApiKeyStatus(
        google=is_set("GOOGLE_API_KEY", "GEMINI_API_KEY"),
        anthropic=is_set("ANTHROPIC_API_KEY"),
        openai=is_set("OPENAI_API_KEY"),
    )
    return StandardResponse(data=ConfigData(api_keys=key_status))
92
+
93
+
94
@router.get("/metrics", response_model=StandardResponse[list[MetricInfo]])
async def list_metrics():
    """List the evaluation metrics the API can offer.

    Metrics are read from the ADK evaluator registry when it can be imported;
    otherwise a static fallback list is returned. Rubric-based metrics are
    reported with ``working=False``, and the user-simulator metric is left out
    of the registry-backed listing.
    """
    rubric_metrics = {
        "rubric_based_final_response_quality_v1",
        "rubric_based_tool_use_quality_v1",
    }
    categories = {
        "tool_trajectory_avg_score": "trajectory",
        "response_match_score": "response",
        "response_evaluation_score": "response",
        "final_response_match_v2": "response",
        "rubric_based_final_response_quality_v1": "quality",
        "rubric_based_tool_use_quality_v1": "quality",
        "hallucinations_v1": "safety",
        "safety_v1": "safety",
        "per_turn_user_simulator_quality_v1": "simulation",
    }
    hidden_metric = "per_turn_user_simulator_quality_v1"

    try:
        from google.adk.evaluation.metric_evaluator_registry import (
            DEFAULT_METRIC_EVALUATOR_REGISTRY,
        )

        def describe(entry):
            # Build the API view of one registry entry.
            name = entry.metric_name
            needs_rubrics = name in rubric_metrics
            return MetricInfo(
                name=name,
                category=categories.get(name, "other"),
                requires_eval_set=name in METRICS_NEEDING_EXPECTED,
                requires_llm=name in METRICS_NEEDING_LLM,
                requires_gcp=name in METRICS_NEEDING_GCP,
                requires_rubrics=needs_rubrics,
                description=entry.description or "No description available",
                working=not needs_rubrics,
            )

        registered = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_registered_metrics()
        listed = [describe(entry) for entry in registered if entry.metric_name != hidden_metric]
        return StandardResponse(data=listed)

    except ImportError:
        # Static listing used when the registry cannot be imported.
        # Columns: name, requires_eval_set, requires_llm, requires_gcp, description.
        # Category, rubric and working flags come from the tables above.
        fallback_rows = (
            (
                "tool_trajectory_avg_score",
                True,
                False,
                False,
                "Compare tool call sequences against expected trajectory",
            ),
            (
                "response_match_score",
                True,
                False,
                False,
                "Text similarity between actual and expected responses using ROUGE-1",
            ),
            (
                "response_evaluation_score",
                True,
                False,
                True,
                "Semantic evaluation of response quality using Vertex AI",
            ),
            (
                "final_response_match_v2",
                True,
                True,
                False,
                "LLM-based comparison of final responses",
            ),
            (
                "hallucinations_v1",
                False,
                True,
                False,
                "Detect hallucinated information in responses",
            ),
            (
                "safety_v1",
                False,
                False,
                True,
                "Safety and security assessment using Vertex AI",
            ),
            (
                "rubric_based_final_response_quality_v1",
                False,
                True,
                False,
                "Rubric-based quality assessment of responses (requires rubrics config)",
            ),
            (
                "rubric_based_tool_use_quality_v1",
                False,
                True,
                False,
                "Rubric-based assessment of tool usage quality (requires rubrics config)",
            ),
        )
        fallback = [
            MetricInfo(
                name=name,
                category=categories[name],
                requires_eval_set=needs_eval_set,
                requires_llm=needs_llm,
                requires_gcp=needs_gcp,
                requires_rubrics=name in rubric_metrics,
                working=name not in rubric_metrics,
                description=description,
            )
            for name, needs_eval_set, needs_llm, needs_gcp, description in fallback_rows
        ]
        return StandardResponse(data=fallback)
224
+
225
+
226
@router.post("/validate/eval-set", response_model=StandardResponse[EvalSetValidation])
async def validate_eval_set(
    eval_set_file: UploadFile = File(...),
):
    """Check whether an uploaded eval set file can be loaded.

    The upload is written to a temporary directory and passed to
    ``load_eval_set``. Any exception raised while loading is reported as a
    validation failure (``valid=False`` with the error text) rather than as an
    HTTP error.

    Args:
        eval_set_file: The eval set file to validate.

    Returns:
        StandardResponse wrapping an EvalSetValidation. On success it carries
        the eval set id and the number of cases.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        # The filename is client-controlled: keep only its final component so
        # an absolute path or "../" segments cannot escape the temp directory.
        safe_name = os.path.basename(eval_set_file.filename or "") or "eval_set.json"
        eval_set_path = os.path.join(temp_dir, safe_name)
        content = await eval_set_file.read()
        with open(eval_set_path, "wb") as f:  # noqa: ASYNC230
            f.write(content)

        try:
            eval_set = load_eval_set(eval_set_path)
            return StandardResponse(
                data=EvalSetValidation(
                    valid=True,
                    eval_set_id=eval_set.eval_set_id,
                    num_cases=len(eval_set.eval_cases),
                )
            )
        except Exception as exc:
            return StandardResponse(
                data=EvalSetValidation(
                    valid=False,
                    errors=[str(exc)],
                )
            )

    finally:
        # A cleanup failure must not replace the response or mask a real error.
        shutil.rmtree(temp_dir, ignore_errors=True)
256
+
257
+
258
@router.post("/evaluate", response_model=StandardResponse[RunResult])
async def evaluate_traces(
    trace_files: list[UploadFile] = File(...),
    config: str = Form(...),
    eval_set_file: UploadFile | None = File(None),
):
    """Evaluate uploaded agent traces using the metrics named in ``config``.

    Args:
        trace_files: Trace files to evaluate (``.json`` or ``.jsonl``, at most
            10MB each). Uploads without a filename are skipped.
        config: JSON object string with the evaluation configuration. Keys read
            here: ``metrics``, ``threshold``, ``trace_format``, ``judgeModel``
            and ``customEvaluators`` (falling back to ``customMetrics``).
        eval_set_file: Optional golden eval set file (``.json``, at most 10MB).

    Returns:
        StandardResponse wrapping the run result with camelCase keys.

    Raises:
        HTTPException: 400 for invalid input, 500 for any unexpected failure.
    """
    max_upload_bytes = 10 * 1024 * 1024
    temp_dir = tempfile.mkdtemp()
    try:
        try:
            config_dict = json.loads(config)
        except json.JSONDecodeError as exc:
            raise HTTPException(status_code=400, detail=f"Invalid config JSON: {exc}") from exc
        # Valid JSON that is not an object (e.g. "[]") would otherwise fail on
        # the first .get() below and surface as a 500.
        if not isinstance(config_dict, dict):
            raise HTTPException(status_code=400, detail="Config must be a JSON object")

        trace_paths = []
        for trace_file in trace_files:
            filename = trace_file.filename
            if not filename:
                continue

            if not filename.endswith((".json", ".jsonl")):
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid file extension for {filename}. Only .json and .jsonl files are allowed.",
                )

            # Read one byte past the limit: enough to detect an oversized
            # upload without buffering all of it.
            content = await trace_file.read(max_upload_bytes + 1)
            if len(content) > max_upload_bytes:
                raise HTTPException(
                    status_code=400,
                    detail=f"File {filename} exceeds 10MB limit",
                )

            # The filename is client-controlled: keep only its final component
            # so it cannot point outside the temp directory.
            trace_path = os.path.join(temp_dir, os.path.basename(filename))
            with open(trace_path, "wb") as f:  # noqa: ASYNC230
                f.write(content)
            trace_paths.append(trace_path)

        if not trace_paths:
            raise HTTPException(
                status_code=400,
                detail="No valid trace files provided",
            )

        trace_format = config_dict.get("trace_format")
        if not trace_format:
            # No explicit format: infer it from the first file's extension.
            trace_format = "otlp-json" if trace_paths[0].endswith(".jsonl") else "jaeger-json"

        eval_set_path = None
        if eval_set_file and eval_set_file.filename:
            if not eval_set_file.filename.endswith(".json"):
                raise HTTPException(
                    status_code=400,
                    detail="Invalid file extension for eval set. Only .json files are allowed.",
                )

            content = await eval_set_file.read(max_upload_bytes + 1)
            if len(content) > max_upload_bytes:
                raise HTTPException(
                    status_code=400,
                    detail="Eval set file exceeds 10MB limit",
                )

            eval_set_path = os.path.join(temp_dir, os.path.basename(eval_set_file.filename))
            with open(eval_set_path, "wb") as f:  # noqa: ASYNC230
                f.write(content)

        metrics = config_dict.get("metrics", ["tool_trajectory_avg_score"])
        if not metrics or not isinstance(metrics, list):
            raise HTTPException(
                status_code=400,
                detail="Config must include 'metrics' as a non-empty array",
            )

        threshold = config_dict.get("threshold")
        if threshold is not None:
            # A non-numeric threshold would raise TypeError in the comparison
            # and be reported as a 500; reject it as bad input instead.
            if not isinstance(threshold, (int, float)):
                raise HTTPException(
                    status_code=400,
                    detail="Threshold must be a number between 0 and 1",
                )
            # Written as a chained comparison so NaN is rejected as well.
            if not (0 <= threshold <= 1):
                raise HTTPException(
                    status_code=400,
                    detail="Threshold must be between 0 and 1",
                )

        custom_evaluators: list[CustomEvaluatorDef] = []
        raw_custom = config_dict.get("customEvaluators", config_dict.get("customMetrics", []))
        if raw_custom:
            try:
                custom_evaluators = _parse_custom_evaluators(raw_custom)
            except Exception as exc:
                raise HTTPException(status_code=400, detail=f"Invalid customEvaluators: {exc}") from exc

        eval_config = EvalRunConfig(
            trace_files=trace_paths,
            eval_set_file=eval_set_path,
            metrics=metrics,
            custom_evaluators=custom_evaluators,
            trace_format=trace_format,
            judge_model=config_dict.get("judgeModel"),
            threshold=threshold,
        )

        logger.info("Evaluating %d trace file(s) with metrics: %s", len(trace_paths), metrics)
        result = await run_evaluation(eval_config)

        result_dict = _camel_keys(result.model_dump(by_alias=True))
        return StandardResponse(data=result_dict)

    except HTTPException:
        # Validation errors raised above already carry the right status code.
        raise
    except Exception as exc:
        logger.exception("Evaluation failed")
        raise HTTPException(status_code=500, detail=f"Internal error: {exc!s}") from exc

    finally:
        # A cleanup failure must not replace the response or mask a real error.
        shutil.rmtree(temp_dir, ignore_errors=True)
384
+
385
+
386
@router.post("/evaluate/stream")
async def evaluate_traces_stream(
    trace_files: list[UploadFile] = File(...),
    config: str = Form(...),
    eval_set_file: UploadFile | None = File(None),
):
    """Evaluate traces with real-time progress via SSE.

    Every outcome, including validation failures, is delivered as an SSE
    event on the response stream rather than as an HTTP error status.

    Args:
        trace_files: Trace files to evaluate (``.json`` or ``.jsonl``, at most
            10MB each). Uploads without a filename are skipped.
        config: JSON object string with the evaluation configuration. Keys read
            here: ``metrics``, ``threshold``, ``trace_format``, ``judgeModel``
            and ``customEvaluators`` (falling back to ``customMetrics``).
        eval_set_file: Optional golden eval set file (``.json``, at most 10MB).

    Returns:
        StreamingResponse of ``text/event-stream`` carrying performance-metric,
        progress, per-trace progress, done and error events.
    """
    max_upload_bytes = 10 * 1024 * 1024
    trace_suffixes = (".json", ".jsonl")

    def sse_error(message: str) -> str:
        """Format ``message`` as an SSE error frame."""
        return f"data: {SSEErrorEvent(error=message).model_dump_json(by_alias=True)}\n\n"

    # Read the uploads now, before the response starts streaming: the
    # framework may close uploaded files once this handler returns, and the
    # generator below only runs after that. Reading one byte past the limit is
    # enough to detect an oversized upload without buffering all of it.
    trace_uploads: list[tuple[str, bytes]] = []
    eval_set_upload: tuple[str, bytes] | None = None
    upload_error: Exception | None = None
    try:
        for trace_file in trace_files:
            name = trace_file.filename
            if not name:
                continue
            # Files with a bad extension are rejected later; skip reading them.
            data = await trace_file.read(max_upload_bytes + 1) if name.endswith(trace_suffixes) else b""
            trace_uploads.append((name, data))

        if eval_set_file and eval_set_file.filename:
            name = eval_set_file.filename
            data = await eval_set_file.read(max_upload_bytes + 1) if name.endswith(".json") else b""
            eval_set_upload = (name, data)
    except Exception as exc:
        # Kept and re-raised inside the generator so the failure is still
        # reported as an SSE error event.
        upload_error = exc

    async def event_generator():
        # Created inside the generator so that nothing is left behind if the
        # stream is never iterated.
        temp_dir = None
        try:
            try:
                config_dict = json.loads(config)
            except json.JSONDecodeError as exc:
                yield sse_error(f"Invalid config JSON: {exc}")
                return
            # Valid JSON that is not an object would fail on .get() below.
            if not isinstance(config_dict, dict):
                yield sse_error("Config must be a JSON object")
                return

            if upload_error is not None:
                raise upload_error

            temp_dir = tempfile.mkdtemp()

            trace_paths = []
            for filename, content in trace_uploads:
                if not filename.endswith(trace_suffixes):
                    yield sse_error(f"Invalid file extension for {filename}")
                    return

                if len(content) > max_upload_bytes:
                    yield sse_error(f"File {filename} exceeds 10MB")
                    return

                # The filename is client-controlled: keep only its final
                # component so it cannot point outside the temp directory.
                trace_path = os.path.join(temp_dir, os.path.basename(filename))
                with open(trace_path, "wb") as f:  # noqa: ASYNC230
                    f.write(content)
                trace_paths.append(trace_path)

            if not trace_paths:
                yield sse_error("No valid trace files provided")
                return

            trace_format = config_dict.get("trace_format")
            if not trace_format:
                # No explicit format: infer it from the first file's extension.
                trace_format = "otlp-json" if trace_paths[0].endswith(".jsonl") else "jaeger-json"

            eval_set_path = None
            if eval_set_upload is not None:
                eval_set_name, eval_set_content = eval_set_upload
                if not eval_set_name.endswith(".json"):
                    yield sse_error("Invalid file extension for eval set")
                    return

                if len(eval_set_content) > max_upload_bytes:
                    yield sse_error("Eval set file exceeds 10MB")
                    return

                eval_set_path = os.path.join(temp_dir, os.path.basename(eval_set_name))
                with open(eval_set_path, "wb") as f:  # noqa: ASYNC230
                    f.write(eval_set_content)

            metrics = config_dict.get("metrics", ["tool_trajectory_avg_score"])
            if not metrics or not isinstance(metrics, list):
                yield sse_error("Config must include metrics as a non-empty array")
                return

            threshold = config_dict.get("threshold")
            if threshold is not None:
                # A non-numeric value would raise TypeError in the comparison.
                if not isinstance(threshold, (int, float)):
                    yield sse_error("Threshold must be a number between 0 and 1")
                    return
                # Written as a chained comparison so NaN is rejected as well.
                if not (0 <= threshold <= 1):
                    yield sse_error("Threshold must be between 0 and 1")
                    return

            custom_evaluators: list[CustomEvaluatorDef] = []
            raw_custom = config_dict.get("customEvaluators", config_dict.get("customMetrics", []))
            if raw_custom:
                try:
                    custom_evaluators = _parse_custom_evaluators(raw_custom)
                except Exception as exc:
                    yield sse_error(f"Invalid customEvaluators: {exc}")
                    return

            eval_config = EvalRunConfig(
                trace_files=trace_paths,
                eval_set_file=eval_set_path,
                metrics=metrics,
                custom_evaluators=custom_evaluators,
                trace_format=trace_format,
                judge_model=config_dict.get("judgeModel"),
                threshold=threshold,
            )

            # Emit performance metrics up front, before the evaluation runs.
            # This is best-effort: a failure for one file is logged and the
            # stream continues.
            loader = get_loader(eval_config.trace_format)
            for trace_file_path in trace_paths:
                try:
                    traces = loader.load(trace_file_path)
                    for trace in traces:
                        extractor = get_extractor(trace)
                        perf_metrics = _camel_keys(extract_performance_metrics(trace, extractor))
                        trace_metadata = _camel_keys(extract_trace_metadata(trace, extractor))
                        evt = SSEPerformanceMetricsEvent(
                            trace_id=trace.trace_id,
                            performance_metrics=perf_metrics,
                            trace_metadata=trace_metadata,
                        )
                        yield f"event: performance_metrics\ndata: {evt.model_dump_json(by_alias=True)}\n\n"
                except Exception as e:
                    logger.error("Failed to extract early performance metrics from %s: %s", trace_file_path, e)

            # The evaluation runs in a background task and reports through
            # this queue as (tag, payload) pairs.
            queue: asyncio.Queue = asyncio.Queue()

            async def progress_callback(message: str):
                await queue.put(("progress", message))

            async def trace_progress_callback(trace_result):
                await queue.put(("trace_progress", trace_result))

            async def run_with_progress():
                try:
                    result = await run_evaluation(eval_config, progress_callback, trace_progress_callback)
                except Exception as exc:
                    # Without this the consumer loop below would wait on the
                    # queue forever after a failed evaluation.
                    await queue.put(("error", exc))
                else:
                    await queue.put(("done", result))

            eval_task = asyncio.create_task(run_with_progress())

            try:
                while True:
                    tag, payload = await queue.get()

                    if tag == "done":
                        evt = SSEDoneEvent(
                            result=_camel_keys(payload.model_dump(by_alias=True)),
                        )
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                        break
                    elif tag == "error":
                        # Re-raise so the handler below logs it and emits the
                        # SSE error event.
                        raise payload
                    elif tag == "trace_progress":
                        evt = SSETraceProgressEvent(
                            trace_progress=SSETraceProgress(
                                trace_id=payload.trace_id,
                                partial_result=_camel_keys(payload.model_dump(by_alias=True)),
                            )
                        )
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                    elif tag == "progress":
                        evt = SSEProgressEvent(message=payload)
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
            finally:
                # Stop the evaluation if the stream ends early.
                if not eval_task.done():
                    eval_task.cancel()
                    try:
                        await eval_task
                    except asyncio.CancelledError:
                        pass

        except Exception as exc:
            logger.exception("Evaluation stream failed")
            yield sse_error(str(exc))

        finally:
            # A cleanup failure must not mask the real outcome of the stream.
            if temp_dir is not None:
                shutil.rmtree(temp_dir, ignore_errors=True)

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )