agentevals-cli 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentevals/__init__.py +16 -0
- agentevals/_protocol.py +83 -0
- agentevals/api/__init__.py +0 -0
- agentevals/api/app.py +137 -0
- agentevals/api/debug_routes.py +268 -0
- agentevals/api/models.py +204 -0
- agentevals/api/otlp_app.py +25 -0
- agentevals/api/otlp_routes.py +383 -0
- agentevals/api/routes.py +554 -0
- agentevals/api/streaming_routes.py +373 -0
- agentevals/builtin_metrics.py +234 -0
- agentevals/cli.py +643 -0
- agentevals/config.py +108 -0
- agentevals/converter.py +328 -0
- agentevals/custom_evaluators.py +468 -0
- agentevals/eval_config_loader.py +147 -0
- agentevals/evaluator/__init__.py +24 -0
- agentevals/evaluator/resolver.py +70 -0
- agentevals/evaluator/sources.py +293 -0
- agentevals/evaluator/templates.py +224 -0
- agentevals/extraction.py +444 -0
- agentevals/genai_converter.py +538 -0
- agentevals/loader/__init__.py +7 -0
- agentevals/loader/base.py +53 -0
- agentevals/loader/jaeger.py +112 -0
- agentevals/loader/otlp.py +193 -0
- agentevals/mcp_server.py +236 -0
- agentevals/output.py +204 -0
- agentevals/runner.py +310 -0
- agentevals/sdk.py +433 -0
- agentevals/streaming/__init__.py +120 -0
- agentevals/streaming/incremental_processor.py +337 -0
- agentevals/streaming/processor.py +285 -0
- agentevals/streaming/session.py +36 -0
- agentevals/streaming/ws_server.py +806 -0
- agentevals/trace_attrs.py +32 -0
- agentevals/trace_metrics.py +126 -0
- agentevals/utils/__init__.py +0 -0
- agentevals/utils/genai_messages.py +142 -0
- agentevals/utils/log_buffer.py +43 -0
- agentevals/utils/log_enrichment.py +187 -0
- agentevals_cli-0.5.2.dist-info/METADATA +22 -0
- agentevals_cli-0.5.2.dist-info/RECORD +46 -0
- agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
- agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
- agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
agentevals/api/routes.py
ADDED
|
@@ -0,0 +1,554 @@
|
|
|
1
|
+
"""API routes for agentevals."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import shutil
|
|
10
|
+
import tempfile
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
|
14
|
+
from fastapi.responses import StreamingResponse
|
|
15
|
+
from pydantic.alias_generators import to_camel
|
|
16
|
+
|
|
17
|
+
from agentevals import __version__
|
|
18
|
+
|
|
19
|
+
from ..builtin_metrics import METRICS_NEEDING_EXPECTED, METRICS_NEEDING_GCP, METRICS_NEEDING_LLM
|
|
20
|
+
from ..config import (
|
|
21
|
+
BuiltinMetricDef,
|
|
22
|
+
CodeEvaluatorDef,
|
|
23
|
+
CustomEvaluatorDef,
|
|
24
|
+
EvalRunConfig,
|
|
25
|
+
)
|
|
26
|
+
from ..extraction import get_extractor
|
|
27
|
+
from ..runner import RunResult, get_loader, load_eval_set, run_evaluation
|
|
28
|
+
from ..trace_metrics import extract_performance_metrics, extract_trace_metadata
|
|
29
|
+
from .models import (
|
|
30
|
+
ApiKeyStatus,
|
|
31
|
+
ConfigData,
|
|
32
|
+
EvalSetValidation,
|
|
33
|
+
HealthData,
|
|
34
|
+
MetricInfo,
|
|
35
|
+
SSEDoneEvent,
|
|
36
|
+
SSEErrorEvent,
|
|
37
|
+
SSEPerformanceMetricsEvent,
|
|
38
|
+
SSEProgressEvent,
|
|
39
|
+
SSETraceProgress,
|
|
40
|
+
SSETraceProgressEvent,
|
|
41
|
+
StandardResponse,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _camel_keys(obj: Any) -> Any:
|
|
48
|
+
"""Recursively convert dict keys from snake_case to camelCase."""
|
|
49
|
+
if isinstance(obj, dict):
|
|
50
|
+
return {to_camel(k): _camel_keys(v) for k, v in obj.items()}
|
|
51
|
+
if isinstance(obj, list):
|
|
52
|
+
return [_camel_keys(item) for item in obj]
|
|
53
|
+
return obj
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Router on which the endpoint functions in this module are registered.
router = APIRouter()
|
|
57
|
+
|
|
58
|
+
# Maps the "type" field of a custom evaluator entry to the config model
# that validates the entry.
_TYPE_TO_MODEL = dict(
    builtin=BuiltinMetricDef,
    code=CodeEvaluatorDef,
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _parse_custom_evaluators(raw: list[dict]) -> list[CustomEvaluatorDef]:
|
|
65
|
+
"""Parse a list of custom evaluator dicts from the API config JSON."""
|
|
66
|
+
defs: list[CustomEvaluatorDef] = []
|
|
67
|
+
for entry in raw:
|
|
68
|
+
evaluator_type = entry.get("type", "builtin")
|
|
69
|
+
model_cls = _TYPE_TO_MODEL.get(evaluator_type)
|
|
70
|
+
if not model_cls:
|
|
71
|
+
raise ValueError(f"Unknown custom evaluator type: {evaluator_type}")
|
|
72
|
+
defs.append(model_cls.model_validate(entry))
|
|
73
|
+
return defs
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@router.get("/health", response_model=StandardResponse[HealthData])
async def health_check():
    """Report an "ok" status together with the package version."""
    payload = HealthData(status="ok", version=__version__)
    return StandardResponse(data=payload)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@router.get("/config", response_model=StandardResponse[ConfigData])
async def get_config():
    """Report which provider API keys are set in the environment.

    Only boolean presence flags are returned, never the key values. Google
    counts as configured when either GOOGLE_API_KEY or GEMINI_API_KEY is set.
    """
    env = os.environ
    has_google = bool(env.get("GOOGLE_API_KEY") or env.get("GEMINI_API_KEY"))
    key_status = ApiKeyStatus(
        google=has_google,
        anthropic=bool(env.get("ANTHROPIC_API_KEY")),
        openai=bool(env.get("OPENAI_API_KEY")),
    )
    return StandardResponse(data=ConfigData(api_keys=key_status))
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@router.get("/metrics", response_model=StandardResponse[list[MetricInfo]])
async def list_metrics():
    """List the evaluation metrics that can be requested.

    Metrics are read from the ADK default evaluator registry when it can be
    imported; on ImportError a hard-coded fallback list is returned instead.
    In both paths the rubric-based metrics are reported with ``working=False``,
    and ``per_turn_user_simulator_quality_v1`` is left out of the registry
    listing.
    """
    rubric_metrics = {
        "rubric_based_final_response_quality_v1",
        "rubric_based_tool_use_quality_v1",
    }

    category_by_metric = {
        "tool_trajectory_avg_score": "trajectory",
        "response_match_score": "response",
        "response_evaluation_score": "response",
        "final_response_match_v2": "response",
        "rubric_based_final_response_quality_v1": "quality",
        "rubric_based_tool_use_quality_v1": "quality",
        "hallucinations_v1": "safety",
        "safety_v1": "safety",
        "per_turn_user_simulator_quality_v1": "simulation",
    }

    try:
        from google.adk.evaluation.metric_evaluator_registry import (
            DEFAULT_METRIC_EVALUATOR_REGISTRY,
        )

        registered = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_registered_metrics()

        listed = [
            MetricInfo(
                name=entry.metric_name,
                category=category_by_metric.get(entry.metric_name, "other"),
                requires_eval_set=entry.metric_name in METRICS_NEEDING_EXPECTED,
                requires_llm=entry.metric_name in METRICS_NEEDING_LLM,
                requires_gcp=entry.metric_name in METRICS_NEEDING_GCP,
                requires_rubrics=entry.metric_name in rubric_metrics,
                description=entry.description or "No description available",
                working=entry.metric_name not in rubric_metrics,
            )
            for entry in registered
            if entry.metric_name != "per_turn_user_simulator_quality_v1"
        ]

        return StandardResponse(data=listed)

    except ImportError:
        # Column order of each row in the fallback table below.
        columns = (
            "name",
            "category",
            "requires_eval_set",
            "requires_llm",
            "requires_gcp",
            "requires_rubrics",
            "working",
            "description",
        )
        fallback_rows = [
            (
                "tool_trajectory_avg_score",
                "trajectory",
                True,
                False,
                False,
                False,
                True,
                "Compare tool call sequences against expected trajectory",
            ),
            (
                "response_match_score",
                "response",
                True,
                False,
                False,
                False,
                True,
                "Text similarity between actual and expected responses using ROUGE-1",
            ),
            (
                "response_evaluation_score",
                "response",
                True,
                False,
                True,
                False,
                True,
                "Semantic evaluation of response quality using Vertex AI",
            ),
            (
                "final_response_match_v2",
                "response",
                True,
                True,
                False,
                False,
                True,
                "LLM-based comparison of final responses",
            ),
            (
                "hallucinations_v1",
                "safety",
                False,
                True,
                False,
                False,
                True,
                "Detect hallucinated information in responses",
            ),
            (
                "safety_v1",
                "safety",
                False,
                False,
                True,
                False,
                True,
                "Safety and security assessment using Vertex AI",
            ),
            (
                "rubric_based_final_response_quality_v1",
                "quality",
                False,
                True,
                False,
                True,
                False,
                "Rubric-based quality assessment of responses (requires rubrics config)",
            ),
            (
                "rubric_based_tool_use_quality_v1",
                "quality",
                False,
                True,
                False,
                True,
                False,
                "Rubric-based assessment of tool usage quality (requires rubrics config)",
            ),
        ]
        fallback = [MetricInfo(**dict(zip(columns, row))) for row in fallback_rows]
        return StandardResponse(data=fallback)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
@router.post("/validate/eval-set", response_model=StandardResponse[EvalSetValidation])
async def validate_eval_set(
    eval_set_file: UploadFile = File(...),
):
    """Check whether an uploaded eval set file can be loaded.

    The upload is written into a temporary directory and handed to
    ``load_eval_set``. A load failure is reported in the response body
    (``valid=False`` plus the error text) rather than as an HTTP error.

    Args:
        eval_set_file: Uploaded eval set file.

    Returns:
        StandardResponse wrapping an EvalSetValidation. On success it carries
        the eval set id and the number of eval cases.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        # The filename is client-controlled. Keep only its final component so
        # that "../x" or an absolute path cannot make os.path.join escape
        # temp_dir (os.path.join discards temp_dir for absolute components).
        raw_name = (eval_set_file.filename or "").replace("\\", "/")
        safe_name = os.path.basename(raw_name)
        if safe_name in ("", ".", ".."):
            safe_name = "eval_set.json"
        eval_set_path = os.path.join(temp_dir, safe_name)

        content = await eval_set_file.read()
        with open(eval_set_path, "wb") as f:  # noqa: ASYNC230
            f.write(content)

        try:
            eval_set = load_eval_set(eval_set_path)
            return StandardResponse(
                data=EvalSetValidation(
                    valid=True,
                    eval_set_id=eval_set.eval_set_id,
                    num_cases=len(eval_set.eval_cases),
                )
            )
        except Exception as exc:
            # Any load failure is a validation result, not a server error.
            return StandardResponse(
                data=EvalSetValidation(
                    valid=False,
                    errors=[str(exc)],
                )
            )

    finally:
        shutil.rmtree(temp_dir)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
@router.post("/evaluate", response_model=StandardResponse[RunResult])
async def evaluate_traces(
    trace_files: list[UploadFile] = File(...),
    config: str = Form(...),
    eval_set_file: UploadFile | None = File(None),
):
    """
    Evaluate agent traces using specified metrics.

    Args:
        trace_files: Uploaded trace files (.json or .jsonl, at most 10MB each)
        config: JSON string (an object) with evaluation configuration
        eval_set_file: Optional golden eval set file (.json, at most 10MB)

    Returns:
        StandardResponse whose data is the camelCased dump of the RunResult

    Raises:
        HTTPException: 400 for invalid input, 500 for any unexpected failure
    """
    max_upload_bytes = 10 * 1024 * 1024
    temp_dir = tempfile.mkdtemp()
    try:
        try:
            config_dict = json.loads(config)
        except json.JSONDecodeError as exc:
            raise HTTPException(status_code=400, detail=f"Invalid config JSON: {exc}") from exc
        # Valid JSON that is not an object (e.g. a list) would otherwise fail
        # on .get() below and surface as a 500.
        if not isinstance(config_dict, dict):
            raise HTTPException(status_code=400, detail="Config must be a JSON object")

        trace_paths = []
        for index, trace_file in enumerate(trace_files):
            if not trace_file.filename:
                continue

            if not trace_file.filename.endswith((".json", ".jsonl")):
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid file extension for {trace_file.filename}. Only .json and .jsonl files are allowed.",
                )

            content = await trace_file.read()
            if len(content) > max_upload_bytes:
                raise HTTPException(
                    status_code=400,
                    detail=f"File {trace_file.filename} exceeds 10MB limit",
                )

            # The filename is client-controlled: keep only its final component
            # so it cannot escape temp_dir. Each upload gets its own
            # subdirectory so two uploads with the same name do not overwrite
            # each other.
            safe_name = os.path.basename(trace_file.filename.replace("\\", "/"))
            upload_dir = os.path.join(temp_dir, f"trace_{index}")
            os.makedirs(upload_dir)
            trace_path = os.path.join(upload_dir, safe_name)
            with open(trace_path, "wb") as f:  # noqa: ASYNC230
                f.write(content)
            trace_paths.append(trace_path)

        if not trace_paths:
            raise HTTPException(
                status_code=400,
                detail="No valid trace files provided",
            )

        # Without an explicit format, guess from the first file's extension.
        trace_format = config_dict.get("trace_format")
        if not trace_format:
            first_file = trace_paths[0]
            if first_file.endswith(".jsonl"):
                trace_format = "otlp-json"
            else:
                trace_format = "jaeger-json"

        eval_set_path = None
        if eval_set_file and eval_set_file.filename:
            if not eval_set_file.filename.endswith(".json"):
                raise HTTPException(
                    status_code=400,
                    detail="Invalid file extension for eval set. Only .json files are allowed.",
                )

            content = await eval_set_file.read()
            if len(content) > max_upload_bytes:
                raise HTTPException(
                    status_code=400,
                    detail="Eval set file exceeds 10MB limit",
                )

            # Own subdirectory: the eval set must not overwrite a trace file
            # that happens to share its name.
            eval_set_dir = os.path.join(temp_dir, "eval_set")
            os.makedirs(eval_set_dir)
            eval_set_path = os.path.join(
                eval_set_dir,
                os.path.basename(eval_set_file.filename.replace("\\", "/")),
            )
            with open(eval_set_path, "wb") as f:  # noqa: ASYNC230
                f.write(content)

        metrics = config_dict.get("metrics", ["tool_trajectory_avg_score"])
        if not metrics or not isinstance(metrics, list):
            raise HTTPException(
                status_code=400,
                detail="Config must include 'metrics' as a non-empty array",
            )

        threshold = config_dict.get("threshold")
        # Reject non-numeric values up front (comparing them would raise
        # TypeError and become a 500); the chained comparison also rejects NaN.
        if threshold is not None and (
            not isinstance(threshold, (int, float)) or not (0 <= threshold <= 1)
        ):
            raise HTTPException(
                status_code=400,
                detail="Threshold must be between 0 and 1",
            )

        custom_evaluators: list[CustomEvaluatorDef] = []
        raw_custom = config_dict.get("customEvaluators", config_dict.get("customMetrics", []))
        if raw_custom:
            try:
                custom_evaluators = _parse_custom_evaluators(raw_custom)
            except Exception as exc:
                raise HTTPException(status_code=400, detail=f"Invalid customEvaluators: {exc}") from exc

        eval_config = EvalRunConfig(
            trace_files=trace_paths,
            eval_set_file=eval_set_path,
            metrics=metrics,
            custom_evaluators=custom_evaluators,
            trace_format=trace_format,
            judge_model=config_dict.get("judgeModel"),
            threshold=threshold,
        )

        logger.info("Evaluating %d trace file(s) with metrics: %s", len(trace_paths), metrics)
        result = await run_evaluation(eval_config)

        result_dict = _camel_keys(result.model_dump(by_alias=True))
        return StandardResponse(data=result_dict)

    except HTTPException:
        # Deliberate 4xx responses pass through untouched.
        raise
    except Exception as exc:
        logger.exception("Evaluation failed")
        raise HTTPException(status_code=500, detail=f"Internal error: {exc!s}") from exc

    finally:
        shutil.rmtree(temp_dir)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
@router.post("/evaluate/stream")
async def evaluate_traces_stream(
    trace_files: list[UploadFile] = File(...),
    config: str = Form(...),
    eval_set_file: UploadFile | None = File(None),
):
    """Evaluate traces with real-time progress via SSE.

    The response is a ``text/event-stream``. Input problems are reported as a
    single error event, after which the stream ends. Otherwise the stream
    carries one ``performance_metrics`` event per loaded trace, then progress
    and per-trace progress events while the evaluation runs, and finally a
    done event holding the camelCased result. An evaluation failure is
    reported as an error event.

    Args:
        trace_files: Uploaded trace files (.json or .jsonl, at most 10MB each).
        config: JSON string (an object) with evaluation configuration.
        eval_set_file: Optional golden eval set file (.json, at most 10MB).

    Returns:
        StreamingResponse producing the server-sent events described above.
    """
    max_upload_bytes = 10 * 1024 * 1024

    def sse_error(message: str) -> str:
        """Serialize ``message`` as an SSE error event frame."""
        return f"data: {SSEErrorEvent(error=message).model_dump_json(by_alias=True)}\n\n"

    async def event_generator():
        # NOTE(review): uploads are read lazily inside this generator, i.e.
        # after the endpoint function has returned. Confirm the installed
        # FastAPI/Starlette version keeps UploadFile objects open that long.
        #
        # The temp dir is created here rather than in the endpoint body so it
        # cannot leak if the generator is never iterated.
        temp_dir = tempfile.mkdtemp()
        try:
            try:
                config_dict = json.loads(config)
            except json.JSONDecodeError as exc:
                yield sse_error(f"Invalid config JSON: {exc}")
                return
            # Valid JSON that is not an object would fail on .get() below.
            if not isinstance(config_dict, dict):
                yield sse_error("Config must be a JSON object")
                return

            trace_paths = []
            for index, trace_file in enumerate(trace_files):
                if not trace_file.filename:
                    continue

                if not trace_file.filename.endswith((".json", ".jsonl")):
                    yield sse_error(f"Invalid file extension for {trace_file.filename}")
                    return

                content = await trace_file.read()
                if len(content) > max_upload_bytes:
                    yield sse_error(f"File {trace_file.filename} exceeds 10MB")
                    return

                # The filename is client-controlled: keep only its final
                # component so it cannot escape temp_dir. Each upload gets its
                # own subdirectory so same-named uploads do not collide.
                safe_name = os.path.basename(trace_file.filename.replace("\\", "/"))
                upload_dir = os.path.join(temp_dir, f"trace_{index}")
                os.makedirs(upload_dir)
                trace_path = os.path.join(upload_dir, safe_name)
                with open(trace_path, "wb") as f:  # noqa: ASYNC230
                    f.write(content)
                trace_paths.append(trace_path)

            if not trace_paths:
                yield sse_error("No valid trace files provided")
                return

            # Without an explicit format, guess from the first file's extension.
            trace_format = config_dict.get("trace_format")
            if not trace_format:
                first_file = trace_paths[0]
                if first_file.endswith(".jsonl"):
                    trace_format = "otlp-json"
                else:
                    trace_format = "jaeger-json"

            eval_set_path = None
            if eval_set_file and eval_set_file.filename:
                if not eval_set_file.filename.endswith(".json"):
                    yield sse_error("Invalid file extension for eval set")
                    return

                content = await eval_set_file.read()
                if len(content) > max_upload_bytes:
                    yield sse_error("Eval set file exceeds 10MB")
                    return

                # Own subdirectory: the eval set must not overwrite a trace
                # file that happens to share its name.
                eval_set_dir = os.path.join(temp_dir, "eval_set")
                os.makedirs(eval_set_dir)
                eval_set_path = os.path.join(
                    eval_set_dir,
                    os.path.basename(eval_set_file.filename.replace("\\", "/")),
                )
                with open(eval_set_path, "wb") as f:  # noqa: ASYNC230
                    f.write(content)

            metrics = config_dict.get("metrics", ["tool_trajectory_avg_score"])
            if not metrics or not isinstance(metrics, list):
                yield sse_error("Config must include metrics as a non-empty array")
                return

            threshold = config_dict.get("threshold")
            # Reject non-numeric values up front (comparing them would raise
            # TypeError); the chained comparison also rejects NaN.
            if threshold is not None and (
                not isinstance(threshold, (int, float)) or not (0 <= threshold <= 1)
            ):
                yield sse_error("Threshold must be between 0 and 1")
                return

            custom_evaluators: list[CustomEvaluatorDef] = []
            raw_custom = config_dict.get("customEvaluators", config_dict.get("customMetrics", []))
            if raw_custom:
                try:
                    custom_evaluators = _parse_custom_evaluators(raw_custom)
                except Exception as exc:
                    yield sse_error(f"Invalid customEvaluators: {exc}")
                    return

            eval_config = EvalRunConfig(
                trace_files=trace_paths,
                eval_set_file=eval_set_path,
                metrics=metrics,
                custom_evaluators=custom_evaluators,
                trace_format=trace_format,
                judge_model=config_dict.get("judgeModel"),
                threshold=threshold,
            )

            # Emit performance metrics before the evaluation starts. This is
            # best-effort: a file that fails here is logged and skipped.
            loader = get_loader(eval_config.trace_format)
            for trace_file_path in trace_paths:
                try:
                    traces = loader.load(trace_file_path)
                    for trace in traces:
                        extractor = get_extractor(trace)
                        perf_metrics = _camel_keys(extract_performance_metrics(trace, extractor))
                        trace_metadata = _camel_keys(extract_trace_metadata(trace, extractor))
                        evt = SSEPerformanceMetricsEvent(
                            trace_id=trace.trace_id,
                            performance_metrics=perf_metrics,
                            trace_metadata=trace_metadata,
                        )
                        yield f"event: performance_metrics\ndata: {evt.model_dump_json(by_alias=True)}\n\n"
                except Exception as e:
                    logger.error(f"Failed to extract early performance metrics from {trace_file_path}: {e}")

            # The evaluation runs in a background task and reports through
            # this queue as (tag, payload) tuples.
            queue: asyncio.Queue = asyncio.Queue()

            async def progress_callback(message: str):
                await queue.put(("progress", message))

            async def trace_progress_callback(trace_result):
                await queue.put(("trace_progress", trace_result))

            async def run_with_progress():
                try:
                    result = await run_evaluation(eval_config, progress_callback, trace_progress_callback)
                except Exception as exc:
                    # Hand the failure to the consumer loop; otherwise it
                    # would wait on the queue forever and the stream would
                    # never finish.
                    await queue.put(("error", exc))
                else:
                    await queue.put(("done", result))

            eval_task = asyncio.create_task(run_with_progress())

            try:
                while True:
                    tag, payload = await queue.get()

                    if tag == "done":
                        evt = SSEDoneEvent(
                            result=_camel_keys(payload.model_dump(by_alias=True)),
                        )
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                        break
                    elif tag == "trace_progress":
                        evt = SSETraceProgressEvent(
                            trace_progress=SSETraceProgress(
                                trace_id=payload.trace_id,
                                partial_result=_camel_keys(payload.model_dump(by_alias=True)),
                            )
                        )
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                    elif tag == "progress":
                        evt = SSEProgressEvent(message=payload)
                        yield f"data: {evt.model_dump_json(by_alias=True)}\n\n"
                    elif tag == "error":
                        # Re-raise so the outer handler logs it and emits
                        # the error event.
                        raise payload
            finally:
                # Stop the evaluation if the stream ends early (for example
                # when the generator is closed before the done event).
                if not eval_task.done():
                    eval_task.cancel()
                    try:
                        await eval_task
                    except asyncio.CancelledError:
                        pass

        except Exception as exc:
            logger.exception("Evaluation stream failed")
            yield sse_error(str(exc))

        finally:
            shutil.rmtree(temp_dir)

    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        },
    )
|