agentevals-cli 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentevals/__init__.py +16 -0
- agentevals/_protocol.py +83 -0
- agentevals/api/__init__.py +0 -0
- agentevals/api/app.py +137 -0
- agentevals/api/debug_routes.py +268 -0
- agentevals/api/models.py +204 -0
- agentevals/api/otlp_app.py +25 -0
- agentevals/api/otlp_routes.py +383 -0
- agentevals/api/routes.py +554 -0
- agentevals/api/streaming_routes.py +373 -0
- agentevals/builtin_metrics.py +234 -0
- agentevals/cli.py +643 -0
- agentevals/config.py +108 -0
- agentevals/converter.py +328 -0
- agentevals/custom_evaluators.py +468 -0
- agentevals/eval_config_loader.py +147 -0
- agentevals/evaluator/__init__.py +24 -0
- agentevals/evaluator/resolver.py +70 -0
- agentevals/evaluator/sources.py +293 -0
- agentevals/evaluator/templates.py +224 -0
- agentevals/extraction.py +444 -0
- agentevals/genai_converter.py +538 -0
- agentevals/loader/__init__.py +7 -0
- agentevals/loader/base.py +53 -0
- agentevals/loader/jaeger.py +112 -0
- agentevals/loader/otlp.py +193 -0
- agentevals/mcp_server.py +236 -0
- agentevals/output.py +204 -0
- agentevals/runner.py +310 -0
- agentevals/sdk.py +433 -0
- agentevals/streaming/__init__.py +120 -0
- agentevals/streaming/incremental_processor.py +337 -0
- agentevals/streaming/processor.py +285 -0
- agentevals/streaming/session.py +36 -0
- agentevals/streaming/ws_server.py +806 -0
- agentevals/trace_attrs.py +32 -0
- agentevals/trace_metrics.py +126 -0
- agentevals/utils/__init__.py +0 -0
- agentevals/utils/genai_messages.py +142 -0
- agentevals/utils/log_buffer.py +43 -0
- agentevals/utils/log_enrichment.py +187 -0
- agentevals_cli-0.5.2.dist-info/METADATA +22 -0
- agentevals_cli-0.5.2.dist-info/RECORD +46 -0
- agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
- agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
- agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
agentevals/output.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Output formatting for evaluation results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from .runner import MetricResult, RunResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def format_results(run_result: RunResult, fmt: str = "table") -> str:
    """Render an evaluation run as text in the requested format.

    ``"json"`` and ``"summary"`` select their dedicated formatters. Any other
    value, including the default ``"table"``, produces the table layout.
    """
    if fmt == "json":
        return _format_json(run_result)
    if fmt == "summary":
        return _format_summary(run_result)
    return _format_table(run_result)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _format_table(run_result: RunResult) -> str:
    """Render a run as one metric table per trace plus performance figures.

    When ``tabulate`` cannot be imported the summary layout is returned
    instead.
    """
    try:
        from tabulate import tabulate
    except ImportError:
        return _format_summary(run_result)

    def _score_text(value) -> str:
        # Scores may be absent; those are shown as "N/A".
        return "N/A" if value is None else f"{value:.4f}"

    def _percentiles(stats, unit: str) -> str:
        return f"p50={stats['p50']:.0f}{unit}, p95={stats['p95']:.0f}{unit}, p99={stats['p99']:.0f}{unit}"

    out: list[str] = []

    if run_result.errors:
        out.append("Errors:")
        out.extend(f" - {err}" for err in run_result.errors)
        out.append("")

    for trace in run_result.trace_results:
        out.append(f"Trace: {trace.trace_id}")
        out.append(f"Invocations: {trace.num_invocations}")
        out.extend(f" Warning: {warning}" for warning in trace.conversion_warnings or ())

        rows = [
            [
                _status_icon(metric.eval_status),
                metric.metric_name,
                _score_text(metric.score),
                metric.eval_status,
                ", ".join(_score_text(s) for s in metric.per_invocation_scores or ()),
                metric.error or "",
            ]
            for metric in trace.metric_results
        ]

        if rows:
            out.append(
                tabulate(
                    rows,
                    headers=["", "Metric", "Score", "Status", "Per-Invocation", "Error"],
                    tablefmt="simple",
                )
            )
            # Failed metrics that carry details get an expanded explanation
            # directly beneath the table.
            out.extend(
                _format_metric_details(metric)
                for metric in trace.metric_results
                if metric.details and metric.eval_status == "FAILED"
            )
        out.append("")

        trace_perf = trace.performance_metrics
        if trace_perf:
            out.append("\n Performance Metrics:")

            latency = trace_perf["latency"]
            out.append(f" Overall Latency: {_percentiles(latency['overall'], 'ms')}")
            out.append(f" LLM Latency: {_percentiles(latency['llm_calls'], 'ms')}")
            out.append(f" Tool Latency: {_percentiles(latency['tool_executions'], 'ms')}")

            tokens = trace_perf["tokens"]
            out.append(
                f" Tokens: {tokens['total']} total ({tokens['total_prompt']} prompt + {tokens['total_output']} output)"
            )
            out.append(f" Per LLM Call: {_percentiles(tokens['per_llm_call'], '')}")

            out.append("")

    run_perf = run_result.performance_metrics
    if run_perf:
        run_tokens = run_perf["tokens"]
        average = run_tokens["avg_per_trace"]
        out.append("Overall Performance:")
        out.append(
            f" Total Tokens: {run_tokens['total']} ({run_tokens['total_prompt']} prompt + {run_tokens['total_output']} output)"
        )
        out.append(f" Avg per Trace: {average['prompt']:.0f} prompt, {average['output']:.0f} output")
        out.append("")

    return "\n".join(out)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _format_json(run_result: RunResult) -> str:
    """Serialize a run to a JSON document indented by two spaces.

    The document has ``traces`` and ``errors`` keys, plus
    ``performance_metrics`` when the run has any. Each metric's ``details``
    and each trace's ``performance_metrics`` are included only when truthy.
    """

    def _metric_payload(metric) -> dict:
        payload = {
            "metric_name": metric.metric_name,
            "score": metric.score,
            "eval_status": metric.eval_status,
            "per_invocation_scores": metric.per_invocation_scores,
            "error": metric.error,
        }
        if metric.details:
            payload["details"] = metric.details
        return payload

    traces = []
    for trace in run_result.trace_results:
        entry: dict[str, Any] = {
            "trace_id": trace.trace_id,
            "num_invocations": trace.num_invocations,
            "conversion_warnings": trace.conversion_warnings,
            "metrics": [_metric_payload(metric) for metric in trace.metric_results],
        }
        if trace.performance_metrics:
            entry["performance_metrics"] = trace.performance_metrics
        traces.append(entry)

    document: dict[str, Any] = {"traces": traces, "errors": run_result.errors}
    if run_result.performance_metrics:
        document["performance_metrics"] = run_result.performance_metrics

    return json.dumps(document, indent=2)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _format_summary(run_result: RunResult) -> str:
    """Render a run as plain text with one line per metric.

    Run-level errors come first. Each trace then gets a header line followed
    by its metrics, shown as an error, a four-decimal score, or ``N/A``.
    """
    out: list[str] = []

    if run_result.errors:
        out.append("Errors:")
        out.extend(f" - {err}" for err in run_result.errors)
        out.append("")

    for trace in run_result.trace_results:
        out.append(f"Trace {trace.trace_id} ({trace.num_invocations} invocations):")
        for metric in trace.metric_results:
            prefix = f" {_status_icon(metric.eval_status)} {metric.metric_name}:"
            if metric.error:
                # An error message takes precedence over any score.
                out.append(f"{prefix} ERROR - {metric.error}")
            elif metric.score is None:
                out.append(f"{prefix} N/A ({metric.eval_status})")
            else:
                out.append(f"{prefix} {metric.score:.4f} ({metric.eval_status})")
        out.append("")

    return "\n".join(out)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _format_metric_details(mr: MetricResult) -> str:
    """Describe tool-trajectory mismatches recorded in a metric's details.

    Only ``tool_trajectory_avg_score`` results with non-empty ``details``
    produce output; any other metric yields an empty string. For every
    comparison whose ``matched`` flag is false, the expected and actual tool
    calls are listed, using ``(none)`` when a side has no calls.

    ``comparisons``, ``expected`` and ``actual`` may be missing or ``None``,
    and a tool entry may omit ``args``; all of these are treated as empty
    rather than raising.
    """
    if mr.metric_name != "tool_trajectory_avg_score" or not mr.details:
        return ""

    def _tool_lines(tools) -> list[str]:
        # Render one side (expected or actual) of a comparison.
        if not tools:
            return [" (none)"]
        rendered = []
        for tool in tools:
            args = tool.get("args")
            args_str = json.dumps(args) if args else "{}"
            rendered.append(f" - {tool['name']}({args_str})")
        return rendered

    lines: list[str] = []
    # Invocations are numbered from 1 for display.
    for i, comp in enumerate(mr.details.get("comparisons") or [], 1):
        if comp.get("matched", True):
            continue
        lines.append(f" Invocation {i} trajectory mismatch:")
        lines.append(" Expected:")
        lines.extend(_tool_lines(comp.get("expected")))
        lines.append(" Actual:")
        lines.extend(_tool_lines(comp.get("actual")))

    return "\n".join(lines)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _status_icon(status: str) -> str:
    """Return the bracketed tag shown for an evaluation status.

    Statuses other than ``PASSED``, ``FAILED`` and ``NOT_EVALUATED`` map to
    ``[????]``.
    """
    icons = {
        "PASSED": "[PASS]",
        "FAILED": "[FAIL]",
        "NOT_EVALUATED": "[----]",
    }
    if status in icons:
        return icons[status]
    return "[????]"
|
agentevals/runner.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
"""Evaluation runner — orchestrates trace loading, conversion, and scoring."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
from collections.abc import Awaitable, Callable
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from google.adk.evaluation.eval_case import Invocation
|
|
12
|
+
from google.adk.evaluation.eval_set import EvalSet
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
14
|
+
from pydantic.alias_generators import to_camel
|
|
15
|
+
|
|
16
|
+
from .builtin_metrics import evaluate_builtin_metric
|
|
17
|
+
from .config import (
|
|
18
|
+
CustomEvaluatorDef,
|
|
19
|
+
EvalRunConfig,
|
|
20
|
+
)
|
|
21
|
+
from .converter import ConversionResult, convert_traces
|
|
22
|
+
from .loader.base import TraceLoader
|
|
23
|
+
from .loader.jaeger import JaegerJsonLoader
|
|
24
|
+
from .loader.otlp import OtlpJsonLoader
|
|
25
|
+
from .trace_metrics import extract_performance_metrics
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
ProgressCallback = Callable[[str], Awaitable[None]]
|
|
30
|
+
TraceProgressCallback = Callable[["TraceResult"], Awaitable[None]]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class MetricResult(BaseModel):
    # Outcome of a single metric evaluated against one trace.
    # Fields get camelCase aliases via ``to_camel``; ``populate_by_name`` lets
    # the snake_case field names be used on input as well.
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    # Name of the metric (or custom evaluator) that produced this result.
    metric_name: str
    # Aggregate score; None when no score is available.
    score: float | None = None
    # Status string; the output formatters recognise "PASSED", "FAILED" and
    # "NOT_EVALUATED".
    eval_status: str = "NOT_EVALUATED"
    # One score per invocation; individual entries may be None.
    per_invocation_scores: list[float | None] = Field(default_factory=list)
    # Error message, if one was recorded for this metric.
    error: str | None = None
    # Optional metric-specific payload. The table formatter reads a
    # "comparisons" list from it for "tool_trajectory_avg_score".
    details: dict[str, Any] | None = None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class TraceResult(BaseModel):
    # Evaluation results for a single trace.
    # Fields get camelCase aliases via ``to_camel``; ``populate_by_name`` lets
    # the snake_case field names be used on input as well.
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    # Identifier of the trace these results belong to.
    trace_id: str
    # Number of invocations extracted from the trace.
    num_invocations: int = 0
    # One entry per evaluated metric; filled in as evaluations finish.
    metric_results: list[MetricResult] = Field(default_factory=list)
    # Warnings reported while converting the trace.
    conversion_warnings: list[str] = Field(default_factory=list)
    # Performance figures for this trace; the formatters read its "latency"
    # and "tokens" entries when it is set.
    performance_metrics: dict[str, Any] | None = None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class RunResult(BaseModel):
    # Result of a whole evaluation run across all loaded traces.
    # Fields get camelCase aliases via ``to_camel``; ``populate_by_name`` lets
    # the snake_case field names be used on input as well.
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    # Per-trace results for every trace that was evaluated successfully.
    trace_results: list[TraceResult] = Field(default_factory=list)
    # Run-level error messages (file load failures, unexpected trace errors).
    errors: list[str] = Field(default_factory=list)
    # Token totals aggregated over traces; set only when at least one trace
    # reported performance metrics.
    performance_metrics: dict[str, Any] | None = None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_loader(format_name: str) -> TraceLoader:
    """Instantiate the trace loader registered for ``format_name``.

    Raises:
        ValueError: if ``format_name`` is not a known trace format.
    """
    registry: dict[str, type[TraceLoader]] = {
        "jaeger-json": JaegerJsonLoader,
        "otlp-json": OtlpJsonLoader,
    }
    loader_cls = registry.get(format_name)
    if loader_cls is None:
        raise ValueError(f"Unknown trace format '{format_name}'. Available: {list(registry)}")
    return loader_cls()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def load_eval_set(path: str) -> EvalSet:
    """Read the JSON file at ``path`` and validate it as an ``EvalSet``.

    The file is decoded as UTF-8 explicitly, because the platform default
    encoding is locale-dependent and can mis-decode non-ASCII content.

    Errors from opening the file, parsing the JSON, or validating the model
    propagate to the caller.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return EvalSet.model_validate(data)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
async def run_evaluation(
    config: EvalRunConfig,
    progress_callback: ProgressCallback | None = None,
    trace_progress_callback: TraceProgressCallback | None = None,
) -> RunResult:
    """Load, convert and score every trace named in ``config``.

    Steps:
      1. Load each file in ``config.trace_files`` with the loader selected by
         ``config.trace_format``. A file that fails to load is recorded in
         ``errors`` and skipped.
      2. Convert the traces and extract per-trace performance metrics.
      3. Optionally load the eval set. A load failure is recorded and the run
         continues without expected invocations.
      4. Evaluate traces concurrently, bounded by
         ``config.max_concurrent_traces`` and ``config.max_concurrent_evals``.
      5. Aggregate token usage across traces that reported performance
         metrics.

    Args:
        config: Run configuration (files, format, metrics, limits).
        progress_callback: Optional coroutine function awaited with
            human-readable progress messages.
        trace_progress_callback: Optional coroutine function forwarded to the
            per-trace evaluation, which awaits it with the partial
            ``TraceResult`` after each metric finishes.

    Returns:
        A ``RunResult``. If no trace could be loaded it contains only errors.

    Raises:
        ValueError: if ``config.trace_format`` is not a known format.
    """
    result = RunResult()

    loader = get_loader(config.trace_format)
    all_traces = []
    for trace_file in config.trace_files:
        try:
            traces = loader.load(trace_file)
            all_traces.extend(traces)
        except Exception as exc:
            # One unreadable file must not abort the whole run.
            msg = f"Failed to load trace file '{trace_file}': {exc}"
            logger.error(msg)
            result.errors.append(msg)

    if not all_traces:
        result.errors.append("No traces loaded.")
        return result

    conversion_results = convert_traces(all_traces)

    trace_map = {t.trace_id: t for t in all_traces}

    perf_metrics_map: dict[str, dict[str, Any]] = {}
    for trace in all_traces:
        perf_metrics_map[trace.trace_id] = extract_performance_metrics(trace)

    eval_set: EvalSet | None = None
    if config.eval_set_file:
        try:
            eval_set = load_eval_set(config.eval_set_file)
        except Exception as exc:
            msg = f"Failed to load eval set '{config.eval_set_file}': {exc}"
            logger.error(msg)
            result.errors.append(msg)

    total_traces = len(conversion_results)
    if progress_callback:
        await progress_callback(f"Evaluating {total_traces} trace{'s' if total_traces != 1 else ''}...")

    # Two independent limits: how many traces are in flight, and how many
    # individual metric evaluations run at once across all traces.
    trace_semaphore = asyncio.Semaphore(config.max_concurrent_traces)
    eval_semaphore = asyncio.Semaphore(config.max_concurrent_evals)

    async def _evaluate_trace_bounded(idx: int, conv_result: ConversionResult) -> TraceResult:
        async with trace_semaphore:
            if progress_callback:
                # Long trace ids are shortened for progress messages.
                trace_id_short = (
                    conv_result.trace_id[:12] + "..." if len(conv_result.trace_id) > 12 else conv_result.trace_id
                )
                await progress_callback(f"Trace {idx + 1}/{total_traces}: {trace_id_short}")

            trace = trace_map.get(conv_result.trace_id)

            return await _evaluate_trace(
                conv_result=conv_result,
                metrics=config.metrics,
                custom_evaluators=config.custom_evaluators,
                eval_set=eval_set,
                judge_model=config.judge_model,
                threshold=config.threshold,
                eval_semaphore=eval_semaphore,
                progress_callback=progress_callback,
                trace_progress_callback=trace_progress_callback,
                trace=trace,
                performance_metrics=perf_metrics_map.get(conv_result.trace_id),
            )

    trace_results = await asyncio.gather(
        *[_evaluate_trace_bounded(idx, conv_result) for idx, conv_result in enumerate(conversion_results)],
        return_exceptions=True,
    )

    for tr in trace_results:
        # With return_exceptions=True, gather may hand back any BaseException
        # (e.g. asyncio.CancelledError), not only Exception subclasses. Those
        # must not be stored as trace results.
        if isinstance(tr, BaseException):
            logger.error("Unexpected error evaluating trace: %r", tr)
            # Some exceptions have an empty message; fall back to repr.
            result.errors.append(str(tr) or repr(tr))
        else:
            result.trace_results.append(tr)

    if progress_callback:
        await progress_callback("Evaluation complete")

    if result.trace_results:
        all_tokens = {"prompt": [], "output": [], "total": []}

        for tr in result.trace_results:
            if tr.performance_metrics:
                perf = tr.performance_metrics
                all_tokens["prompt"].append(perf["tokens"]["total_prompt"])
                all_tokens["output"].append(perf["tokens"]["total_output"])
                all_tokens["total"].append(perf["tokens"]["total"])

        if all_tokens["total"]:
            result.performance_metrics = {
                "tokens": {
                    "total_prompt": sum(all_tokens["prompt"]),
                    "total_output": sum(all_tokens["output"]),
                    "total": sum(all_tokens["total"]),
                    # Averages cover only traces that reported metrics.
                    "avg_per_trace": {
                        "prompt": sum(all_tokens["prompt"]) / len(all_tokens["prompt"]),
                        "output": sum(all_tokens["output"]) / len(all_tokens["output"]),
                    },
                },
                "trace_count": len(result.trace_results),
            }

    return result
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
async def _evaluate_trace(
    conv_result: ConversionResult,
    metrics: list[str],
    custom_evaluators: list[CustomEvaluatorDef],
    eval_set: EvalSet | None,
    judge_model: str | None,
    threshold: float | None,
    eval_semaphore: asyncio.Semaphore,
    progress_callback: ProgressCallback | None = None,
    trace_progress_callback: TraceProgressCallback | None = None,
    trace=None,
    performance_metrics: dict[str, Any] | None = None,
) -> TraceResult:
    """Run every built-in metric and custom evaluator against one trace.

    Each evaluation acquires ``eval_semaphore``. As each one finishes, its
    result is appended to the trace result and ``trace_progress_callback``
    (if given) is awaited with the partially filled ``TraceResult``.

    A trace with no invocations yields a single ``"(all)"`` metric result
    carrying an error message, and nothing is evaluated. ``trace`` is
    accepted but not used by this function.
    """
    outcome = TraceResult(
        trace_id=conv_result.trace_id,
        num_invocations=len(conv_result.invocations),
        conversion_warnings=conv_result.warnings,
    )
    if performance_metrics:
        outcome.performance_metrics = performance_metrics

    actual_invocations = conv_result.invocations
    if not actual_invocations:
        outcome.metric_results.append(
            MetricResult(
                metric_name="(all)",
                error="No invocations extracted from trace.",
            )
        )
        return outcome

    expected_invocations: list[Invocation] | None = (
        _find_expected_invocations(actual_invocations, eval_set) if eval_set else None
    )

    async def _record(metric_result: MetricResult) -> MetricResult:
        # Store the result, then let the observer see the partial trace.
        outcome.metric_results.append(metric_result)
        if trace_progress_callback:
            await trace_progress_callback(outcome)
        return metric_result

    async def _score_builtin(name: str) -> MetricResult:
        async with eval_semaphore:
            if progress_callback:
                await progress_callback(f"Running {name}...")
            scored = await evaluate_builtin_metric(
                metric_name=name,
                actual_invocations=actual_invocations,
                expected_invocations=expected_invocations,
                judge_model=judge_model,
                threshold=threshold,
            )
            return await _record(scored)

    async def _score_custom(definition: CustomEvaluatorDef) -> MetricResult:
        async with eval_semaphore:
            if progress_callback:
                await progress_callback(f"Running {definition.name}...")
            from .custom_evaluators import evaluate_custom_evaluator

            scored = await evaluate_custom_evaluator(
                evaluator_def=definition,
                actual_invocations=actual_invocations,
                expected_invocations=expected_invocations,
            )
            return await _record(scored)

    pending = [
        *(_score_builtin(name) for name in metrics),
        *(_score_custom(definition) for definition in custom_evaluators),
    ]
    await asyncio.gather(*pending)

    return outcome
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _find_expected_invocations(
    actual_invocations: list[Invocation],
    eval_set: EvalSet,
) -> list[Invocation] | None:
    """Pick the eval-case conversation to compare the actual invocations with.

    With a single eval case, that case is used. With several, the first case
    whose opening user text equals the actual opening user text (ignoring
    case and surrounding whitespace) is used. If the actual text is missing
    or nothing matches, the first eval case is the fallback. An empty
    conversation is returned as ``None``.
    """
    cases = eval_set.eval_cases
    if not cases:
        return None

    fallback = cases[0].conversation or None
    if len(cases) == 1:
        return fallback

    user_text = _get_user_text(actual_invocations[0]) if actual_invocations else None
    if not user_text:
        return fallback

    for candidate in cases:
        conversation = candidate.conversation
        if not conversation:
            continue
        candidate_text = _get_user_text(conversation[0])
        if candidate_text and _text_matches(user_text, candidate_text):
            return conversation

    logger.warning(
        "No matching eval case found for user text: '%s'. Using first eval case.",
        user_text[:100],
    )
    return fallback
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def _get_user_text(invocation: Invocation) -> str | None:
    """Join the non-empty text parts of an invocation's user content.

    Returns ``None`` when there is no user content, no parts, or no part
    with text.
    """
    content = invocation.user_content
    if not content or not content.parts:
        return None
    joined = " ".join(part.text for part in content.parts if part.text)
    return joined or None
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _text_matches(a: str, b: str) -> bool:
    """Return True when ``a`` and ``b`` are equal ignoring case and edge whitespace.

    ``str.casefold`` is used instead of ``str.lower`` so that caseless
    comparison also works for non-ASCII text (for example ``"ß"`` and
    ``"SS"``).
    """
    return a.strip().casefold() == b.strip().casefold()
|