agentevals-cli 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentevals/__init__.py +16 -0
- agentevals/_protocol.py +83 -0
- agentevals/api/__init__.py +0 -0
- agentevals/api/app.py +137 -0
- agentevals/api/debug_routes.py +268 -0
- agentevals/api/models.py +204 -0
- agentevals/api/otlp_app.py +25 -0
- agentevals/api/otlp_routes.py +383 -0
- agentevals/api/routes.py +554 -0
- agentevals/api/streaming_routes.py +373 -0
- agentevals/builtin_metrics.py +234 -0
- agentevals/cli.py +643 -0
- agentevals/config.py +108 -0
- agentevals/converter.py +328 -0
- agentevals/custom_evaluators.py +468 -0
- agentevals/eval_config_loader.py +147 -0
- agentevals/evaluator/__init__.py +24 -0
- agentevals/evaluator/resolver.py +70 -0
- agentevals/evaluator/sources.py +293 -0
- agentevals/evaluator/templates.py +224 -0
- agentevals/extraction.py +444 -0
- agentevals/genai_converter.py +538 -0
- agentevals/loader/__init__.py +7 -0
- agentevals/loader/base.py +53 -0
- agentevals/loader/jaeger.py +112 -0
- agentevals/loader/otlp.py +193 -0
- agentevals/mcp_server.py +236 -0
- agentevals/output.py +204 -0
- agentevals/runner.py +310 -0
- agentevals/sdk.py +433 -0
- agentevals/streaming/__init__.py +120 -0
- agentevals/streaming/incremental_processor.py +337 -0
- agentevals/streaming/processor.py +285 -0
- agentevals/streaming/session.py +36 -0
- agentevals/streaming/ws_server.py +806 -0
- agentevals/trace_attrs.py +32 -0
- agentevals/trace_metrics.py +126 -0
- agentevals/utils/__init__.py +0 -0
- agentevals/utils/genai_messages.py +142 -0
- agentevals/utils/log_buffer.py +43 -0
- agentevals/utils/log_enrichment.py +187 -0
- agentevals_cli-0.5.2.dist-info/METADATA +22 -0
- agentevals_cli-0.5.2.dist-info/RECORD +46 -0
- agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
- agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
- agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
agentevals/cli.py
ADDED
|
@@ -0,0 +1,643 @@
|
|
|
1
|
+
"""CLI entry point for agentevals.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
agentevals run samples/helm.json --eval-set samples/eval_set_helm.json
|
|
6
|
+
agentevals run samples/helm.json -m tool_trajectory_avg_score -m response_match_score
|
|
7
|
+
agentevals run samples/helm.json --eval-set samples/eval_set_helm.json --output json
|
|
8
|
+
agentevals list-metrics
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import logging
|
|
15
|
+
import os
|
|
16
|
+
import sys
|
|
17
|
+
from datetime import datetime, timezone
|
|
18
|
+
|
|
19
|
+
import click
|
|
20
|
+
|
|
21
|
+
from . import __version__
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _relative_time(iso_str: str | None) -> str:
|
|
25
|
+
"""Format an ISO 8601 timestamp as a human-readable relative time string."""
|
|
26
|
+
if not iso_str:
|
|
27
|
+
return ""
|
|
28
|
+
try:
|
|
29
|
+
dt = datetime.fromisoformat(iso_str.replace("Z", "+00:00"))
|
|
30
|
+
delta = datetime.now(timezone.utc) - dt
|
|
31
|
+
seconds = int(delta.total_seconds())
|
|
32
|
+
if seconds < 0:
|
|
33
|
+
return "just now"
|
|
34
|
+
if seconds < 60:
|
|
35
|
+
return f"{seconds}s ago"
|
|
36
|
+
minutes = seconds // 60
|
|
37
|
+
if minutes < 60:
|
|
38
|
+
return f"{minutes}m ago"
|
|
39
|
+
hours = minutes // 60
|
|
40
|
+
if hours < 24:
|
|
41
|
+
return f"{hours}h ago"
|
|
42
|
+
days = hours // 24
|
|
43
|
+
if days < 30:
|
|
44
|
+
return f"{days}d ago"
|
|
45
|
+
months = days // 30
|
|
46
|
+
if months < 12:
|
|
47
|
+
return f"{months}mo ago"
|
|
48
|
+
years = days // 365
|
|
49
|
+
return f"{years}y ago"
|
|
50
|
+
except (ValueError, TypeError):
|
|
51
|
+
return ""
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@click.group()
@click.version_option(version=__version__, prog_name="agentevals")
@click.option(
    "-v",
    "--verbose",
    count=True,
    help="Increase verbosity (-v for INFO, -vv for DEBUG).",
)
def main(verbose: int) -> None:
    """agentevals: Evaluate agent traces using ADK's scoring framework."""
    # Map the repeat count of -v onto a logging level; no flag means WARNING.
    if verbose >= 2:
        chosen_level = logging.DEBUG
    elif verbose == 1:
        chosen_level = logging.INFO
    else:
        chosen_level = logging.WARNING

    logging.basicConfig(
        format="%(levelname)s %(name)s: %(message)s",
        level=chosen_level,
    )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@main.command()
@click.argument("trace_files", nargs=-1, required=True, type=click.Path(exists=True))
@click.option(
    "--eval-set",
    "-e",
    type=click.Path(exists=True),
    default=None,
    help="Path to a golden eval set JSON file (ADK EvalSet format).",
)
@click.option(
    "--metric",
    "-m",
    multiple=True,
    default=None,
    help="Metric(s) to evaluate. Can be specified multiple times. Default: tool_trajectory_avg_score.",
)
@click.option(
    "--format",
    "-f",
    "trace_format",
    default="jaeger-json",
    help="Trace file format.",
)
@click.option(
    "--judge-model",
    "-j",
    default=None,
    help="LLM model for judge-based metrics (default: gemini-2.5-flash).",
)
@click.option(
    "--threshold",
    "-t",
    type=float,
    default=None,
    help="Score threshold for pass/fail.",
)
@click.option(
    "--output",
    "-o",
    type=click.Choice(["table", "json", "summary"]),
    default="table",
    help="Output format.",
)
@click.option(
    "--config",
    "-c",
    "config_file",
    type=click.Path(exists=True),
    default=None,
    help="Path to an eval config YAML file defining metrics (including custom).",
)
def run(
    trace_files: tuple[str, ...],
    eval_set: str | None,
    metric: tuple[str, ...] | None,
    trace_format: str,
    judge_model: str | None,
    threshold: float | None,
    output: str,
    config_file: str | None,
) -> None:
    """Evaluate trace file(s) against specified metrics."""
    from .config import EvalRunConfig
    from .output import format_results
    from .runner import run_evaluation

    requested_metrics = list(metric) if metric else []

    # Everything except ``metrics`` is identical on both code paths below.
    shared_settings = {
        "trace_files": list(trace_files),
        "eval_set_file": eval_set,
        "trace_format": trace_format,
        "judge_model": judge_model,
        "threshold": threshold,
        "output_format": output,
    }

    if not config_file:
        # No config file: fall back to the default metric when none was given.
        config = EvalRunConfig(
            metrics=requested_metrics or ["tool_trajectory_avg_score"],
            **shared_settings,
        )
    else:
        from .eval_config_loader import load_eval_config, merge_configs

        # With a config file the CLI metrics are passed through as-is (possibly
        # empty) and combined with the file's settings by merge_configs.
        loaded_config = load_eval_config(config_file)
        cli_overrides = EvalRunConfig(metrics=requested_metrics, **shared_settings)
        config = merge_configs(loaded_config, cli_overrides)

    report = asyncio.run(run_evaluation(config))
    click.echo(format_results(report, fmt=output))

    # Exit non-zero when any metric failed or errored, or the run itself
    # recorded errors.
    every_metric_result = (
        metric_result
        for trace_result in report.trace_results
        for metric_result in trace_result.metric_results
    )
    any_metric_failed = any(
        metric_result.eval_status == "FAILED" or metric_result.error
        for metric_result in every_metric_result
    )
    if any_metric_failed or report.errors:
        sys.exit(1)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
@main.command("list-metrics")
def list_metrics() -> None:
    """List all available evaluation metrics.

    DEPRECATED: use ``agentevals evaluator list --source builtin`` instead.
    """
    click.echo(
        "Note: list-metrics is deprecated. Use 'agentevals evaluator list --source builtin' instead.\n",
        err=True,
    )
    try:
        from google.adk.evaluation.metric_evaluator_registry import (
            DEFAULT_METRIC_EVALUATOR_REGISTRY,
        )

        metrics = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_registered_metrics()
        click.echo("Available metrics:\n")
        for m in metrics:
            desc = m.description or "No description"
            click.echo(f" {m.metric_name}")
            click.echo(f" {desc}")
            if m.metric_value_info and m.metric_value_info.interval:
                # Render the interval in math notation: "(" / ")" for open
                # ends, "[" / "]" for closed ends.
                iv = m.metric_value_info.interval
                lo = f"{'(' if iv.open_at_min else '['}{iv.min_value}"
                hi = f"{iv.max_value}{')' if iv.open_at_max else ']'}"
                click.echo(f" Value range: {lo}, {hi}")
            click.echo()
    except ImportError as exc:
        click.echo(
            f"Could not load full metric registry ({exc}).\n"
            "Some eval dependencies may be missing. Install with:\n"
            ' pip install "google-adk[eval]"\n'
        )
        # The fallback needs google-adk as well. Import it before printing the
        # header so a missing package yields a clean CLI error instead of a
        # traceback after a half-printed list.
        try:
            from google.adk.evaluation.eval_metrics import PrebuiltMetrics
        except ImportError as fallback_exc:
            raise click.ClickException(
                f"Could not load the built-in metric names either ({fallback_exc})."
            ) from fallback_exc

        click.echo("Known built-in metrics:\n")
        for pm in PrebuiltMetrics:
            click.echo(f" {pm.value}")
        click.echo()
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# ---------------------------------------------------------------------------
|
|
221
|
+
# agentevals evaluator ...
|
|
222
|
+
# ---------------------------------------------------------------------------
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
@main.group()
def evaluator() -> None:
    """Manage evaluators: scaffold, list, and discover."""
    # Intentionally empty: this function only creates the ``evaluator`` command
    # group; subcommands attach themselves through ``@evaluator.command(...)``.
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
@evaluator.command("init")
@click.argument("name")
@click.option(
    "--output-dir",
    "-o",
    type=click.Path(),
    default=".",
    help="Parent directory for the new evaluator folder (default: current directory).",
)
@click.option(
    "--runtime",
    "-r",
    default=None,
    help="Language runtime: py, js, ts (default: inferred from name or py).",
)
def evaluator_init(name: str, output_dir: str, runtime: str | None) -> None:
    """Scaffold a new evaluator with boilerplate code and an evaluator.yaml manifest.

    NAME is the evaluator name. If it ends with a recognized extension (.py, .js,
    .ts) the language is inferred automatically; otherwise use --runtime.

    \b
    Examples:
    agentevals evaluator init my_evaluator
    agentevals evaluator init my_evaluator.ts
    agentevals evaluator init my_evaluator --runtime js
    """
    from pathlib import Path as _Path

    from .evaluator.templates import scaffold_evaluator

    try:
        evaluator_dir = scaffold_evaluator(name, output_dir=_Path(output_dir), runtime=runtime)
    except (ValueError, OSError) as exc:
        # Surface scaffolding problems as a regular CLI error.
        raise click.ClickException(str(exc)) from exc

    parent_dir = evaluator_dir.parent

    click.echo(f"Created evaluator in {evaluator_dir}/")
    click.echo()
    click.echo("Files:")
    for created in sorted(evaluator_dir.iterdir()):
        click.echo(f" {created.relative_to(parent_dir)}")

    for line in (
        "",
        "Next steps:",
        " 1. Implement your scoring logic in the generated code file",
        " 2. Add it to your eval_config.yaml under 'evaluators':",
        "",
    ):
        click.echo(line)

    # Use the first generated source file (in directory-iteration order) as
    # the example ``path`` in the suggested config snippet.
    first_code_file = next(
        (entry for entry in evaluator_dir.iterdir() if entry.suffix in (".py", ".js", ".ts")),
        None,
    )
    if first_code_file is not None:
        snippet_path = first_code_file.relative_to(parent_dir)
        for line in (
            " evaluators:",
            f" - name: {evaluator_dir.name}",
            " type: code",
            f" path: ./{snippet_path}",
            " threshold: 0.5",
            "",
        ):
            click.echo(line)

    click.echo(" 3. Run: agentevals run <trace_file> --config eval_config.yaml")
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
@evaluator.command("runtimes")
def evaluator_runtimes() -> None:
    """Show supported language runtimes and execution environments."""
    from .custom_evaluators import _EXECUTOR_FACTORIES, get_runtimes

    click.echo("Language runtimes:\n")
    for runtime in get_runtimes():
        extension_list = ", ".join(runtime.extensions)
        status = "available" if runtime.is_available() else "not found"
        click.echo(f" {runtime.name:<12} extensions: {extension_list:<16} ({status})")

    click.echo("\nExecutors:\n")
    # Executor names are listed alphabetically.
    executor_names = sorted(_EXECUTOR_FACTORIES)
    for executor_name in executor_names:
        click.echo(f" {executor_name}")

    click.echo()
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
@evaluator.command("list")
@click.option(
    "--source",
    "-s",
    type=click.Choice(["all", "builtin", "github"]),
    default="all",
    help="Filter evaluators by source (default: all).",
)
@click.option(
    "--refresh",
    is_flag=True,
    default=False,
    help="Ignore cached results and fetch fresh data.",
)
def evaluator_list(source: str, refresh: bool) -> None:
    """List available evaluators from all registered sources."""
    from .evaluator.sources import _cache_dir, get_sources

    if refresh:
        import shutil

        # Drop the on-disk cache so the sources below start from scratch.
        cache_path = _cache_dir()
        if cache_path.exists():
            shutil.rmtree(cache_path, ignore_errors=True)

    selected_sources = get_sources()
    if source != "all":
        selected_sources = [src for src in selected_sources if src.source_name == source]

    # Transient progress text; the carriage returns blank it out afterwards.
    click.echo(" Fetching evaluators...", nl=False)
    found = asyncio.run(_collect_evaluators(selected_sources))
    click.echo("\r" + " " * 30 + "\r", nl=False)

    if not found:
        click.echo("No evaluators found.")
        return

    name_width = max(len(item.name) for item in found)
    source_width = max(len(item.source) for item in found)

    # The UPDATED column is only rendered when at least one entry has a
    # timestamp.
    show_updated = any(item.last_updated for item in found)
    updated_width = 10 if show_updated else 0

    try:
        terminal_columns = os.get_terminal_size().columns
    except OSError:
        # No terminal attached (e.g. piped output): assume a wide display.
        terminal_columns = 120

    reserved = name_width + source_width + 8
    if show_updated:
        reserved += updated_width + 2
    desc_width = max(20, terminal_columns - reserved)

    if show_updated:
        updated_header = f" {'UPDATED':<{updated_width}}"
        updated_rule = f" {'-' * updated_width}"
    else:
        updated_header = ""
        updated_rule = ""

    click.echo(f" {'NAME':<{name_width}} {'SOURCE':<{source_width}}{updated_header} DESCRIPTION")
    click.echo(f" {'-' * name_width} {'-' * source_width}{updated_rule} {'-' * min(40, desc_width)}")

    rows = list(found)
    rows.sort(key=lambda item: (item.source, item.name))
    for item in rows:
        language_tag = f" [{item.language}]" if item.language else ""
        description = item.description + language_tag
        if len(description) > desc_width:
            # Leave room for the ellipsis within the column width.
            description = description[: desc_width - 3] + "..."
        if show_updated:
            updated_cell = f" {_relative_time(item.last_updated):<{updated_width}}"
        else:
            updated_cell = ""
        click.echo(f" {item.name:<{name_width}} {item.source:<{source_width}}{updated_cell} {description}")

    click.echo(f"\n {len(found)} evaluator(s) found.")
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
async def _collect_evaluators(sources):
    """Query every source concurrently and return one combined evaluator list.

    A source whose listing raises is reported as a warning on stderr and
    skipped; results from the remaining sources are still returned.
    """
    from .evaluator.sources import EvaluatorInfo

    outcomes = await asyncio.gather(
        *(src.list_evaluators() for src in sources),
        return_exceptions=True,
    )

    collected: list[EvaluatorInfo] = []
    for outcome in outcomes:
        if not isinstance(outcome, BaseException):
            collected.extend(outcome)
            continue
        # return_exceptions=True hands failures back as values, not raises.
        click.echo(f" Warning: failed to fetch from a source: {outcome}", err=True)
    return collected
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
@evaluator.command("config")
@click.argument("name")
@click.option(
    "--path",
    "-p",
    "evaluator_path",
    default=None,
    help="Path to the evaluator script (used for local code evaluators).",
)
@click.option(
    "--threshold",
    "-t",
    type=float,
    default=None,
    help="Score threshold (default: 0.5 for custom evaluators).",
)
def evaluator_config(name: str, evaluator_path: str | None, threshold: float | None) -> None:
    """Generate an eval_config.yaml snippet for an evaluator."""
    import yaml as _yaml

    from .builtin_metrics import METRICS_NEEDING_EXPECTED, METRICS_NEEDING_GCP, METRICS_NEEDING_LLM
    from .evaluator.sources import get_sources

    known_evaluators = asyncio.run(_collect_evaluators(get_sources()))

    # First evaluator with a matching name wins; None means "not published by
    # any source", which is treated as a local code evaluator below.
    found = None
    for candidate in known_evaluators:
        if candidate.name == name:
            found = candidate
            break

    effective_threshold = 0.5 if threshold is None else threshold
    comment = "# Add to your eval_config.yaml under 'evaluators':"

    # Key insertion order matters: the YAML is dumped with sort_keys=False.
    if not found:
        entry = {
            "name": name,
            "type": "code",
            "path": evaluator_path or f"./{name}/{name}.py",
            "threshold": effective_threshold,
            "executor": "local",
        }
    elif found.source == "builtin":
        uses_llm_judge = name in METRICS_NEEDING_LLM
        entry = {"name": name, "type": "builtin", "threshold": effective_threshold}
        if uses_llm_judge:
            entry["judge_model"] = "gemini-2.5-flash"

        # Each requirement that applies becomes a bullet in the header comment.
        requirement_notes = (
            (name in METRICS_NEEDING_EXPECTED, "Requires --eval-set (golden eval set with expected responses)"),
            (uses_llm_judge, "Requires GOOGLE_API_KEY (or GEMINI_API_KEY) for LLM judge"),
            (name in METRICS_NEEDING_GCP, "Requires GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION (Vertex AI)"),
        )
        notes = [text for applies, text in requirement_notes if applies]
        if notes:
            comment += "\n#\n# Notes:\n" + "\n".join(f"# - {note}" for note in notes)
    else:
        entry = {
            "name": name,
            "type": "remote",
            "source": found.source,
            "ref": found.ref or f"evaluators/{name}",
            "threshold": effective_threshold,
            "executor": "local",
        }

    rendered = _yaml.dump({"evaluators": [entry]}, default_flow_style=False, sort_keys=False)
    click.echo(f"\n{comment}\n")
    click.echo(rendered)
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def _link_server_shutdown(*servers) -> None:
|
|
483
|
+
"""Link multiple uvicorn servers so a single SIGINT shuts down all of them.
|
|
484
|
+
|
|
485
|
+
Uvicorn installs per-server signal handlers; the last server's handler
|
|
486
|
+
overwrites earlier ones. This replaces handle_exit on every server with
|
|
487
|
+
a shared callback that sets should_exit / force_exit on all of them.
|
|
488
|
+
"""
|
|
489
|
+
import signal as _signal
|
|
490
|
+
|
|
491
|
+
def _shared_exit(sig, frame):
|
|
492
|
+
force = all(s.should_exit for s in servers)
|
|
493
|
+
for s in servers:
|
|
494
|
+
if force and sig == _signal.SIGINT:
|
|
495
|
+
s.force_exit = True
|
|
496
|
+
else:
|
|
497
|
+
s.should_exit = True
|
|
498
|
+
|
|
499
|
+
for s in servers:
|
|
500
|
+
s.handle_exit = _shared_exit
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
async def _run_servers(
    host: str,
    port: int,
    otlp_port: int,
    *,
    reload: bool = False,
    reload_dirs: list[str] | None = None,
    log_level: str = "warning",
) -> None:
    """Run the main API server and the OTLP HTTP server side by side.

    Both servers share the same host, reload and log-level settings and differ
    only in the application they serve and the port they bind. The coroutine
    returns once both ``serve()`` calls have finished.
    """
    import uvicorn

    common_options: dict = dict(host=host, reload=reload, log_level=log_level)
    if reload_dirs:
        # Only forwarded when non-empty, so uvicorn keeps its own default.
        common_options["reload_dirs"] = reload_dirs

    targets = (
        ("agentevals.api.app:app", port),
        ("agentevals.api.otlp_app:otlp_app", otlp_port),
    )
    servers = [
        uvicorn.Server(uvicorn.Config(app_path, port=bind_port, **common_options))
        for app_path, bind_port in targets
    ]

    # Share one exit handler so a single signal stops both servers.
    _link_server_shutdown(*servers)
    await asyncio.gather(*(server.serve() for server in servers))
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
@main.command("serve")
@click.option(
    "--dev",
    is_flag=True,
    help="Enable dev mode with WebSocket support for live streaming.",
)
@click.option(
    "--host",
    default="0.0.0.0",
    help="Host to bind the server to.",
)
@click.option(
    "--port",
    "-p",
    default=8001,
    help="Port to bind the server to.",
)
@click.option(
    "--otlp-port",
    default=4318,
    help="Port for OTLP HTTP receiver (default: 4318, standard OTLP HTTP port).",
)
@click.option(
    "--eval-sets",
    type=click.Path(exists=True),
    default=None,
    help="Directory containing eval set JSON files to pre-load.",
)
@click.option(
    "--headless",
    is_flag=True,
    help="Run in headless mode (no browser launch).",
)
@click.option(
    "-v",
    "--verbose",
    count=True,
    help="Increase verbosity (-v for INFO, -vv for DEBUG).",
)
def serve(dev: bool, host: str, port: int, otlp_port: int, eval_sets: str | None, headless: bool, verbose: int) -> None:
    """Start the agentevals API server.

    Use --dev to enable live streaming mode for agent development.
    """
    from pathlib import Path

    level = logging.WARNING
    if verbose == 1:
        level = logging.INFO
    elif verbose >= 2:
        level = logging.DEBUG
    logging.basicConfig(
        level=level,
        format="%(levelname)s %(name)s: %(message)s",
    )
    # The ``main`` group callback runs before this command and has already
    # called logging.basicConfig(), so the call above is a no-op there and
    # ``serve -v`` used to be silently ignored. Apply the requested level to
    # the root logger directly, but never make it quieter than a level already
    # chosen on the group (e.g. ``agentevals -vv serve``).
    root_logger = logging.getLogger()
    if verbose and root_logger.getEffectiveLevel() > level:
        root_logger.setLevel(level)

    # Settings are handed to the server apps through environment variables.
    if headless:
        os.environ["AGENTEVALS_HEADLESS"] = "1"

    # A bundled web UI is detected by the presence of _static/index.html next
    # to this module.
    static_dir = Path(__file__).parent / "_static"
    has_ui = static_dir.is_dir() and (static_dir / "index.html").exists()

    os.environ["AGENTEVALS_LIVE"] = "1"

    if dev:
        click.echo("agentevals dev server starting...")
        click.echo(f" OTLP HTTP: http://{host}:{otlp_port} (OTEL_EXPORTER_OTLP_ENDPOINT default)")
        click.echo(f" WebSocket: ws://{host}:{port}/ws/traces")
        click.echo(f" API: http://{host}:{port}/api")
        click.echo(" Web UI: http://localhost:5173")
        click.echo()

        # NOTE(review): eval_sets is only echoed here; nothing in this function
        # passes it on to the servers - confirm pre-loading is wired elsewhere.
        if eval_sets:
            click.echo(f" Eval sets: {eval_sets}")
            click.echo()

        click.echo("Waiting for agent connections...")
        click.echo()

        # Dev mode asks uvicorn to watch the directory containing the package.
        src_path = Path(__file__).parent.parent
        reload_dirs = [str(src_path)]
        asyncio.run(_run_servers(host, port, otlp_port, reload=True, reload_dirs=reload_dirs, log_level="info"))
    elif has_ui and not headless:
        click.echo(f"agentevals: http://{host}:{port}")
        click.echo(f" OTLP HTTP: http://{host}:{otlp_port}")
        click.echo()

        asyncio.run(_run_servers(host, port, otlp_port))
    else:
        click.echo(f"agentevals API: http://{host}:{port}/api")
        click.echo(f" OTLP HTTP: http://{host}:{otlp_port}")
        click.echo()

        asyncio.run(_run_servers(host, port, otlp_port))
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
@main.command("mcp")
@click.option(
    "--server-url",
    default=None,
    help="agentevals server URL for session tools (default: http://localhost:8001 or AGENTEVALS_SERVER_URL).",
)
def mcp_command(server_url: str | None) -> None:
    """Start the MCP server on stdio for use with Claude Code and other MCP clients."""
    try:
        from .mcp_server import create_server
    except ImportError:
        # The MCP server module could not be imported; tell the user which
        # extras to install and exit with a failure status.
        click.echo('MCP requires the live extras: pip install "agentevals[live]"', err=True)
        sys.exit(1)
    else:
        mcp_server = create_server(server_url=server_url)
        mcp_server.run("stdio")
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
# Allow running this module directly as a script: invoke the click group.
if __name__ == "__main__":
    main()
|