agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
agentevals/cli.py ADDED
@@ -0,0 +1,643 @@
1
+ """CLI entry point for agentevals.
2
+
3
+ Usage::
4
+
5
+ agentevals run samples/helm.json --eval-set samples/eval_set_helm.json
6
+ agentevals run samples/helm.json -m tool_trajectory_avg_score -m response_match_score
7
+ agentevals run samples/helm.json --eval-set samples/eval_set_helm.json --output json
8
+ agentevals list-metrics
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import logging
15
+ import os
16
+ import sys
17
+ from datetime import datetime, timezone
18
+
19
+ import click
20
+
21
+ from . import __version__
22
+
23
+
24
+ def _relative_time(iso_str: str | None) -> str:
25
+ """Format an ISO 8601 timestamp as a human-readable relative time string."""
26
+ if not iso_str:
27
+ return ""
28
+ try:
29
+ dt = datetime.fromisoformat(iso_str.replace("Z", "+00:00"))
30
+ delta = datetime.now(timezone.utc) - dt
31
+ seconds = int(delta.total_seconds())
32
+ if seconds < 0:
33
+ return "just now"
34
+ if seconds < 60:
35
+ return f"{seconds}s ago"
36
+ minutes = seconds // 60
37
+ if minutes < 60:
38
+ return f"{minutes}m ago"
39
+ hours = minutes // 60
40
+ if hours < 24:
41
+ return f"{hours}h ago"
42
+ days = hours // 24
43
+ if days < 30:
44
+ return f"{days}d ago"
45
+ months = days // 30
46
+ if months < 12:
47
+ return f"{months}mo ago"
48
+ years = days // 365
49
+ return f"{years}y ago"
50
+ except (ValueError, TypeError):
51
+ return ""
52
+
53
+
54
@click.group()
@click.version_option(version=__version__, prog_name="agentevals")
@click.option(
    "-v",
    "--verbose",
    count=True,
    help="Increase verbosity (-v for INFO, -vv for DEBUG).",
)
def main(verbose: int) -> None:
    """agentevals: Evaluate agent traces using ADK's scoring framework."""
    # Root command group: translate the repeatable -v flag into a root
    # logging level (none -> WARNING, -v -> INFO, -vv or more -> DEBUG).
    if verbose >= 2:
        log_level = logging.DEBUG
    elif verbose == 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING

    logging.basicConfig(
        format="%(levelname)s %(name)s: %(message)s",
        level=log_level,
    )
73
+
74
+
75
@main.command()
@click.argument("trace_files", nargs=-1, required=True, type=click.Path(exists=True))
@click.option(
    "--eval-set",
    "-e",
    type=click.Path(exists=True),
    default=None,
    help="Path to a golden eval set JSON file (ADK EvalSet format).",
)
@click.option(
    "--metric",
    "-m",
    multiple=True,
    default=None,
    help="Metric(s) to evaluate. Can be specified multiple times. Default: tool_trajectory_avg_score.",
)
@click.option(
    "--format",
    "-f",
    "trace_format",
    default="jaeger-json",
    help="Trace file format.",
)
@click.option(
    "--judge-model",
    "-j",
    default=None,
    help="LLM model for judge-based metrics (default: gemini-2.5-flash).",
)
@click.option(
    "--threshold",
    "-t",
    type=float,
    default=None,
    help="Score threshold for pass/fail.",
)
@click.option(
    "--output",
    "-o",
    type=click.Choice(["table", "json", "summary"]),
    default="table",
    help="Output format.",
)
@click.option(
    "--config",
    "-c",
    "config_file",
    type=click.Path(exists=True),
    default=None,
    help="Path to an eval config YAML file defining metrics (including custom).",
)
def run(
    trace_files: tuple[str, ...],
    eval_set: str | None,
    metric: tuple[str, ...] | None,
    trace_format: str,
    judge_model: str | None,
    threshold: float | None,
    output: str,
    config_file: str | None,
) -> None:
    """Evaluate trace file(s) against specified metrics."""
    from .config import EvalRunConfig
    from .output import format_results
    from .runner import run_evaluation

    requested_metrics = list(metric) if metric else []

    def _cli_config(metrics):
        # Every CLI-derived setting is identical in both paths; only the
        # metric list differs (the default is applied only without --config).
        return EvalRunConfig(
            trace_files=list(trace_files),
            eval_set_file=eval_set,
            metrics=metrics,
            trace_format=trace_format,
            judge_model=judge_model,
            threshold=threshold,
            output_format=output,
        )

    if not config_file:
        config = _cli_config(requested_metrics or ["tool_trajectory_avg_score"])
    else:
        from .eval_config_loader import load_eval_config, merge_configs

        # Load the YAML first, then combine it with the CLI values.
        file_config = load_eval_config(config_file)
        config = merge_configs(file_config, _cli_config(requested_metrics))

    result = asyncio.run(run_evaluation(config))
    click.echo(format_results(result, fmt=output))

    # Exit non-zero when any metric failed or errored, or the run itself
    # recorded errors.
    metric_failed = any(
        metric_result.eval_status == "FAILED" or metric_result.error
        for trace_result in result.trace_results
        for metric_result in trace_result.metric_results
    )
    if metric_failed or result.errors:
        sys.exit(1)
177
+
178
+
179
@main.command("list-metrics")
def list_metrics() -> None:
    """List all available evaluation metrics.

    DEPRECATED: use ``agentevals evaluator list --source builtin`` instead.
    """
    # The deprecation notice goes to stderr so stdout carries only the listing.
    click.echo(
        "Note: list-metrics is deprecated. Use 'agentevals evaluator list --source builtin' instead.\n",
        err=True,
    )
    try:
        from google.adk.evaluation.metric_evaluator_registry import (
            DEFAULT_METRIC_EVALUATOR_REGISTRY,
        )

        metrics = DEFAULT_METRIC_EVALUATOR_REGISTRY.get_registered_metrics()
        click.echo("Available metrics:\n")
        for m in metrics:
            desc = m.description or "No description"
            click.echo(f" {m.metric_name}")
            click.echo(f" {desc}")
            if m.metric_value_info and m.metric_value_info.interval:
                iv = m.metric_value_info.interval
                # Interval notation: "(" / ")" mark an open bound, "[" / "]" a closed one.
                lo = f"{'(' if iv.open_at_min else '['}{iv.min_value}"
                hi = f"{iv.max_value}{')' if iv.open_at_max else ']'}"
                click.echo(f" Value range: {lo}, {hi}")
            click.echo()
    except ImportError as exc:
        click.echo(
            f"Could not load full metric registry ({exc}).\n"
            "Some eval dependencies may be missing. Install with:\n"
            ' pip install "google-adk[eval]"\n'
        )
        # The fallback listing also comes from google.adk, so this import can
        # fail as well; report that cleanly instead of leaking a traceback
        # from inside the exception handler.
        try:
            from google.adk.evaluation.eval_metrics import PrebuiltMetrics
        except ImportError as fallback_exc:
            raise click.ClickException(
                f"Could not load the built-in metric names either ({fallback_exc})."
            ) from fallback_exc

        click.echo("Known built-in metrics:\n")
        for pm in PrebuiltMetrics:
            click.echo(f" {pm.value}")
        click.echo()
218
+
219
+
220
+ # ---------------------------------------------------------------------------
221
+ # agentevals evaluator ...
222
+ # ---------------------------------------------------------------------------
223
+
224
+
225
@main.group(name="evaluator")
def evaluator() -> None:
    """Manage evaluators: scaffold, list, and discover."""
    # Namespace-only group; subcommands attach via @evaluator.command(...).
228
+
229
+
230
@evaluator.command("init")
@click.argument("name")
@click.option(
    "--output-dir",
    "-o",
    type=click.Path(),
    default=".",
    help="Parent directory for the new evaluator folder (default: current directory).",
)
@click.option(
    "--runtime",
    "-r",
    default=None,
    help="Language runtime: py, js, ts (default: inferred from name or py).",
)
def evaluator_init(name: str, output_dir: str, runtime: str | None) -> None:
    """Scaffold a new evaluator with boilerplate code and an evaluator.yaml manifest.

    NAME is the evaluator name. If it ends with a recognized extension (.py, .js,
    .ts) the language is inferred automatically; otherwise use --runtime.

    \b
    Examples:
    agentevals evaluator init my_evaluator
    agentevals evaluator init my_evaluator.ts
    agentevals evaluator init my_evaluator --runtime js
    """
    from pathlib import Path as _Path

    from .evaluator.templates import scaffold_evaluator

    try:
        evaluator_dir = scaffold_evaluator(name, output_dir=_Path(output_dir), runtime=runtime)
    except (ValueError, OSError) as exc:
        # Surface scaffolding problems as a normal CLI error, not a traceback.
        raise click.ClickException(str(exc)) from exc

    # List the directory once, sorted, so both the file listing and the code
    # file chosen for the YAML snippet are deterministic. iterdir() order is
    # arbitrary.
    generated = sorted(evaluator_dir.iterdir())
    parent = evaluator_dir.parent

    click.echo(f"Created evaluator in {evaluator_dir}/")
    click.echo()
    click.echo("Files:")
    for f in generated:
        click.echo(f" {f.relative_to(parent)}")
    click.echo()
    click.echo("Next steps:")
    click.echo(" 1. Implement your scoring logic in the generated code file")
    click.echo(" 2. Add it to your eval_config.yaml under 'evaluators':")
    click.echo()

    code_file = next((f for f in generated if f.suffix in (".py", ".js", ".ts")), None)
    evaluator_name = evaluator_dir.name
    if code_file is not None:
        rel = code_file.relative_to(parent)
        click.echo(" evaluators:")
        click.echo(f" - name: {evaluator_name}")
        click.echo(" type: code")
        click.echo(f" path: ./{rel}")
        click.echo(" threshold: 0.5")
        click.echo()
    click.echo(" 3. Run: agentevals run <trace_file> --config eval_config.yaml")
288
+
289
+
290
@evaluator.command("runtimes")
def evaluator_runtimes() -> None:
    """Show supported language runtimes and execution environments."""
    from .custom_evaluators import _EXECUTOR_FACTORIES, get_runtimes

    # Section 1: one row per language runtime with its extensions and
    # whether is_available() reports it as usable.
    click.echo("Language runtimes:\n")
    for runtime in get_runtimes():
        extensions = ", ".join(runtime.extensions)
        status = "not found"
        if runtime.is_available():
            status = "available"
        click.echo(f" {runtime.name:<12} extensions: {extensions:<16} ({status})")

    # Section 2: registered executor names, alphabetically.
    click.echo("\nExecutors:\n")
    for executor_name in sorted(_EXECUTOR_FACTORIES):
        click.echo(f" {executor_name}")

    click.echo()
306
+
307
+
308
@evaluator.command("list")
@click.option(
    "--source",
    "-s",
    type=click.Choice(["all", "builtin", "github"]),
    default="all",
    help="Filter evaluators by source (default: all).",
)
@click.option(
    "--refresh",
    is_flag=True,
    default=False,
    help="Ignore cached results and fetch fresh data.",
)
def evaluator_list(source: str, refresh: bool) -> None:
    """List available evaluators from all registered sources."""
    from .evaluator.sources import _cache_dir, get_sources

    # --refresh removes the cache directory before fetching.
    if refresh:
        import shutil

        cache_path = _cache_dir()
        if cache_path.exists():
            shutil.rmtree(cache_path, ignore_errors=True)

    selected_sources = get_sources()
    if source != "all":
        selected_sources = [src for src in selected_sources if src.source_name == source]

    # Transient progress text, overwritten in place once the fetch returns.
    click.echo(" Fetching evaluators...", nl=False)
    found = asyncio.run(_collect_evaluators(selected_sources))
    click.echo("\r" + " " * 30 + "\r", nl=False)

    if not found:
        click.echo("No evaluators found.")
        return

    # Column widths come from the data; the UPDATED column only exists when
    # at least one evaluator reports a last_updated value.
    name_width = max(len(item.name) for item in found)
    source_width = max(len(item.source) for item in found)
    show_updated = any(item.last_updated for item in found)
    updated_width = 10 if show_updated else 0

    try:
        columns = os.get_terminal_size().columns
    except OSError:
        # No terminal on stdout (e.g. piped output): use a fixed width.
        columns = 120

    reserved = name_width + source_width + 8
    if show_updated:
        reserved += updated_width + 2
    desc_width = max(20, columns - reserved)

    if show_updated:
        header_updated = f" {'UPDATED':<{updated_width}}"
        rule_updated = f" {'-' * updated_width}"
    else:
        header_updated = ""
        rule_updated = ""

    click.echo(f" {'NAME':<{name_width}} {'SOURCE':<{source_width}}{header_updated} DESCRIPTION")
    click.echo(f" {'-' * name_width} {'-' * source_width}{rule_updated} {'-' * min(40, desc_width)}")

    for item in sorted(found, key=lambda ev: (ev.source, ev.name)):
        language_tag = f" [{item.language}]" if item.language else ""
        description = item.description + language_tag
        if len(description) > desc_width:
            # Truncate with an ellipsis so the row stays within the width.
            description = description[: desc_width - 3] + "..."
        if show_updated:
            updated_cell = f" {_relative_time(item.last_updated):<{updated_width}}"
        else:
            updated_cell = ""
        click.echo(f" {item.name:<{name_width}} {item.source:<{source_width}}{updated_cell} {description}")

    click.echo(f"\n {len(found)} evaluator(s) found.")
376
+
377
+
378
+ async def _collect_evaluators(sources):
379
+ """Gather evaluator lists from all sources concurrently."""
380
+ import asyncio as _asyncio
381
+
382
+ from .evaluator.sources import EvaluatorInfo
383
+
384
+ results: list[EvaluatorInfo] = []
385
+ tasks = [s.list_evaluators() for s in sources]
386
+ for evaluators in await _asyncio.gather(*tasks, return_exceptions=True):
387
+ if isinstance(evaluators, BaseException):
388
+ click.echo(f" Warning: failed to fetch from a source: {evaluators}", err=True)
389
+ continue
390
+ results.extend(evaluators)
391
+ return results
392
+
393
+
394
@evaluator.command("config")
@click.argument("name")
@click.option(
    "--path",
    "-p",
    "evaluator_path",
    default=None,
    help="Path to the evaluator script (used for local code evaluators).",
)
@click.option(
    "--threshold",
    "-t",
    type=float,
    default=None,
    help="Score threshold (default: 0.5 for custom evaluators).",
)
def evaluator_config(name: str, evaluator_path: str | None, threshold: float | None) -> None:
    """Generate an eval_config.yaml snippet for an evaluator."""
    import yaml as _yaml

    from .builtin_metrics import METRICS_NEEDING_EXPECTED, METRICS_NEEDING_GCP, METRICS_NEEDING_LLM
    from .evaluator.sources import get_sources

    known = asyncio.run(_collect_evaluators(get_sources()))
    found = next((candidate for candidate in known if candidate.name == name), None)

    # Every branch falls back to 0.5 when no threshold is given.
    effective_threshold = 0.5 if threshold is None else threshold
    comment = "# Add to your eval_config.yaml under 'evaluators':"

    if not found:
        # Unknown name: treat it as a local code evaluator.
        entry: dict = {
            "name": name,
            "type": "code",
            "path": evaluator_path or f"./{name}/{name}.py",
            "threshold": effective_threshold,
            "executor": "local",
        }
    elif found.source == "builtin":
        uses_eval_set = name in METRICS_NEEDING_EXPECTED
        uses_llm = name in METRICS_NEEDING_LLM
        uses_gcp = name in METRICS_NEEDING_GCP

        entry = {"name": name, "type": "builtin", "threshold": effective_threshold}
        if uses_llm:
            entry["judge_model"] = "gemini-2.5-flash"

        # Collect the prerequisite hints that apply to this metric.
        hints = (
            (uses_eval_set, "Requires --eval-set (golden eval set with expected responses)"),
            (uses_llm, "Requires GOOGLE_API_KEY (or GEMINI_API_KEY) for LLM judge"),
            (uses_gcp, "Requires GOOGLE_CLOUD_PROJECT and GOOGLE_CLOUD_LOCATION (Vertex AI)"),
        )
        notes = [text for applies, text in hints if applies]
        if notes:
            comment += "\n#\n# Notes:\n" + "\n".join(f"# - {n}" for n in notes)
    else:
        # Known evaluator from a non-builtin source.
        entry = {
            "name": name,
            "type": "remote",
            "source": found.source,
            "ref": found.ref or f"evaluators/{name}",
            "threshold": effective_threshold,
            "executor": "local",
        }

    # sort_keys=False keeps the keys in the insertion order used above.
    rendered = _yaml.dump({"evaluators": [entry]}, default_flow_style=False, sort_keys=False)
    click.echo(f"\n{comment}\n")
    click.echo(rendered)
480
+
481
+
482
+ def _link_server_shutdown(*servers) -> None:
483
+ """Link multiple uvicorn servers so a single SIGINT shuts down all of them.
484
+
485
+ Uvicorn installs per-server signal handlers; the last server's handler
486
+ overwrites earlier ones. This replaces handle_exit on every server with
487
+ a shared callback that sets should_exit / force_exit on all of them.
488
+ """
489
+ import signal as _signal
490
+
491
+ def _shared_exit(sig, frame):
492
+ force = all(s.should_exit for s in servers)
493
+ for s in servers:
494
+ if force and sig == _signal.SIGINT:
495
+ s.force_exit = True
496
+ else:
497
+ s.should_exit = True
498
+
499
+ for s in servers:
500
+ s.handle_exit = _shared_exit
501
+
502
+
503
async def _run_servers(
    host: str,
    port: int,
    otlp_port: int,
    *,
    reload: bool = False,
    reload_dirs: list[str] | None = None,
    log_level: str = "warning",
) -> None:
    """Start the main API and OTLP HTTP servers."""
    import uvicorn

    # Settings common to both servers; reload_dirs is only passed when set.
    # NOTE(review): reload/reload_dirs are handed to uvicorn.Config only.
    # Confirm that Server.serve() honours them when not started via
    # uvicorn.run().
    common: dict = {"host": host, "reload": reload, "log_level": log_level}
    if reload_dirs:
        common["reload_dirs"] = reload_dirs

    targets = (
        ("agentevals.api.app:app", port),
        ("agentevals.api.otlp_app:otlp_app", otlp_port),
    )
    main_server, otlp_server = (
        uvicorn.Server(uvicorn.Config(app_path, port=bind_port, **common))
        for app_path, bind_port in targets
    )

    # Share one exit handler so a single signal stops both servers, then run
    # them concurrently until both have finished.
    _link_server_shutdown(main_server, otlp_server)
    await asyncio.gather(main_server.serve(), otlp_server.serve())
527
+
528
+
529
@main.command("serve")
@click.option(
    "--dev",
    is_flag=True,
    help="Enable dev mode with WebSocket support for live streaming.",
)
@click.option(
    "--host",
    default="0.0.0.0",
    help="Host to bind the server to.",
)
@click.option(
    "--port",
    "-p",
    default=8001,
    help="Port to bind the server to.",
)
@click.option(
    "--otlp-port",
    default=4318,
    help="Port for OTLP HTTP receiver (default: 4318, standard OTLP HTTP port).",
)
@click.option(
    "--eval-sets",
    type=click.Path(exists=True),
    default=None,
    help="Directory containing eval set JSON files to pre-load.",
)
@click.option(
    "--headless",
    is_flag=True,
    help="Run in headless mode (no browser launch).",
)
@click.option(
    "-v",
    "--verbose",
    count=True,
    help="Increase verbosity (-v for INFO, -vv for DEBUG).",
)
def serve(dev: bool, host: str, port: int, otlp_port: int, eval_sets: str | None, headless: bool, verbose: int) -> None:
    """Start the agentevals API server.

    Use --dev to enable live streaming mode for agent development.
    """
    from pathlib import Path

    level = logging.WARNING
    if verbose == 1:
        level = logging.INFO
    elif verbose >= 2:
        level = logging.DEBUG
    logging.basicConfig(
        level=level,
        format="%(levelname)s %(name)s: %(message)s",
    )
    # basicConfig() does nothing once the root logger has handlers, which is
    # the case whenever this command runs under the ``main`` group (its
    # callback configures logging first). Apply this command's own -v
    # explicitly, and only ever towards more verbose output so a group-level
    # -vv is not undone.
    root_logger = logging.getLogger()
    if verbose and root_logger.level > level:
        root_logger.setLevel(level)

    # The flags are published as environment variables; the code that reads
    # them is outside this function.
    if headless:
        os.environ["AGENTEVALS_HEADLESS"] = "1"

    static_dir = Path(__file__).parent / "_static"
    has_ui = static_dir.is_dir() and (static_dir / "index.html").exists()

    os.environ["AGENTEVALS_LIVE"] = "1"

    if dev:
        click.echo("agentevals dev server starting...")
        click.echo(f" OTLP HTTP: http://{host}:{otlp_port} (OTEL_EXPORTER_OTLP_ENDPOINT default)")
        click.echo(f" WebSocket: ws://{host}:{port}/ws/traces")
        click.echo(f" API: http://{host}:{port}/api")
        click.echo(" Web UI: http://localhost:5173")
        click.echo()

        # NOTE(review): eval_sets is only echoed here; nothing in this
        # function forwards it. Confirm it is consumed elsewhere.
        if eval_sets:
            click.echo(f" Eval sets: {eval_sets}")
            click.echo()

        click.echo("Waiting for agent connections...")
        click.echo()

        # Dev mode requests auto-reload over the package's parent directory.
        src_path = Path(__file__).parent.parent
        reload_dirs = [str(src_path)]
        asyncio.run(_run_servers(host, port, otlp_port, reload=True, reload_dirs=reload_dirs, log_level="info"))
        return

    # Non-dev modes start the servers identically; only the banner differs,
    # depending on whether a bundled UI exists and is not suppressed.
    if has_ui and not headless:
        click.echo(f"agentevals: http://{host}:{port}")
    else:
        click.echo(f"agentevals API: http://{host}:{port}/api")
    click.echo(f" OTLP HTTP: http://{host}:{otlp_port}")
    click.echo()

    asyncio.run(_run_servers(host, port, otlp_port))
623
+
624
+
625
@main.command("mcp")
@click.option(
    "--server-url",
    default=None,
    help="agentevals server URL for session tools (default: http://localhost:8001 or AGENTEVALS_SERVER_URL).",
)
def mcp_command(server_url: str | None) -> None:
    """Start the MCP server on stdio for use with Claude Code and other MCP clients."""
    # The MCP server module is imported lazily; if that import fails, print
    # an install hint and exit with status 1.
    try:
        from .mcp_server import create_server
    except ImportError:
        click.echo('MCP requires the live extras: pip install "agentevals[live]"', err=True)
        sys.exit(1)
    else:
        mcp_server = create_server(server_url=server_url)
        mcp_server.run("stdio")
640
+
641
+
642
+ if __name__ == "__main__":
643
+ main()