agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,468 @@
1
+ """Custom evaluators that run evaluators via pluggable backends.
2
+
3
+ Every backend implements the same protocol: accept :class:`EvalInput` (JSON)
4
+ and return :class:`EvalResult` (JSON). The transport varies — local
5
+ subprocess, HTTP, Docker container, etc.
6
+
7
+ The protocol types live in :mod:`agentevals._protocol` (CLI-internal) and are
8
+ JSON-wire-compatible with the types in the ``agentevals-evaluator-sdk`` package.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import abc
14
+ import asyncio
15
+ import logging
16
+ import shutil
17
+ import sys
18
+ from collections.abc import Callable
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ from google.adk.evaluation.eval_case import Invocation, get_all_tool_calls
23
+ from google.adk.evaluation.evaluator import EvalStatus, EvaluationResult, Evaluator, PerInvocationResult
24
+
25
+ from agentevals._protocol import (
26
+ EvalInput,
27
+ EvalResult,
28
+ InvocationData,
29
+ ToolCallData,
30
+ ToolResponseData,
31
+ )
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # EvaluatorBackend — primary abstraction
38
+ # ---------------------------------------------------------------------------
39
+
40
+
41
class EvaluatorBackend(abc.ABC):
    """Transport-agnostic contract for executing a custom evaluator.

    A backend takes an :class:`EvalInput`, hands it to the evaluator over
    whatever transport the subclass implements (subprocess, HTTP, Docker, …),
    and produces the resulting :class:`EvalResult`.
    """

    @abc.abstractmethod
    async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
        """Run the evaluator against *eval_input* and return what it reports.

        *metric_name* is the name of the evaluator being run.
        """
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # Runtime — language-specific helpers for SubprocessBackend
54
+ # ---------------------------------------------------------------------------
55
+
56
+
57
class Runtime(abc.ABC):
    """Describes how to launch evaluator files written for one language runtime."""

    @property
    @abc.abstractmethod
    def name(self) -> str:
        """Display name of the runtime, e.g. ``"Python"``."""

    @property
    @abc.abstractmethod
    def extensions(self) -> tuple[str, ...]:
        """File suffixes (with leading dot) served by this runtime, e.g. ``(".py",)``."""

    @abc.abstractmethod
    def build_command(self, path: Path) -> list[str]:
        """Build the argv list that executes the file at *path*."""

    def is_available(self) -> bool:
        """Report whether a command can be built for this runtime.

        The probe builds a command for a placeholder path; a ``RuntimeError``
        from :meth:`build_command` is taken to mean the runtime is missing.
        """
        try:
            self.build_command(Path("__probe__"))
        except RuntimeError:
            return False
        return True
81
+
82
+
83
class PythonRuntime(Runtime):
    """Runs ``.py`` evaluators with the interpreter executing this process."""

    @property
    def name(self) -> str:
        return "Python"

    @property
    def extensions(self) -> tuple[str, ...]:
        return (".py",)

    def build_command(self, path: Path) -> list[str]:
        """Return ``[sys.executable, <path>]``."""
        script = str(path)
        return [sys.executable, script]

    def is_available(self) -> bool:
        """Always ``True``; no probing is performed for this runtime."""
        return True
97
+
98
+
99
class NodeRuntime(Runtime):
    """Runs ``.js`` / ``.ts`` evaluators with the ``node`` binary found on ``PATH``."""

    @property
    def name(self) -> str:
        return "Node.js"

    @property
    def extensions(self) -> tuple[str, ...]:
        return (".js", ".ts")

    def build_command(self, path: Path) -> list[str]:
        """Return ``[<node>, <path>]``.

        Raises:
            RuntimeError: ``node`` cannot be located on ``PATH``.
        """
        node_binary = shutil.which("node")
        if node_binary:
            return [node_binary, str(path)]
        raise RuntimeError("Node.js not found on PATH (required for .js/.ts evaluators)")
113
+
114
+
115
# Runtime registry. ``_resolve_runtime`` scans it in order and returns the
# first entry whose ``extensions`` contains the file's suffix.
_RUNTIMES: list[Runtime] = [PythonRuntime(), NodeRuntime()]
119
+
120
+
121
def get_runtimes() -> list[Runtime]:
    """Return a new list holding every registered runtime.

    The returned list is a copy, so mutating it leaves the registry untouched.
    """
    return [*_RUNTIMES]
124
+
125
+
126
def supported_extensions() -> set[str]:
    """Return the union of the file extensions of all registered runtimes."""
    return {ext for runtime in _RUNTIMES for ext in runtime.extensions}
132
+
133
+
134
def _resolve_runtime(path: Path) -> Runtime:
    """Pick the first registered runtime that handles *path*'s suffix.

    The suffix is compared case-insensitively (it is lower-cased first).

    Raises:
        ValueError: no registered runtime lists the suffix.
    """
    suffix = path.suffix.lower()
    match = next((runtime for runtime in _RUNTIMES if suffix in runtime.extensions), None)
    if match is None:
        raise ValueError(f"No runtime registered for extension '{suffix}'. Supported: {sorted(supported_extensions())}")
    return match
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # Subprocess runner (used by SubprocessBackend)
145
+ # ---------------------------------------------------------------------------
146
+
147
+
148
async def _run_subprocess(
    cmd: list[str],
    input_json: str,
    timeout: int,
    metric_name: str,
) -> EvalResult:
    """Run *cmd*, pipe *input_json* to its stdin, and parse its stdout as an ``EvalResult``.

    Args:
        cmd: argv list of the evaluator process.
        input_json: JSON text written to the child's stdin.
        timeout: seconds to wait for the process to finish.
        metric_name: evaluator name; used only in log and error messages.

    Returns:
        The ``EvalResult`` validated from the child's stdout.

    Raises:
        TimeoutError: the process did not finish within *timeout* seconds.
        RuntimeError: the process exited non-zero, wrote nothing to stdout,
            or wrote output that fails ``EvalResult`` validation.
    """
    logger.info("Running custom evaluator %r: %s", metric_name, " ".join(cmd))

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    try:
        stdout_bytes, stderr_bytes = await asyncio.wait_for(
            proc.communicate(input=input_json.encode()),
            timeout=timeout,
        )
    except asyncio.TimeoutError as exc:
        # asyncio.TimeoutError is an alias of the builtin TimeoutError only on
        # Python 3.11+; catching the asyncio name works on every version.
        raise TimeoutError(f"Custom evaluator '{metric_name}' timed out after {timeout}s") from exc
    finally:
        # returncode is still None when communicate() was interrupted (timeout
        # or cancellation of the awaiting task); never leave the child orphaned.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # the process exited between the check and the kill
            await proc.wait()

    stderr_text = stderr_bytes.decode(errors="replace").strip()
    if stderr_text:
        logger.debug("Custom evaluator %r stderr:\n%s", metric_name, stderr_text)

    if proc.returncode != 0:
        raise RuntimeError(
            f"Custom evaluator '{metric_name}' exited with code {proc.returncode}"
            + (f": {stderr_text}" if stderr_text else "")
        )

    stdout_text = stdout_bytes.decode().strip()
    if not stdout_text:
        # Surface stderr in the error: it usually explains the missing output.
        hint = ""
        if stderr_text:
            hint = f"\nEvaluator stderr:\n{stderr_text}"
        raise RuntimeError(f"Custom evaluator '{metric_name}' produced no output on stdout" + hint)

    try:
        return EvalResult.model_validate_json(stdout_text)
    except Exception as exc:
        raise RuntimeError(f"Custom evaluator '{metric_name}' produced invalid JSON: {exc}") from exc
195
+
196
+
197
+ # ---------------------------------------------------------------------------
198
+ # Backend implementations
199
+ # ---------------------------------------------------------------------------
200
+
201
+
202
class SubprocessBackend(EvaluatorBackend):
    """Backend that executes a local evaluator file (.py, .js, .ts, …) as a child process.

    The interpreter is chosen from the file's extension through the
    :data:`_RUNTIMES` registry.
    """

    def __init__(self, path: Path, timeout: int = 30):
        """Resolve *path*, pick its runtime, and verify that the file exists.

        Raises:
            ValueError: no runtime is registered for the file's extension.
            FileNotFoundError: the resolved path does not exist.
        """
        resolved = path.resolve()
        self._path = resolved
        # Runtime lookup happens before the existence check, so an unsupported
        # extension is reported even for a missing file.
        self._runtime = _resolve_runtime(resolved)
        self._timeout = timeout

        if not resolved.exists():
            raise FileNotFoundError(f"Evaluator file not found: {resolved}")

    async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
        """Serialize *eval_input* to JSON and run the evaluator file with it."""
        argv = self._runtime.build_command(self._path)
        payload = eval_input.model_dump_json()
        return await _run_subprocess(argv, payload, self._timeout, metric_name)
220
+
221
+
222
+ # ---------------------------------------------------------------------------
223
+ # Executor factory
224
+ # ---------------------------------------------------------------------------
225
+
226
# Executor name -> factory taking ``(path, timeout)`` and returning a backend.
# Extended at runtime through ``register_executor``.
_EXECUTOR_FACTORIES: dict[str, Callable[[Path, int], EvaluatorBackend]] = {
    "local": lambda evaluator_path, timeout_s: SubprocessBackend(evaluator_path, timeout_s),
}
229
+
230
+
231
def create_executor(executor_name: str, path: Path, timeout: int = 30) -> EvaluatorBackend:
    """Build the backend registered under *executor_name* (e.g. 'local', 'docker').

    Raises:
        ValueError: no factory is registered under *executor_name*.
    """
    factory = _EXECUTOR_FACTORIES.get(executor_name)
    if factory is not None:
        return factory(path, timeout)
    raise ValueError(f"Unknown executor '{executor_name}'. Available: {sorted(_EXECUTOR_FACTORIES.keys())}")
237
+
238
+
239
def register_executor(name: str, factory: Callable[[Path, int], EvaluatorBackend]) -> None:
    """Add (or replace) the executor factory stored under *name*, e.g. for Docker support."""
    _EXECUTOR_FACTORIES.update({name: factory})
242
+
243
+
244
+ # ---------------------------------------------------------------------------
245
+ # ADK Invocation ↔ InvocationData conversion
246
+ # ---------------------------------------------------------------------------
247
+
248
+
249
def _content_to_text(content) -> str:
    """Flatten an ADK Content object into plain text.

    ``None`` yields ``""`` and a ``str`` is returned unchanged. Otherwise the
    non-empty ``text`` of each entry in ``content.parts`` is joined with single
    spaces; objects without (non-empty) ``parts`` yield ``""``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content

    parts = getattr(content, "parts", None)
    if not parts:
        return ""

    fragments = (getattr(part, "text", None) for part in parts)
    return " ".join(fragment for fragment in fragments if fragment)
262
+
263
+
264
def _extract_tool_calls_from_invocation(inv: Invocation) -> list[ToolCallData]:
    """Collect the tool calls recorded in *inv*'s ``intermediate_data``.

    Extraction is best-effort: if reading the tool calls raises, the failure
    is logged and whatever was collected up to that point is returned, so a
    malformed invocation never aborts the whole evaluation.

    Returns:
        One ``ToolCallData`` per tool call; empty when the invocation has no
        intermediate data. Missing names become ``""`` and missing args ``{}``.
    """
    calls: list[ToolCallData] = []
    if not inv.intermediate_data:
        return calls

    try:
        for tc in get_all_tool_calls(inv.intermediate_data):
            calls.append(ToolCallData(name=tc.name or "", args=tc.args or {}))
    except Exception:
        # Deliberately not re-raised (see docstring), but no longer silent:
        # a dropped tool call would otherwise skew scores with no trace.
        logger.warning(
            "Failed to extract tool calls from invocation; continuing with %d call(s)",
            len(calls),
            exc_info=True,
        )

    return calls
278
+
279
+
280
def _extract_tool_responses_from_invocation(inv: Invocation) -> list[ToolResponseData]:
    """Collect tool responses from *inv*'s ``intermediate_data``.

    Each entry's ``name`` is used when present (``""`` otherwise). The output
    is the string form of ``response`` if the entry has that attribute, else
    of ``output``; falsy values become ``""``.
    """
    intermediate = inv.intermediate_data
    if not intermediate or not hasattr(intermediate, "tool_responses"):
        return []

    collected: list[ToolResponseData] = []
    for entry in intermediate.tool_responses or []:
        name = (entry.name or "") if hasattr(entry, "name") else ""

        # ``response`` takes precedence whenever the attribute exists, even if
        # its value is empty; ``output`` is only consulted when it is absent.
        if hasattr(entry, "response"):
            raw = entry.response
        elif hasattr(entry, "output"):
            raw = entry.output
        else:
            raw = None
        output = str(raw) if raw else ""

        collected.append(ToolResponseData(name=name, output=output))

    return collected
299
+
300
+
301
def invocation_to_data(inv: Invocation) -> InvocationData:
    """Build the protocol-level ``InvocationData`` for one ADK Invocation.

    A missing invocation id becomes ``""``; a final response that yields no
    text becomes ``None``.
    """
    invocation_id = inv.invocation_id or ""
    user_text = _content_to_text(inv.user_content)
    final_text = _content_to_text(inv.final_response) or None
    tool_calls = _extract_tool_calls_from_invocation(inv)
    tool_responses = _extract_tool_responses_from_invocation(inv)
    return InvocationData(
        invocation_id=invocation_id,
        user_content=user_text,
        final_response=final_text,
        tool_calls=tool_calls,
        tool_responses=tool_responses,
    )
310
+
311
+
312
+ def invocations_to_data(invocations: list[Invocation] | None) -> list[InvocationData] | None:
313
+ """Convert a list of ADK Invocations, or return None."""
314
+ if invocations is None:
315
+ return None
316
+ return [invocation_to_data(inv) for inv in invocations]
317
+
318
+
319
+ # ---------------------------------------------------------------------------
320
+ # EvalResult → EvaluationResult conversion
321
+ # ---------------------------------------------------------------------------
322
+
323
+
324
def _eval_result_to_evaluation_result(
    result: EvalResult,
    threshold: float,
    actual_invocations: list[Invocation],
) -> EvaluationResult:
    """Translate a protocol ``EvalResult`` into an ADK ``EvaluationResult``.

    A status reported by the evaluator wins (matched case-insensitively;
    unrecognized values map to ``NOT_EVALUATED``). Without one, the status is
    ``PASSED`` when ``result.score >= threshold`` and ``FAILED`` otherwise.
    Every per-invocation result carries that same overall status; an
    invocation with no corresponding per-invocation score gets ``None``.
    """
    if result.status:
        known_statuses = {
            "PASSED": EvalStatus.PASSED,
            "FAILED": EvalStatus.FAILED,
            "NOT_EVALUATED": EvalStatus.NOT_EVALUATED,
        }
        overall_status = known_statuses.get(result.status.upper(), EvalStatus.NOT_EVALUATED)
    elif result.score >= threshold:
        overall_status = EvalStatus.PASSED
    else:
        overall_status = EvalStatus.FAILED

    def _score_at(index: int):
        # Scores are positional; invocations past the end of the list get None.
        if index < len(result.per_invocation_scores):
            return result.per_invocation_scores[index]
        return None

    per_inv_results: list[PerInvocationResult] = [
        PerInvocationResult(
            actual_invocation=inv,
            score=_score_at(index),
            eval_status=overall_status,
        )
        for index, inv in enumerate(actual_invocations)
    ]

    return EvaluationResult(
        overall_score=result.score,
        overall_eval_status=overall_status,
        per_invocation_results=per_inv_results,
    )
356
+
357
+
358
+ # ---------------------------------------------------------------------------
359
+ # CustomEvaluatorRunner — ADK Evaluator adapter (backend-agnostic)
360
+ # ---------------------------------------------------------------------------
361
+
362
+
363
class CustomEvaluatorRunner(Evaluator):
    """ADK :class:`Evaluator` adapter around any :class:`EvaluatorBackend`.

    Converts ADK ``Invocation`` objects into the language-agnostic
    ``EvalInput`` payload, delegates execution to the backend, and maps the
    returned ``EvalResult`` back to an ADK ``EvaluationResult``.
    """

    def __init__(
        self,
        backend: EvaluatorBackend,
        metric_name: str,
        threshold: float = 0.5,
        config: dict[str, Any] | None = None,
    ):
        self._backend = backend
        self._metric_name = metric_name
        self._threshold = threshold
        # A missing (or empty) config is normalized to a fresh empty dict.
        self._config = config if config else {}

    async def evaluate_invocations(
        self,
        actual_invocations: list[Invocation],
        expected_invocations: list[Invocation] | None = None,
        conversation_scenario=None,
    ) -> EvaluationResult:
        """Run the backend on the given invocations.

        *conversation_scenario* is accepted but not used by this
        implementation.
        """
        actual_data = invocations_to_data(actual_invocations) or []
        expected_data = invocations_to_data(expected_invocations)

        payload = EvalInput(
            metric_name=self._metric_name,
            threshold=self._threshold,
            config=self._config,
            invocations=actual_data,
            expected_invocations=expected_data,
        )

        outcome = await self._backend.run(payload, self._metric_name)
        return _eval_result_to_evaluation_result(outcome, self._threshold, actual_invocations)
399
+
400
+
401
+ # ---------------------------------------------------------------------------
402
+ # Public helper — build and run a custom evaluator from a config definition
403
+ # ---------------------------------------------------------------------------
404
+
405
+
406
async def evaluate_custom_evaluator(
    evaluator_def,
    actual_invocations: list[Invocation],
    expected_invocations: list[Invocation] | None,
):
    """Evaluate a single custom evaluator and return a ``MetricResult``.

    This is the entry point called by the runner. A ``RemoteEvaluatorDef`` is
    first resolved through the default resolver; the resulting
    ``CodeEvaluatorDef`` is turned into a backend, wrapped in a
    ``CustomEvaluatorRunner``, and run.

    Failures while *running* the evaluator are logged and reported through
    ``MetricResult.error``. Failures while resolving the definition or
    constructing the backend propagate to the caller.

    Raises:
        ValueError: the (resolved) definition is not a ``CodeEvaluatorDef``.
    """
    # Imported lazily, as in the rest of this function's original design
    # (presumably to avoid import cycles — confirm before hoisting).
    from .config import CodeEvaluatorDef, RemoteEvaluatorDef
    from .runner import MetricResult

    if isinstance(evaluator_def, RemoteEvaluatorDef):
        from .evaluator.resolver import get_default_resolver

        evaluator_def = await get_default_resolver().resolve(evaluator_def)

    if not isinstance(evaluator_def, CodeEvaluatorDef):
        raise ValueError(f"Unsupported custom evaluator type: {type(evaluator_def).__name__}")

    backend = create_executor(evaluator_def.executor, Path(evaluator_def.path), evaluator_def.timeout)

    evaluator_instance = CustomEvaluatorRunner(
        backend=backend,
        metric_name=evaluator_def.name,
        threshold=evaluator_def.threshold,
        config=evaluator_def.config,
    )

    try:
        # CustomEvaluatorRunner.evaluate_invocations is always a coroutine
        # function, so it is awaited directly (no sync/thread fallback needed).
        eval_result: EvaluationResult = await evaluator_instance.evaluate_invocations(
            actual_invocations=actual_invocations,
            expected_invocations=expected_invocations,
        )

        per_inv_scores = [r.score for r in eval_result.per_invocation_results]

        return MetricResult(
            metric_name=evaluator_def.name,
            score=eval_result.overall_score,
            eval_status=eval_result.overall_eval_status.name,
            per_invocation_scores=per_inv_scores,
        )

    except Exception as exc:
        logger.exception("Failed to evaluate custom evaluator '%s'", evaluator_def.name)
        return MetricResult(
            metric_name=evaluator_def.name,
            error=str(exc),
        )
@@ -0,0 +1,147 @@
1
+ """Load evaluation configuration from a YAML file."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import yaml
10
+
11
+ from .config import (
12
+ BuiltinMetricDef,
13
+ CodeEvaluatorDef,
14
+ CustomEvaluatorDef,
15
+ EvalRunConfig,
16
+ RemoteEvaluatorDef,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
# Maps the ``type`` field of a YAML evaluator entry to the model class used
# to validate that entry.
_TYPE_TO_MODEL = {
    "builtin": BuiltinMetricDef,
    "code": CodeEvaluatorDef,
    "remote": RemoteEvaluatorDef,
}
26
+
27
+
28
def _parse_evaluator_entry(
    entry: dict[str, Any],
) -> tuple[str | None, BuiltinMetricDef | CustomEvaluatorDef | None]:
    """Parse a single evaluator entry from the YAML config.

    Every entry must be a mapping with ``name`` and ``type`` fields.

    Returns:
        A ``(builtin_name, evaluator_def)`` pair:

        * builtin entry without overrides -> ``(name, None)``
        * builtin entry that sets ``threshold`` or ``judge_model`` ->
          ``(name, BuiltinMetricDef)`` (both elements non-None)
        * code / remote entry -> ``(None, evaluator_def)``

    Raises:
        ValueError: the entry is not a mapping, lacks ``name`` or ``type``,
            or names an unknown type. Errors raised by the model's own
            validation propagate unchanged.
    """
    if not isinstance(entry, dict):
        raise ValueError(
            f"Each evaluator entry must be a mapping with 'name' and 'type' fields, got {type(entry).__name__}: {entry!r}"
        )

    name = entry.get("name")
    if not name:
        raise ValueError(f"Evaluator entry must have a 'name' field: {entry}")

    evaluator_type = entry.get("type")
    if not evaluator_type:
        raise ValueError(f"Evaluator entry '{name}' must have a 'type' field (builtin, code, or remote)")

    # The isinstance guard keeps an unhashable YAML value (e.g. a list) from
    # raising TypeError in the membership test instead of a clear ValueError.
    if not isinstance(evaluator_type, str) or evaluator_type not in _TYPE_TO_MODEL:
        raise ValueError(
            f"Unknown evaluator type '{evaluator_type}' for '{name}'. Valid types: {list(_TYPE_TO_MODEL.keys())}"
        )

    model_cls = _TYPE_TO_MODEL[evaluator_type]
    evaluator_def = model_cls.model_validate(entry)

    if evaluator_type == "builtin":
        # Only hand back the parsed definition when it overrides something;
        # a plain builtin is identified by its name alone.
        has_override = evaluator_def.threshold is not None or evaluator_def.judge_model is not None
        return name, (evaluator_def if has_override else None)

    return None, evaluator_def
61
+
62
+
63
def load_eval_config(path: str | Path) -> EvalRunConfig:
    """Load an eval config YAML file and return a partially-filled EvalRunConfig.

    The YAML file uses an ``evaluators`` list where each entry is a mapping
    with ``name`` and ``type`` fields. Built-in entries populate ``metrics``;
    code/remote entries populate ``custom_evaluators``. The optional top-level
    keys ``eval_set``, ``judge_model``, ``threshold`` and ``trace_format`` are
    copied onto the config. Built-in entries carrying overrides are also
    attached to the returned config as ``_builtin_overrides`` (name -> def).

    Raises:
        FileNotFoundError: *path* does not exist.
        ValueError: the document is not a mapping, ``evaluators`` is not a
            list, or an entry is rejected by ``_parse_evaluator_entry``.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Eval config file not found: {path}")

    # Decode explicitly as UTF-8: the default for open() is the platform's
    # locale encoding, which misreads non-ASCII YAML on non-UTF-8 systems.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    if not isinstance(data, dict):
        raise ValueError(f"Eval config must be a YAML mapping, got {type(data).__name__}")

    raw_evaluators = data.get("evaluators", [])
    if not isinstance(raw_evaluators, list):
        raise ValueError("'evaluators' must be a list")

    builtin_names: list[str] = []
    custom_defs: list[CustomEvaluatorDef] = []
    builtin_overrides: dict[str, BuiltinMetricDef] = {}

    for entry in raw_evaluators:
        builtin_name, custom_def = _parse_evaluator_entry(entry)
        if builtin_name:
            builtin_names.append(builtin_name)
        if custom_def:
            if isinstance(custom_def, BuiltinMetricDef):
                # Built-in metric with per-metric overrides: remember the
                # overrides and make sure its name is among the metrics.
                builtin_overrides[custom_def.name] = custom_def
                if custom_def.name not in builtin_names:
                    builtin_names.append(custom_def.name)
            else:
                custom_defs.append(custom_def)

    config = EvalRunConfig(
        trace_files=[],
        metrics=builtin_names,
        custom_evaluators=custom_defs,
    )

    if "eval_set" in data:
        config.eval_set_file = str(data["eval_set"])
    if "judge_model" in data:
        config.judge_model = data["judge_model"]
    if "threshold" in data:
        config.threshold = float(data["threshold"])
    if "trace_format" in data:
        config.trace_format = data["trace_format"]

    config._builtin_overrides = builtin_overrides  # type: ignore[attr-defined]

    return config
118
+
119
+
120
def merge_configs(file_config: EvalRunConfig, cli_config: EvalRunConfig) -> EvalRunConfig:
    """Merge a file-based config with CLI overrides.

    CLI values take precedence for scalar fields when they are set (for
    ``trace_format`` / ``output_format``: when they differ from the defaults
    ``"jaeger-json"`` / ``"table"``). Metrics lists are merged: CLI
    ``--metric`` flags are appended to the file config's built-in metrics,
    skipping names that are already present.

    Neither input is modified; the merged config gets its own ``metrics`` list.
    """
    merged = file_config.model_copy()

    if cli_config.trace_files:
        merged.trace_files = cli_config.trace_files
    if cli_config.eval_set_file is not None:
        merged.eval_set_file = cli_config.eval_set_file
    if cli_config.judge_model is not None:
        merged.judge_model = cli_config.judge_model
    if cli_config.threshold is not None:
        merged.threshold = cli_config.threshold
    if cli_config.trace_format != "jaeger-json":
        merged.trace_format = cli_config.trace_format
    if cli_config.output_format != "table":
        merged.output_format = cli_config.output_format

    # model_copy() is shallow, so merged.metrics is the very list owned by
    # file_config; build a fresh list instead of appending to the shared one.
    combined = list(file_config.metrics)
    seen = set(combined)
    for name in cli_config.metrics:
        if name not in seen:
            seen.add(name)  # also de-duplicates repeated CLI flags
            combined.append(name)
    merged.metrics = combined

    return merged
@@ -0,0 +1,24 @@
1
+ """Evaluator management: sources, templates, and resolution."""
2
+
3
+ from .resolver import EvaluatorResolver, get_default_resolver
4
+ from .sources import (
5
+ BuiltinEvaluatorSource,
6
+ EvaluatorInfo,
7
+ EvaluatorSource,
8
+ FileEvaluatorSource,
9
+ GitHubEvaluatorSource,
10
+ get_sources,
11
+ )
12
+ from .templates import scaffold_evaluator
13
+
14
+ __all__ = [
15
+ "BuiltinEvaluatorSource",
16
+ "EvaluatorInfo",
17
+ "EvaluatorResolver",
18
+ "EvaluatorSource",
19
+ "FileEvaluatorSource",
20
+ "GitHubEvaluatorSource",
21
+ "get_default_resolver",
22
+ "get_sources",
23
+ "scaffold_evaluator",
24
+ ]