agentevals-cli 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentevals/__init__.py +16 -0
- agentevals/_protocol.py +83 -0
- agentevals/api/__init__.py +0 -0
- agentevals/api/app.py +137 -0
- agentevals/api/debug_routes.py +268 -0
- agentevals/api/models.py +204 -0
- agentevals/api/otlp_app.py +25 -0
- agentevals/api/otlp_routes.py +383 -0
- agentevals/api/routes.py +554 -0
- agentevals/api/streaming_routes.py +373 -0
- agentevals/builtin_metrics.py +234 -0
- agentevals/cli.py +643 -0
- agentevals/config.py +108 -0
- agentevals/converter.py +328 -0
- agentevals/custom_evaluators.py +468 -0
- agentevals/eval_config_loader.py +147 -0
- agentevals/evaluator/__init__.py +24 -0
- agentevals/evaluator/resolver.py +70 -0
- agentevals/evaluator/sources.py +293 -0
- agentevals/evaluator/templates.py +224 -0
- agentevals/extraction.py +444 -0
- agentevals/genai_converter.py +538 -0
- agentevals/loader/__init__.py +7 -0
- agentevals/loader/base.py +53 -0
- agentevals/loader/jaeger.py +112 -0
- agentevals/loader/otlp.py +193 -0
- agentevals/mcp_server.py +236 -0
- agentevals/output.py +204 -0
- agentevals/runner.py +310 -0
- agentevals/sdk.py +433 -0
- agentevals/streaming/__init__.py +120 -0
- agentevals/streaming/incremental_processor.py +337 -0
- agentevals/streaming/processor.py +285 -0
- agentevals/streaming/session.py +36 -0
- agentevals/streaming/ws_server.py +806 -0
- agentevals/trace_attrs.py +32 -0
- agentevals/trace_metrics.py +126 -0
- agentevals/utils/__init__.py +0 -0
- agentevals/utils/genai_messages.py +142 -0
- agentevals/utils/log_buffer.py +43 -0
- agentevals/utils/log_enrichment.py +187 -0
- agentevals_cli-0.5.2.dist-info/METADATA +22 -0
- agentevals_cli-0.5.2.dist-info/RECORD +46 -0
- agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
- agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
- agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
"""Custom evaluators that run evaluators via pluggable backends.
|
|
2
|
+
|
|
3
|
+
Every backend implements the same protocol: accept :class:`EvalInput` (JSON)
|
|
4
|
+
and return :class:`EvalResult` (JSON). The transport varies — local
|
|
5
|
+
subprocess, HTTP, Docker container, etc.
|
|
6
|
+
|
|
7
|
+
The protocol types live in :mod:`agentevals._protocol` (CLI-internal) and are
|
|
8
|
+
JSON-wire-compatible with the types in the ``agentevals-evaluator-sdk`` package.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import abc
|
|
14
|
+
import asyncio
|
|
15
|
+
import logging
|
|
16
|
+
import shutil
|
|
17
|
+
import sys
|
|
18
|
+
from collections.abc import Callable
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
from google.adk.evaluation.eval_case import Invocation, get_all_tool_calls
|
|
23
|
+
from google.adk.evaluation.evaluator import EvalStatus, EvaluationResult, Evaluator, PerInvocationResult
|
|
24
|
+
|
|
25
|
+
from agentevals._protocol import (
|
|
26
|
+
EvalInput,
|
|
27
|
+
EvalResult,
|
|
28
|
+
InvocationData,
|
|
29
|
+
ToolCallData,
|
|
30
|
+
ToolResponseData,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# EvaluatorBackend — primary abstraction
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class EvaluatorBackend(abc.ABC):
    """Transport-agnostic channel between the CLI and a single evaluator.

    A backend receives an :class:`EvalInput`, hands it to the evaluator by
    whatever means it implements (subprocess, HTTP, Docker, etc.), and gives
    back the evaluator's :class:`EvalResult`.
    """

    @abc.abstractmethod
    async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
        """Run the evaluator on *eval_input* and return the result it produced."""
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# Runtime — language-specific helpers for SubprocessBackend
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class Runtime(abc.ABC):
    """Describes one interpreter: which file extensions it runs and how."""

    @property
    @abc.abstractmethod
    def name(self) -> str:
        """Display name of the runtime (e.g. ``"Python"``)."""

    @property
    @abc.abstractmethod
    def extensions(self) -> tuple[str, ...]:
        """Suffixes served by this runtime (e.g. ``(".py",)``)."""

    @abc.abstractmethod
    def build_command(self, path: Path) -> list[str]:
        """Build the argv list that executes *path* under this runtime."""

    def is_available(self) -> bool:
        """Report whether a command can be built for this runtime.

        Probes :meth:`build_command` with a placeholder path; a
        ``RuntimeError`` raised by that call means "not available".
        """
        try:
            self.build_command(Path("__probe__"))
        except RuntimeError:
            return False
        else:
            return True
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class PythonRuntime(Runtime):
    """Runs ``.py`` evaluators with ``sys.executable``."""

    @property
    def name(self) -> str:
        return "Python"

    @property
    def extensions(self) -> tuple[str, ...]:
        return (".py",)

    def build_command(self, path: Path) -> list[str]:
        """Return ``[sys.executable, <path>]``."""
        interpreter = sys.executable
        return [interpreter, str(path)]

    def is_available(self) -> bool:
        # Overrides the probing done by the base class: always reports True.
        return True
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class NodeRuntime(Runtime):
    """Runs ``.js`` / ``.ts`` evaluators with the ``node`` binary found on PATH."""

    @property
    def name(self) -> str:
        return "Node.js"

    @property
    def extensions(self) -> tuple[str, ...]:
        return (".js", ".ts")

    def build_command(self, path: Path) -> list[str]:
        """Return ``[<node binary>, <path>]``.

        Raises:
            RuntimeError: ``node`` could not be located on PATH.
        """
        node_bin = shutil.which("node")
        if node_bin:
            return [node_bin, str(path)]
        raise RuntimeError("Node.js not found on PATH (required for .js/.ts evaluators)")
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# Registry of known runtimes; lookups walk it in this order.
_RUNTIMES: list[Runtime] = [PythonRuntime(), NodeRuntime()]
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def get_runtimes() -> list[Runtime]:
    """Return a new list holding every registered runtime.

    The result is a copy, so callers may modify it without touching the
    module-level registry.
    """
    return [*_RUNTIMES]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def supported_extensions() -> set[str]:
    """Collect the file extensions of every registered runtime into one set."""
    return {ext for runtime in _RUNTIMES for ext in runtime.extensions}
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _resolve_runtime(path: Path) -> Runtime:
    """Pick the first registered runtime whose extensions include *path*'s suffix.

    The suffix is lower-cased before matching.

    Raises:
        ValueError: No registered runtime lists the suffix.
    """
    suffix = path.suffix.lower()
    match = next((rt for rt in _RUNTIMES if suffix in rt.extensions), None)
    if match is None:
        raise ValueError(f"No runtime registered for extension '{suffix}'. Supported: {sorted(supported_extensions())}")
    return match
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
# Subprocess runner (used by SubprocessBackend)
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
async def _run_subprocess(
    cmd: list[str],
    input_json: str,
    timeout: int,
    metric_name: str,
) -> EvalResult:
    """Run *cmd*, write *input_json* to its stdin, and parse its stdout as an EvalResult.

    Args:
        cmd: argv list passed to ``asyncio.create_subprocess_exec``.
        input_json: Serialized input written to the child's stdin.
        timeout: Seconds to wait for the child to finish.
        metric_name: Evaluator name, used in log and error messages.

    Returns:
        The ``EvalResult`` validated from the child's stdout.

    Raises:
        TimeoutError: The child did not finish within *timeout* seconds
            (it is killed before the error is raised).
        RuntimeError: The child exited non-zero, wrote nothing to stdout,
            or wrote output that is not valid UTF-8 / does not validate.
    """
    logger.info("Running custom evaluator %r: %s", metric_name, " ".join(cmd))

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    try:
        stdout_bytes, stderr_bytes = await asyncio.wait_for(
            proc.communicate(input=input_json.encode()),
            timeout=timeout,
        )
    except asyncio.TimeoutError as exc:
        # asyncio.TimeoutError only became an alias of the builtin TimeoutError
        # in Python 3.11; catching the asyncio name works on every version.
        try:
            proc.kill()
        except ProcessLookupError:
            # The child exited between the timeout firing and the kill.
            pass
        await proc.wait()
        raise TimeoutError(f"Custom evaluator '{metric_name}' timed out after {timeout}s") from exc

    stderr_text = stderr_bytes.decode(errors="replace").strip()
    if stderr_text:
        logger.debug("Custom evaluator %r stderr:\n%s", metric_name, stderr_text)

    if proc.returncode != 0:
        raise RuntimeError(
            f"Custom evaluator '{metric_name}' exited with code {proc.returncode}"
            + (f": {stderr_text}" if stderr_text else "")
        )

    try:
        stdout_text = stdout_bytes.decode().strip()
    except UnicodeDecodeError as exc:
        # Report undecodable output the same way as every other evaluator failure.
        raise RuntimeError(f"Custom evaluator '{metric_name}' produced output that is not valid UTF-8: {exc}") from exc

    if not stdout_text:
        hint = ""
        if stderr_text:
            hint = f"\nEvaluator stderr:\n{stderr_text}"
        raise RuntimeError(f"Custom evaluator '{metric_name}' produced no output on stdout" + hint)

    try:
        return EvalResult.model_validate_json(stdout_text)
    except Exception as exc:
        raise RuntimeError(f"Custom evaluator '{metric_name}' produced invalid JSON: {exc}") from exc
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ---------------------------------------------------------------------------
|
|
198
|
+
# Backend implementations
|
|
199
|
+
# ---------------------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class SubprocessBackend(EvaluatorBackend):
    """Backend that executes a local evaluator file (.py, .js, .ts, …) as a child process.

    The interpreter is chosen from the file's extension using the
    :data:`_RUNTIMES` registry.
    """

    def __init__(self, path: Path, timeout: int = 30):
        resolved = path.resolve()
        # The runtime lookup runs before the existence check, so an unsupported
        # extension raises ValueError even when the file is missing.
        runtime = _resolve_runtime(resolved)

        self._path = resolved
        self._runtime = runtime
        self._timeout = timeout

        if not resolved.exists():
            raise FileNotFoundError(f"Evaluator file not found: {resolved}")

    async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
        """Serialize *eval_input* to JSON and run the evaluator file on it."""
        argv = self._runtime.build_command(self._path)
        payload = eval_input.model_dump_json()
        return await _run_subprocess(argv, payload, self._timeout, metric_name)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# ---------------------------------------------------------------------------
|
|
223
|
+
# Executor factory
|
|
224
|
+
# ---------------------------------------------------------------------------
|
|
225
|
+
|
|
226
|
+
# Maps an executor name to a callable that builds its backend from (path, timeout).
_EXECUTOR_FACTORIES: dict[str, Callable[[Path, int], EvaluatorBackend]] = {}
_EXECUTOR_FACTORIES["local"] = lambda path, timeout: SubprocessBackend(path, timeout)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def create_executor(executor_name: str, path: Path, timeout: int = 30) -> EvaluatorBackend:
    """Build the backend registered under *executor_name* for the evaluator at *path*.

    Raises:
        ValueError: *executor_name* is not registered; the message lists the
            registered names.
    """
    build = _EXECUTOR_FACTORIES.get(executor_name)
    if build is not None:
        return build(path, timeout)
    raise ValueError(f"Unknown executor '{executor_name}'. Available: {sorted(_EXECUTOR_FACTORIES.keys())}")
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def register_executor(name: str, factory: Callable[[Path, int], EvaluatorBackend]) -> None:
    """Add *factory* to the executor registry under *name*.

    An existing entry with the same name is replaced.
    """
    _EXECUTOR_FACTORIES.update({name: factory})
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
# ---------------------------------------------------------------------------
|
|
245
|
+
# ADK Invocation ↔ InvocationData conversion
|
|
246
|
+
# ---------------------------------------------------------------------------
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _content_to_text(content) -> str:
|
|
250
|
+
"""Extract plain text from an ADK Content object."""
|
|
251
|
+
if content is None:
|
|
252
|
+
return ""
|
|
253
|
+
if isinstance(content, str):
|
|
254
|
+
return content
|
|
255
|
+
if hasattr(content, "parts") and content.parts:
|
|
256
|
+
texts = []
|
|
257
|
+
for part in content.parts:
|
|
258
|
+
if hasattr(part, "text") and part.text:
|
|
259
|
+
texts.append(part.text)
|
|
260
|
+
return " ".join(texts)
|
|
261
|
+
return ""
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _extract_tool_calls_from_invocation(inv: Invocation) -> list[ToolCallData]:
|
|
265
|
+
"""Extract tool calls from an Invocation's intermediate_data."""
|
|
266
|
+
calls: list[ToolCallData] = []
|
|
267
|
+
if not inv.intermediate_data:
|
|
268
|
+
return calls
|
|
269
|
+
|
|
270
|
+
try:
|
|
271
|
+
tool_uses = get_all_tool_calls(inv.intermediate_data)
|
|
272
|
+
for tc in tool_uses:
|
|
273
|
+
calls.append(ToolCallData(name=tc.name or "", args=tc.args or {}))
|
|
274
|
+
except Exception:
|
|
275
|
+
pass
|
|
276
|
+
|
|
277
|
+
return calls
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _extract_tool_responses_from_invocation(inv: Invocation) -> list[ToolResponseData]:
|
|
281
|
+
"""Extract tool responses from intermediate_data."""
|
|
282
|
+
responses: list[ToolResponseData] = []
|
|
283
|
+
if not inv.intermediate_data:
|
|
284
|
+
return responses
|
|
285
|
+
|
|
286
|
+
if hasattr(inv.intermediate_data, "tool_responses"):
|
|
287
|
+
for tr in inv.intermediate_data.tool_responses or []:
|
|
288
|
+
name = ""
|
|
289
|
+
output = ""
|
|
290
|
+
if hasattr(tr, "name"):
|
|
291
|
+
name = tr.name or ""
|
|
292
|
+
if hasattr(tr, "response"):
|
|
293
|
+
output = str(tr.response) if tr.response else ""
|
|
294
|
+
elif hasattr(tr, "output"):
|
|
295
|
+
output = str(tr.output) if tr.output else ""
|
|
296
|
+
responses.append(ToolResponseData(name=name, output=output))
|
|
297
|
+
|
|
298
|
+
return responses
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def invocation_to_data(inv: Invocation) -> InvocationData:
    """Build the protocol-level ``InvocationData`` for one ADK invocation.

    Content fields are reduced to plain text; an empty final response is
    passed on as ``None``.
    """
    user_text = _content_to_text(inv.user_content)
    final_text = _content_to_text(inv.final_response)
    return InvocationData(
        invocation_id=inv.invocation_id or "",
        user_content=user_text,
        final_response=final_text if final_text else None,
        tool_calls=_extract_tool_calls_from_invocation(inv),
        tool_responses=_extract_tool_responses_from_invocation(inv),
    )
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def invocations_to_data(invocations: list[Invocation] | None) -> list[InvocationData] | None:
|
|
313
|
+
"""Convert a list of ADK Invocations, or return None."""
|
|
314
|
+
if invocations is None:
|
|
315
|
+
return None
|
|
316
|
+
return [invocation_to_data(inv) for inv in invocations]
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
# ---------------------------------------------------------------------------
|
|
320
|
+
# EvalResult → EvaluationResult conversion
|
|
321
|
+
# ---------------------------------------------------------------------------
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _eval_result_to_evaluation_result(
    result: EvalResult,
    threshold: float,
    actual_invocations: list[Invocation],
) -> EvaluationResult:
    """Translate a protocol ``EvalResult`` into an ADK ``EvaluationResult``.

    The overall status comes from ``result.status`` when it is set (matched
    case-insensitively; unrecognized values map to ``NOT_EVALUATED``), and
    otherwise from comparing ``result.score`` with *threshold*. Every
    per-invocation entry carries that same overall status; its score is taken
    by position from ``result.per_invocation_scores`` and is ``None`` when the
    list is shorter than *actual_invocations*.
    """
    if not result.status:
        passed = result.score >= threshold
        overall_status = EvalStatus.PASSED if passed else EvalStatus.FAILED
    else:
        known_statuses = {
            "PASSED": EvalStatus.PASSED,
            "FAILED": EvalStatus.FAILED,
            "NOT_EVALUATED": EvalStatus.NOT_EVALUATED,
        }
        overall_status = known_statuses.get(result.status.upper(), EvalStatus.NOT_EVALUATED)

    per_inv_results: list[PerInvocationResult] = [
        PerInvocationResult(
            actual_invocation=inv,
            score=result.per_invocation_scores[idx] if idx < len(result.per_invocation_scores) else None,
            eval_status=overall_status,
        )
        for idx, inv in enumerate(actual_invocations)
    ]

    return EvaluationResult(
        overall_score=result.score,
        overall_eval_status=overall_status,
        per_invocation_results=per_inv_results,
    )
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
# ---------------------------------------------------------------------------
|
|
359
|
+
# CustomEvaluatorRunner — ADK Evaluator adapter (backend-agnostic)
|
|
360
|
+
# ---------------------------------------------------------------------------
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
class CustomEvaluatorRunner(Evaluator):
    """ADK :class:`Evaluator` adapter around any :class:`EvaluatorBackend`.

    Converts ADK ``Invocation`` objects into the language-agnostic
    ``EvalInput`` sent to the backend, and converts the backend's
    ``EvalResult`` back into an ADK ``EvaluationResult``.
    """

    def __init__(
        self,
        backend: EvaluatorBackend,
        metric_name: str,
        threshold: float = 0.5,
        config: dict[str, Any] | None = None,
    ):
        self._backend = backend
        self._metric_name = metric_name
        self._threshold = threshold
        # A missing or empty config is stored as a fresh empty dict.
        self._config = config if config else {}

    async def evaluate_invocations(
        self,
        actual_invocations: list[Invocation],
        expected_invocations: list[Invocation] | None = None,
        conversation_scenario=None,
    ) -> EvaluationResult:
        """Run the backend on the given invocations and return its ADK result.

        *conversation_scenario* is accepted but not used here.
        """
        actual_data = invocations_to_data(actual_invocations)
        expected_data = invocations_to_data(expected_invocations)

        payload = EvalInput(
            metric_name=self._metric_name,
            threshold=self._threshold,
            config=self._config,
            invocations=actual_data or [],
            expected_invocations=expected_data,
        )

        outcome = await self._backend.run(payload, self._metric_name)
        return _eval_result_to_evaluation_result(outcome, self._threshold, actual_invocations)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
# ---------------------------------------------------------------------------
|
|
402
|
+
# Public helper — build and run a custom evaluator from a config definition
|
|
403
|
+
# ---------------------------------------------------------------------------
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
async def evaluate_custom_evaluator(
    evaluator_def,
    actual_invocations: list[Invocation],
    expected_invocations: list[Invocation] | None,
):
    """Evaluate a single custom evaluator and return a ``MetricResult``.

    This is the entry point called by the runner. A ``RemoteEvaluatorDef`` is
    first resolved through the default resolver; the resulting
    ``CodeEvaluatorDef`` is turned into a backend, wrapped in a
    ``CustomEvaluatorRunner``, and run.

    Args:
        evaluator_def: A ``CodeEvaluatorDef`` or ``RemoteEvaluatorDef``.
        actual_invocations: Invocations produced by the agent under test.
        expected_invocations: Reference invocations, or ``None``.

    Returns:
        A ``MetricResult`` with the score, status name and per-invocation
        scores; if the evaluation itself fails, a ``MetricResult`` whose
        ``error`` holds the exception text.

    Raises:
        ValueError: The definition (after remote resolution) is not a
            ``CodeEvaluatorDef``. Errors from resolving a remote definition or
            from constructing the backend also propagate to the caller.
    """
    # Imported inside the function rather than at module level — presumably to
    # avoid an import cycle with .runner; TODO confirm.
    from .config import CodeEvaluatorDef, RemoteEvaluatorDef
    from .runner import MetricResult

    if isinstance(evaluator_def, RemoteEvaluatorDef):
        from .evaluator.resolver import get_default_resolver

        evaluator_def = await get_default_resolver().resolve(evaluator_def)

    if not isinstance(evaluator_def, CodeEvaluatorDef):
        raise ValueError(f"Unsupported custom evaluator type: {type(evaluator_def).__name__}")

    backend = create_executor(evaluator_def.executor, Path(evaluator_def.path), evaluator_def.timeout)

    evaluator_instance = CustomEvaluatorRunner(
        backend=backend,
        metric_name=evaluator_def.name,
        threshold=evaluator_def.threshold,
        config=evaluator_def.config,
    )

    try:
        # CustomEvaluatorRunner.evaluate_invocations is always a coroutine
        # function, so it is awaited directly.
        eval_result: EvaluationResult = await evaluator_instance.evaluate_invocations(
            actual_invocations=actual_invocations,
            expected_invocations=expected_invocations,
        )

        return MetricResult(
            metric_name=evaluator_def.name,
            score=eval_result.overall_score,
            eval_status=eval_result.overall_eval_status.name,
            per_invocation_scores=[r.score for r in eval_result.per_invocation_results],
        )

    except Exception as exc:
        # Evaluation failures are reported in the result instead of raised.
        logger.exception("Failed to evaluate custom evaluator '%s'", evaluator_def.name)
        return MetricResult(
            metric_name=evaluator_def.name,
            error=str(exc),
        )
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Load evaluation configuration from a YAML file."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
from .config import (
|
|
12
|
+
BuiltinMetricDef,
|
|
13
|
+
CodeEvaluatorDef,
|
|
14
|
+
CustomEvaluatorDef,
|
|
15
|
+
EvalRunConfig,
|
|
16
|
+
RemoteEvaluatorDef,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Model class used to validate an evaluator entry, keyed by the entry's ``type`` field.
_TYPE_TO_MODEL = dict(
    builtin=BuiltinMetricDef,
    code=CodeEvaluatorDef,
    remote=RemoteEvaluatorDef,
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _parse_evaluator_entry(entry: dict[str, Any]) -> tuple[str | None, CustomEvaluatorDef | None]:
|
|
29
|
+
"""Parse a single evaluator entry from the YAML config.
|
|
30
|
+
|
|
31
|
+
Every entry must be a dict with ``name`` and ``type`` fields.
|
|
32
|
+
Returns (builtin_name, custom_evaluator_def). Exactly one will be non-None.
|
|
33
|
+
"""
|
|
34
|
+
if not isinstance(entry, dict):
|
|
35
|
+
raise ValueError(
|
|
36
|
+
f"Each evaluator entry must be a mapping with 'name' and 'type' fields, got {type(entry).__name__}: {entry!r}"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
name = entry.get("name")
|
|
40
|
+
if not name:
|
|
41
|
+
raise ValueError(f"Evaluator entry must have a 'name' field: {entry}")
|
|
42
|
+
|
|
43
|
+
evaluator_type = entry.get("type")
|
|
44
|
+
if not evaluator_type:
|
|
45
|
+
raise ValueError(f"Evaluator entry '{name}' must have a 'type' field (builtin, code, or remote)")
|
|
46
|
+
|
|
47
|
+
if evaluator_type not in _TYPE_TO_MODEL:
|
|
48
|
+
raise ValueError(
|
|
49
|
+
f"Unknown evaluator type '{evaluator_type}' for '{name}'. Valid types: {list(_TYPE_TO_MODEL.keys())}"
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
model_cls = _TYPE_TO_MODEL[evaluator_type]
|
|
53
|
+
evaluator_def = model_cls.model_validate(entry)
|
|
54
|
+
|
|
55
|
+
if evaluator_type == "builtin":
|
|
56
|
+
return name, evaluator_def if (
|
|
57
|
+
evaluator_def.threshold is not None or evaluator_def.judge_model is not None
|
|
58
|
+
) else None
|
|
59
|
+
|
|
60
|
+
return None, evaluator_def
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def load_eval_config(path: str | Path) -> EvalRunConfig:
    """Load an eval config YAML file and return a partially-filled EvalRunConfig.

    The YAML file uses an ``evaluators`` list where each entry is a dict with
    ``name`` and ``type`` fields. Built-in entries populate ``metrics``;
    code/remote entries populate ``custom_evaluators``. A missing or empty
    (null) ``evaluators`` key is treated as an empty list. The optional
    top-level keys ``eval_set``, ``judge_model``, ``threshold`` and
    ``trace_format`` are copied onto the config when present.

    Args:
        path: Location of the YAML file.

    Returns:
        An ``EvalRunConfig`` with ``trace_files`` left empty. Built-in entries
        that carry overrides are attached as ``_builtin_overrides``.

    Raises:
        FileNotFoundError: *path* does not exist.
        ValueError: The document is not a mapping, ``evaluators`` is not a
            list, or an entry is invalid.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Eval config file not found: {path}")

    # Read as UTF-8 explicitly; the default encoding of open() depends on the locale.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    if not isinstance(data, dict):
        raise ValueError(f"Eval config must be a YAML mapping, got {type(data).__name__}")

    raw_evaluators = data.get("evaluators")
    if raw_evaluators is None:
        # A bare ``evaluators:`` key parses to null; treat it like an absent key.
        raw_evaluators = []
    if not isinstance(raw_evaluators, list):
        raise ValueError("'evaluators' must be a list")

    builtin_names: list[str] = []
    custom_defs: list[CustomEvaluatorDef] = []
    builtin_overrides: dict[str, BuiltinMetricDef] = {}

    for entry in raw_evaluators:
        builtin_name, custom_def = _parse_evaluator_entry(entry)
        if builtin_name:
            builtin_names.append(builtin_name)
        if custom_def:
            if isinstance(custom_def, BuiltinMetricDef):
                # Built-in entry with overrides: remember the definition by name.
                builtin_overrides[custom_def.name] = custom_def
                if custom_def.name not in builtin_names:
                    builtin_names.append(custom_def.name)
            else:
                custom_defs.append(custom_def)

    config = EvalRunConfig(
        trace_files=[],
        metrics=builtin_names,
        custom_evaluators=custom_defs,
    )

    if "eval_set" in data:
        config.eval_set_file = str(data["eval_set"])
    if "judge_model" in data:
        config.judge_model = data["judge_model"]
    if "threshold" in data:
        config.threshold = float(data["threshold"])
    if "trace_format" in data:
        config.trace_format = data["trace_format"]

    config._builtin_overrides = builtin_overrides  # type: ignore[attr-defined]

    return config
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def merge_configs(file_config: EvalRunConfig, cli_config: EvalRunConfig) -> EvalRunConfig:
    """Merge a file-based config with CLI overrides.

    CLI values take precedence for scalar fields: ``trace_files`` when
    non-empty; ``eval_set_file``, ``judge_model`` and ``threshold`` when not
    ``None``; ``trace_format`` and ``output_format`` when they differ from
    ``"jaeger-json"`` and ``"table"`` respectively. Metrics lists are merged:
    CLI ``--metric`` flags are appended to the file config's built-in metrics,
    skipping names already present.

    Args:
        file_config: Config loaded from the YAML file; it is not modified.
        cli_config: Config built from command-line flags.

    Returns:
        A copy of *file_config* with the CLI overrides applied.
    """
    merged = file_config.model_copy()

    if cli_config.trace_files:
        merged.trace_files = cli_config.trace_files
    if cli_config.eval_set_file is not None:
        merged.eval_set_file = cli_config.eval_set_file
    if cli_config.judge_model is not None:
        merged.judge_model = cli_config.judge_model
    if cli_config.threshold is not None:
        merged.threshold = cli_config.threshold
    if cli_config.trace_format != "jaeger-json":
        merged.trace_format = cli_config.trace_format
    if cli_config.output_format != "table":
        merged.output_format = cli_config.output_format

    # The copy above is shallow, so ``merged.metrics`` is the same list object
    # as ``file_config.metrics``. Build a fresh list rather than appending in
    # place, which would silently modify the caller's file config.
    combined = list(merged.metrics)
    seen = set(combined)
    for name in cli_config.metrics:
        if name not in seen:
            # Track additions too, so a metric repeated on the CLI is added once.
            seen.add(name)
            combined.append(name)
    merged.metrics = combined

    return merged
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Evaluator management: sources, templates, and resolution."""
|
|
2
|
+
|
|
3
|
+
from .resolver import EvaluatorResolver, get_default_resolver
|
|
4
|
+
from .sources import (
|
|
5
|
+
BuiltinEvaluatorSource,
|
|
6
|
+
EvaluatorInfo,
|
|
7
|
+
EvaluatorSource,
|
|
8
|
+
FileEvaluatorSource,
|
|
9
|
+
GitHubEvaluatorSource,
|
|
10
|
+
get_sources,
|
|
11
|
+
)
|
|
12
|
+
from .templates import scaffold_evaluator
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"BuiltinEvaluatorSource",
|
|
16
|
+
"EvaluatorInfo",
|
|
17
|
+
"EvaluatorResolver",
|
|
18
|
+
"EvaluatorSource",
|
|
19
|
+
"FileEvaluatorSource",
|
|
20
|
+
"GitHubEvaluatorSource",
|
|
21
|
+
"get_default_resolver",
|
|
22
|
+
"get_sources",
|
|
23
|
+
"scaffold_evaluator",
|
|
24
|
+
]
|