agentevals-cli 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentevals/__init__.py +16 -0
- agentevals/_protocol.py +83 -0
- agentevals/api/__init__.py +0 -0
- agentevals/api/app.py +137 -0
- agentevals/api/debug_routes.py +268 -0
- agentevals/api/models.py +204 -0
- agentevals/api/otlp_app.py +25 -0
- agentevals/api/otlp_routes.py +383 -0
- agentevals/api/routes.py +554 -0
- agentevals/api/streaming_routes.py +373 -0
- agentevals/builtin_metrics.py +234 -0
- agentevals/cli.py +643 -0
- agentevals/config.py +108 -0
- agentevals/converter.py +328 -0
- agentevals/custom_evaluators.py +468 -0
- agentevals/eval_config_loader.py +147 -0
- agentevals/evaluator/__init__.py +24 -0
- agentevals/evaluator/resolver.py +70 -0
- agentevals/evaluator/sources.py +293 -0
- agentevals/evaluator/templates.py +224 -0
- agentevals/extraction.py +444 -0
- agentevals/genai_converter.py +538 -0
- agentevals/loader/__init__.py +7 -0
- agentevals/loader/base.py +53 -0
- agentevals/loader/jaeger.py +112 -0
- agentevals/loader/otlp.py +193 -0
- agentevals/mcp_server.py +236 -0
- agentevals/output.py +204 -0
- agentevals/runner.py +310 -0
- agentevals/sdk.py +433 -0
- agentevals/streaming/__init__.py +120 -0
- agentevals/streaming/incremental_processor.py +337 -0
- agentevals/streaming/processor.py +285 -0
- agentevals/streaming/session.py +36 -0
- agentevals/streaming/ws_server.py +806 -0
- agentevals/trace_attrs.py +32 -0
- agentevals/trace_metrics.py +126 -0
- agentevals/utils/__init__.py +0 -0
- agentevals/utils/genai_messages.py +142 -0
- agentevals/utils/log_buffer.py +43 -0
- agentevals/utils/log_enrichment.py +187 -0
- agentevals_cli-0.5.2.dist-info/METADATA +22 -0
- agentevals_cli-0.5.2.dist-info/RECORD +46 -0
- agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
- agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
- agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Resolve remote evaluator references to local cached files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .sources import EvaluatorSource, get_sources
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
_DEFAULT_CACHE_DIR = Path.home() / ".cache" / "agentevals" / "evaluators"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class EvaluatorResolver:
    """Downloads and caches remote evaluators, converting them to local paths.

    Sources are registered under their ``source_name``. :meth:`resolve` looks
    up the source named by the evaluator definition and stores the fetched
    file at ``<cache_dir>/<source>/<ref>``.
    """

    def __init__(self, cache_dir: Path | None = None):
        """Create a resolver.

        Args:
            cache_dir: Directory used for downloaded evaluators. When omitted
                (or falsy) the module-level ``_DEFAULT_CACHE_DIR`` is used.
        """
        self._cache_dir = cache_dir or _DEFAULT_CACHE_DIR
        self._sources: dict[str, EvaluatorSource] = {}

    def register_source(self, source: EvaluatorSource) -> None:
        """Register *source* under its ``source_name``, replacing any earlier entry."""
        self._sources[source.source_name] = source

    def _cache_path(self, source_name: str, ref: str) -> Path:
        """Return the cache location for *ref* under *source_name*.

        Raises:
            ValueError: if *ref* is empty or would place the file outside the
                per-source cache directory (``..`` segments or an absolute path).
        """
        root = Path(self._cache_dir) / source_name
        dest = root / ref
        # Joining an absolute ref discards ``root`` entirely and ``..`` segments
        # can climb out of it, so compare the fully resolved locations.
        try:
            relative = dest.resolve().relative_to(root.resolve())
        except ValueError:
            raise ValueError(
                f"Evaluator ref '{ref}' resolves outside the cache directory for source '{source_name}'"
            ) from None
        if not relative.parts:
            # An empty ref would point at the source directory itself.
            raise ValueError(f"Evaluator ref '{ref}' does not name a file")
        # Return the unresolved path so valid refs yield the same string as before.
        return dest

    async def resolve(self, evaluator_def) -> "CodeEvaluatorDef":  # noqa: F821
        """Download a remote evaluator and return a CodeEvaluatorDef pointing to the cached file.

        The download is skipped when the destination already exists.

        Raises:
            TypeError: if *evaluator_def* is not a ``RemoteEvaluatorDef``.
            ValueError: if the definition names an unregistered source, or its
                ``ref`` does not map to a file inside the cache directory.
        """
        # Imported lazily, as in the rest of this module's design.
        from ..config import CodeEvaluatorDef, RemoteEvaluatorDef

        if not isinstance(evaluator_def, RemoteEvaluatorDef):
            raise TypeError(f"Expected RemoteEvaluatorDef, got {type(evaluator_def).__name__}")

        source = self._sources.get(evaluator_def.source)
        if source is None:
            raise ValueError(
                f"Unknown evaluator source '{evaluator_def.source}'. Available: {sorted(self._sources.keys())}"
            )

        dest = self._cache_path(evaluator_def.source, evaluator_def.ref)
        if not dest.exists():
            logger.info(
                "Downloading evaluator '%s' from %s (ref: %s)",
                evaluator_def.name,
                evaluator_def.source,
                evaluator_def.ref,
            )
            await source.fetch_evaluator(evaluator_def.ref, dest)
        else:
            logger.debug("Using cached evaluator '%s' at %s", evaluator_def.name, dest)

        return CodeEvaluatorDef(
            name=evaluator_def.name,
            path=str(dest),
            threshold=evaluator_def.threshold,
            timeout=evaluator_def.timeout,
            config=evaluator_def.config,
            executor=evaluator_def.executor,
        )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
_default_resolver: EvaluatorResolver | None = None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def get_default_resolver() -> EvaluatorResolver:
    """Return the shared resolver, building it on first use.

    The first call creates an ``EvaluatorResolver`` with the default cache
    directory and registers every source returned by ``get_sources()``;
    subsequent calls return that same instance.
    """
    global _default_resolver
    if _default_resolver is not None:
        return _default_resolver

    _default_resolver = EvaluatorResolver()
    for registered in get_sources():
        _default_resolver.register_source(registered)
    return _default_resolver
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
"""Evaluator source backends: discover and fetch evaluators from various registries."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import abc
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import time
|
|
10
|
+
from dataclasses import asdict, dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import yaml
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
_DEFAULT_REPO = "agentevals-dev/evaluators"
|
|
18
|
+
_DEFAULT_BRANCH = "main"
|
|
19
|
+
_DEFAULT_INDEX = "index.yaml"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
|
|
23
|
+
class EvaluatorInfo:
|
|
24
|
+
"""Metadata for a single evaluator, regardless of where it comes from."""
|
|
25
|
+
|
|
26
|
+
name: str
|
|
27
|
+
description: str
|
|
28
|
+
source: str
|
|
29
|
+
language: str | None = None
|
|
30
|
+
ref: str | None = None
|
|
31
|
+
tags: list[str] = field(default_factory=list)
|
|
32
|
+
author: str | None = None
|
|
33
|
+
last_updated: str | None = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class EvaluatorSource(abc.ABC):
    """Abstract interface for a registry backend that lists and fetches evaluators."""

    @property
    @abc.abstractmethod
    def source_name(self) -> str:
        """Short identifier for this source."""

    @abc.abstractmethod
    async def list_evaluators(self) -> list[EvaluatorInfo]:
        """Return metadata for the evaluators this source offers."""

    @abc.abstractmethod
    async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
        """Download the evaluator identified by *ref* and write it to *dest*.

        Returns the path to the downloaded file.
        """
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
_CACHE_TTL_SECONDS = 86400 # 24 hours
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _cache_dir() -> Path:
|
|
58
|
+
base = Path(os.environ.get("XDG_CACHE_HOME", Path.home() / ".cache"))
|
|
59
|
+
d = base / "agentevals"
|
|
60
|
+
d.mkdir(parents=True, exist_ok=True)
|
|
61
|
+
return d
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _read_cache(key: str, ttl: int = _CACHE_TTL_SECONDS) -> list[EvaluatorInfo] | None:
    """Load the cached evaluator list stored under *key*.

    The cache file is ``<cache dir>/<key>.json`` and holds a JSON object with
    a ``"ts"`` timestamp and an ``"evaluators"`` list of ``EvaluatorInfo``
    keyword arguments.

    Args:
        key: Cache entry name (file stem).
        ttl: Maximum age in seconds before the entry counts as stale.

    Returns:
        The cached evaluators, or ``None`` when the entry is missing, older
        than *ttl*, or cannot be read. Reading is best-effort and never raises.
    """
    try:
        # Resolving the cache directory may itself fail (it creates the
        # directory), so it belongs inside the best-effort guard as well.
        cache_file = _cache_dir() / f"{key}.json"
        data = json.loads(cache_file.read_text(encoding="utf-8"))
        if time.time() - data.get("ts", 0) > ttl:
            return None
        return [EvaluatorInfo(**item) for item in data["evaluators"]]
    except FileNotFoundError:
        # No cache entry yet: the normal cold-start case.
        return None
    except Exception as exc:
        # Corrupt or incompatible cache content; fall back to a fresh load.
        logger.debug("Ignoring unreadable evaluator cache '%s': %s", key, exc)
        return None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _write_cache(key: str, evaluators: list[EvaluatorInfo]) -> None:
    """Store *evaluators* in the cache under *key*.

    Writes ``<cache dir>/<key>.json`` containing the current time as ``"ts"``
    and each evaluator converted with ``dataclasses.asdict``. Caching is
    best-effort: failures are logged at debug level and never raised.
    """
    try:
        # The cache directory lookup creates the directory and may fail, so it
        # is guarded together with the write.
        cache_file = _cache_dir() / f"{key}.json"
        payload = json.dumps(
            {
                "ts": time.time(),
                "evaluators": [asdict(info) for info in evaluators],
            }
        )
        cache_file.write_text(payload, encoding="utf-8")
    except Exception as exc:
        logger.debug("Could not write evaluator cache '%s': %s", key, exc)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class BuiltinEvaluatorSource(EvaluatorSource):
    """Exposes ADK's built-in metric registry through the evaluator source interface."""

    @property
    def source_name(self) -> str:
        return "builtin"

    async def list_evaluators(self) -> list[EvaluatorInfo]:
        """List ADK's built-in metrics, using the on-disk cache when it is fresh.

        On a cache miss the metrics are loaded in a worker thread and the
        result is written back to the cache before being returned.
        """
        import asyncio
        import warnings

        from_cache = _read_cache("builtin")
        if from_cache is not None:
            return from_cache

        def _load() -> list[EvaluatorInfo]:
            collected: list[EvaluatorInfo] = []
            try:
                # Warnings raised while importing/reading the registry are suppressed.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    from google.adk.evaluation.metric_evaluator_registry import (
                        DEFAULT_METRIC_EVALUATOR_REGISTRY,
                    )

                    collected.extend(
                        EvaluatorInfo(
                            name=metric.metric_name,
                            description=metric.description or "No description",
                            source=self.source_name,
                            language=None,
                            ref=None,
                        )
                        for metric in DEFAULT_METRIC_EVALUATOR_REGISTRY.get_registered_metrics()
                    )
            except ImportError:
                # Registry unavailable: fall back to the plain metric enum.
                from google.adk.evaluation.eval_metrics import PrebuiltMetrics

                collected.extend(
                    EvaluatorInfo(
                        name=prebuilt.value,
                        description="(install google-adk[eval] for full details)",
                        source=self.source_name,
                    )
                    for prebuilt in PrebuiltMetrics
                )
            return collected

        loaded = await asyncio.to_thread(_load)
        _write_cache("builtin", loaded)
        return loaded

    async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
        """Always raises: built-in evaluators have no file to download."""
        raise NotImplementedError("Built-in evaluators are part of ADK and cannot be fetched as files.")
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class GitHubEvaluatorSource(EvaluatorSource):
    """Fetches evaluators from a GitHub repository with a CI-generated index.yaml."""

    def __init__(
        self,
        repo: str | None = None,
        branch: str | None = None,
        index_path: str | None = None,
        token: str | None = None,
    ):
        """Configure the repository to read from.

        Args:
            repo: ``owner/name`` of the repository. Falls back to
                ``$AGENTEVALS_EVALUATOR_REPO``, then the built-in default.
            branch: Branch to read. Falls back to
                ``$AGENTEVALS_EVALUATOR_BRANCH``, then the built-in default.
            index_path: Path of the index file inside the repository.
            token: GitHub token. Falls back to ``$AGENTEVALS_GITHUB_TOKEN``,
                then ``$GITHUB_TOKEN``; requests are unauthenticated if none is set.
        """
        self._repo = repo or os.environ.get("AGENTEVALS_EVALUATOR_REPO", _DEFAULT_REPO)
        self._branch = branch or os.environ.get("AGENTEVALS_EVALUATOR_BRANCH", _DEFAULT_BRANCH)
        self._index_path = index_path or _DEFAULT_INDEX
        self._token = token or os.environ.get("AGENTEVALS_GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")

    @property
    def source_name(self) -> str:
        return "github"

    def _raw_url(self, path: str) -> str:
        """Build the raw-content URL for *path* on the configured repo and branch."""
        return f"https://raw.githubusercontent.com/{self._repo}/{self._branch}/{path}"

    def _headers(self) -> dict[str, str]:
        """Return the request headers; an Authorization header only when a token is set."""
        if self._token:
            return {"Authorization": f"token {self._token}"}
        return {}

    def _parse_index(self, text: str, origin: str) -> list[EvaluatorInfo]:
        """Turn the YAML index *text* into evaluator metadata.

        Any structural problem is logged as a warning (mentioning *origin*)
        and results in an empty list or a skipped entry rather than an error,
        matching how a failed download is handled.
        """
        try:
            data = yaml.safe_load(text)
        except yaml.YAMLError as exc:
            logger.warning("Evaluator index at %s is not valid YAML: %s", origin, exc)
            return []
        if not isinstance(data, dict):
            logger.warning("Evaluator index at %s is not a valid YAML mapping", origin)
            return []

        # ``evaluators: null`` parses to None, so ``.get(..., [])`` is not enough.
        entries = data.get("evaluators") or []
        if not isinstance(entries, list):
            logger.warning("Evaluator index at %s has a non-list 'evaluators' field", origin)
            return []

        infos: list[EvaluatorInfo] = []
        for entry in entries:
            if not isinstance(entry, dict):
                logger.warning("Skipping malformed evaluator entry in %s: %r", origin, entry)
                continue
            infos.append(
                EvaluatorInfo(
                    name=entry.get("name", "unknown"),
                    description=entry.get("description", ""),
                    source=self.source_name,
                    language=entry.get("language"),
                    ref=entry.get("path"),
                    tags=entry.get("tags") or [],
                    author=entry.get("author"),
                    last_updated=entry.get("lastUpdated"),
                )
            )
        return infos

    async def list_evaluators(self) -> list[EvaluatorInfo]:
        """Download and parse the repository index.

        Returns an empty list (after logging a warning) when the index cannot
        be downloaded or is not usable.
        """
        import httpx

        url = self._raw_url(self._index_path)
        logger.debug("Fetching evaluator index from %s", url)

        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(url, headers=self._headers(), timeout=15)
                resp.raise_for_status()
        except httpx.HTTPError as exc:
            logger.warning("Failed to fetch evaluator index from %s: %s", url, exc)
            return []

        return self._parse_index(resp.text, url)

    async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
        """Download the file at repository path *ref* and store it at *dest*.

        HTTP failures propagate to the caller. The content is written to a
        temporary sibling file and then moved into place, so an interrupted
        write never leaves a truncated file at *dest*.

        Returns:
            *dest*.
        """
        import httpx

        url = self._raw_url(ref)
        logger.info("Downloading evaluator from %s", url)

        async with httpx.AsyncClient() as client:
            resp = await client.get(url, headers=self._headers(), timeout=30)
            resp.raise_for_status()

        dest.parent.mkdir(parents=True, exist_ok=True)
        partial = dest.with_name(dest.name + ".part")
        try:
            partial.write_text(resp.text, encoding="utf-8")  # noqa: ASYNC240
            os.replace(partial, dest)
        finally:
            # Only present if the write or the move failed.
            if partial.exists():
                partial.unlink()
        return dest
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
class FileEvaluatorSource(EvaluatorSource):
    """Reads evaluators from a local index.yaml file (same schema as GitHubEvaluatorSource).

    Useful for testing and local development. Not registered in the default
    source list — instantiate directly when needed.
    """

    def __init__(self, path: Path):
        """Remember the index file location.

        Raises:
            FileNotFoundError: if *path* does not exist.
        """
        self._path = Path(path)
        if not self._path.exists():
            raise FileNotFoundError(f"Evaluator index file not found: {self._path}")

    @property
    def source_name(self) -> str:
        return "file"

    async def list_evaluators(self) -> list[EvaluatorInfo]:
        """Parse the index file and return its evaluator entries.

        A top-level value that is not a mapping yields an empty list with a
        warning; entries that are not mappings are skipped with a warning.
        """
        data = yaml.safe_load(self._path.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            logger.warning("Evaluator index at %s is not a valid YAML mapping", self._path)
            return []

        # ``evaluators: null`` parses to None, so ``.get(..., [])`` is not enough.
        entries = data.get("evaluators") or []
        if not isinstance(entries, list):
            logger.warning("Evaluator index at %s has a non-list 'evaluators' field", self._path)
            return []

        infos: list[EvaluatorInfo] = []
        for entry in entries:
            if not isinstance(entry, dict):
                logger.warning("Skipping malformed evaluator entry in %s: %r", self._path, entry)
                continue
            infos.append(
                EvaluatorInfo(
                    name=entry.get("name", "unknown"),
                    description=entry.get("description", ""),
                    source=self.source_name,
                    language=entry.get("language"),
                    ref=entry.get("path"),
                    tags=entry.get("tags") or [],
                    author=entry.get("author"),
                    last_updated=entry.get("lastUpdated"),
                )
            )
        return infos

    async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
        """Copy the evaluator at *ref* (relative to the index file's directory) to *dest*.

        Raises:
            FileNotFoundError: if *ref* does not point at an existing regular file.

        Returns:
            *dest*.
        """
        src = (self._path.parent / ref).resolve()
        # ``is_file`` rather than ``exists``: a directory cannot be copied as an
        # evaluator and would otherwise fail later with an unrelated error.
        if not src.is_file():
            raise FileNotFoundError(f"Evaluator file not found: {src} (ref: {ref}, index: {self._path})")
        import shutil

        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dest)
        return dest
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
# ---------------------------------------------------------------------------
|
|
274
|
+
# Source registry
|
|
275
|
+
# ---------------------------------------------------------------------------
|
|
276
|
+
|
|
277
|
+
_SOURCES: list[EvaluatorSource] | None = None
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def get_sources() -> list[EvaluatorSource]:
    """Return the shared list of evaluator sources, creating it on first use.

    The initial list holds a ``BuiltinEvaluatorSource`` followed by a
    ``GitHubEvaluatorSource``; the same list object is returned on every call.
    """
    global _SOURCES
    if _SOURCES is not None:
        return _SOURCES

    _SOURCES = [BuiltinEvaluatorSource(), GitHubEvaluatorSource()]
    return _SOURCES
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def register_source(source: EvaluatorSource) -> None:
    """Append *source* to the shared source list returned by ``get_sources()``."""
    registry = get_sources()
    registry.append(source)
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""Evaluator scaffolding templates and the scaffold_evaluator function."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from string import Template
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
|
|
10
|
+
PYTHON_TEMPLATE = Template('''\
|
|
11
|
+
"""Custom evaluator: ${name}
|
|
12
|
+
|
|
13
|
+
Usage in eval_config.yaml:
|
|
14
|
+
|
|
15
|
+
evaluators:
|
|
16
|
+
- name: ${name}
|
|
17
|
+
type: code
|
|
18
|
+
path: ./${name}/${name}.py
|
|
19
|
+
threshold: 0.5
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from agentevals_evaluator_sdk import evaluator, EvalInput, EvalResult
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@evaluator
|
|
26
|
+
def ${name}(input: EvalInput) -> EvalResult:
|
|
27
|
+
scores: list[float] = []
|
|
28
|
+
|
|
29
|
+
for inv in input.invocations:
|
|
30
|
+
score = 1.0
|
|
31
|
+
|
|
32
|
+
if not inv.final_response:
|
|
33
|
+
score = 0.0
|
|
34
|
+
scores.append(score)
|
|
35
|
+
continue
|
|
36
|
+
|
|
37
|
+
# TODO: implement your scoring logic here
|
|
38
|
+
|
|
39
|
+
scores.append(max(0.0, score))
|
|
40
|
+
|
|
41
|
+
overall = sum(scores) / len(scores) if scores else 0.0
|
|
42
|
+
return EvalResult(
|
|
43
|
+
score=overall,
|
|
44
|
+
per_invocation_scores=scores,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
if __name__ == "__main__":
|
|
49
|
+
${name}.run()
|
|
50
|
+
''')
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
JAVASCRIPT_TEMPLATE = Template("""\
|
|
54
|
+
/**
|
|
55
|
+
* Custom evaluator: ${name}
|
|
56
|
+
*
|
|
57
|
+
* Usage in eval_config.yaml:
|
|
58
|
+
*
|
|
59
|
+
* evaluators:
|
|
60
|
+
* - name: ${name}
|
|
61
|
+
* type: code
|
|
62
|
+
* path: ./${name}/${name}.js
|
|
63
|
+
* threshold: 0.5
|
|
64
|
+
*/
|
|
65
|
+
|
|
66
|
+
const input = JSON.parse(require("fs").readFileSync("/dev/stdin", "utf8"));
|
|
67
|
+
|
|
68
|
+
const scores = [];
|
|
69
|
+
|
|
70
|
+
for (const inv of input.invocations) {
|
|
71
|
+
let score = 1.0;
|
|
72
|
+
|
|
73
|
+
if (!inv.final_response) {
|
|
74
|
+
scores.push(0.0);
|
|
75
|
+
continue;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
// TODO: implement your scoring logic here
|
|
79
|
+
|
|
80
|
+
scores.push(Math.max(0.0, score));
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
const overall = scores.length > 0
|
|
84
|
+
? scores.reduce((a, b) => a + b, 0) / scores.length
|
|
85
|
+
: 0.0;
|
|
86
|
+
|
|
87
|
+
console.log(JSON.stringify({
|
|
88
|
+
score: overall,
|
|
89
|
+
per_invocation_scores: scores,
|
|
90
|
+
}));
|
|
91
|
+
""")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
TYPESCRIPT_TEMPLATE = Template("""\
|
|
95
|
+
/**
|
|
96
|
+
* Custom evaluator: ${name}
|
|
97
|
+
*
|
|
98
|
+
* Usage in eval_config.yaml:
|
|
99
|
+
*
|
|
100
|
+
* evaluators:
|
|
101
|
+
* - name: ${name}
|
|
102
|
+
* type: code
|
|
103
|
+
* path: ./${name}/${name}.ts
|
|
104
|
+
* threshold: 0.5
|
|
105
|
+
*/
|
|
106
|
+
|
|
107
|
+
import * as fs from "fs";
|
|
108
|
+
|
|
109
|
+
interface IntermediateSteps {
|
|
110
|
+
tool_calls: { name: string; args: Record<string, unknown> }[];
|
|
111
|
+
tool_responses: { name: string; output: string }[];
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
interface Invocation {
|
|
115
|
+
invocation_id: string;
|
|
116
|
+
user_content: string;
|
|
117
|
+
final_response: string | null;
|
|
118
|
+
intermediate_steps: IntermediateSteps;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
interface EvalInput {
|
|
122
|
+
protocol_version: string;
|
|
123
|
+
metric_name: string;
|
|
124
|
+
threshold: number;
|
|
125
|
+
config: Record<string, unknown>;
|
|
126
|
+
invocations: Invocation[];
|
|
127
|
+
expected_invocations: Invocation[] | null;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
const input: EvalInput = JSON.parse(fs.readFileSync("/dev/stdin", "utf8"));
|
|
131
|
+
|
|
132
|
+
const scores: number[] = [];
|
|
133
|
+
|
|
134
|
+
for (const inv of input.invocations) {
|
|
135
|
+
let score = 1.0;
|
|
136
|
+
|
|
137
|
+
if (!inv.final_response) {
|
|
138
|
+
scores.push(0.0);
|
|
139
|
+
continue;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// TODO: implement your scoring logic here
|
|
143
|
+
|
|
144
|
+
scores.push(Math.max(0.0, score));
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
const overall = scores.length > 0
|
|
148
|
+
? scores.reduce((a, b) => a + b, 0) / scores.length
|
|
149
|
+
: 0.0;
|
|
150
|
+
|
|
151
|
+
console.log(JSON.stringify({
|
|
152
|
+
score: overall,
|
|
153
|
+
per_invocation_scores: scores,
|
|
154
|
+
}));
|
|
155
|
+
""")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
_EXTENSION_TO_TEMPLATE: dict[str, Template] = {
|
|
159
|
+
".py": PYTHON_TEMPLATE,
|
|
160
|
+
".js": JAVASCRIPT_TEMPLATE,
|
|
161
|
+
".ts": TYPESCRIPT_TEMPLATE,
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
_RUNTIME_ALIAS_TO_EXT: dict[str, str] = {
|
|
165
|
+
"py": ".py",
|
|
166
|
+
"python": ".py",
|
|
167
|
+
"js": ".js",
|
|
168
|
+
"javascript": ".js",
|
|
169
|
+
"ts": ".ts",
|
|
170
|
+
"typescript": ".ts",
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
_EXT_TO_LANGUAGE: dict[str, str] = {
|
|
174
|
+
".py": "python",
|
|
175
|
+
".js": "javascript",
|
|
176
|
+
".ts": "typescript",
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def scaffold_evaluator(
    name: str,
    output_dir: Path | None = None,
    runtime: str | None = None,
) -> Path:
    """Create a new evaluator directory with code file and evaluator.yaml manifest.

    The language is chosen from, in order: a recognised file extension on
    *name* (``.py``, ``.js``, ``.ts``), the *runtime* alias, and finally
    Python as the default. The evaluator name is the stem of *name*.

    Args:
        name: Evaluator name, optionally with a file extension.
        output_dir: Parent directory for the new evaluator directory;
            defaults to the current working directory.
        runtime: Language alias used when *name* carries no recognised extension.

    Raises:
        ValueError: if *runtime* is not a known alias, if *name* has an empty
            stem, or if a Python evaluator name is not a valid identifier.

    Returns the path to the created directory.
    """
    import keyword

    output_dir = output_dir or Path.cwd()

    raw_path = Path(name)
    suffix = raw_path.suffix.lower()
    evaluator_name = raw_path.stem

    if suffix and suffix in _EXTENSION_TO_TEMPLATE:
        ext = suffix
    elif runtime:
        ext = _RUNTIME_ALIAS_TO_EXT.get(runtime.lower())
        if ext is None:
            raise ValueError(f"Unknown runtime '{runtime}'. Supported: {sorted(_RUNTIME_ALIAS_TO_EXT.keys())}")
    else:
        ext = ".py"

    # Validate before touching the filesystem. An empty stem would write the
    # files straight into ``output_dir``.
    if not evaluator_name:
        raise ValueError("Evaluator name must not be empty")
    # The Python template uses the name as a function name, so anything that
    # is not an identifier would generate a file that cannot be imported.
    if ext == ".py" and (not evaluator_name.isidentifier() or keyword.iskeyword(evaluator_name)):
        raise ValueError(
            f"Evaluator name '{evaluator_name}' is not a valid Python identifier; "
            "use letters, digits and underscores only"
        )

    template = _EXTENSION_TO_TEMPLATE[ext]
    language = _EXT_TO_LANGUAGE[ext]

    evaluator_dir = output_dir / evaluator_name
    evaluator_dir.mkdir(parents=True, exist_ok=True)

    code_file = evaluator_dir / f"{evaluator_name}{ext}"
    code_file.write_text(template.substitute(name=evaluator_name), encoding="utf-8")

    manifest = {
        "name": evaluator_name,
        "description": f"TODO: describe what {evaluator_name} evaluates",
        "language": language,
        "entrypoint": f"{evaluator_name}{ext}",
        "tags": [],
        "author": "",
    }
    manifest_file = evaluator_dir / "evaluator.yaml"
    manifest_file.write_text(yaml.dump(manifest, sort_keys=False), encoding="utf-8")

    return evaluator_dir
|