agentevals-cli 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. agentevals/__init__.py +16 -0
  2. agentevals/_protocol.py +83 -0
  3. agentevals/api/__init__.py +0 -0
  4. agentevals/api/app.py +137 -0
  5. agentevals/api/debug_routes.py +268 -0
  6. agentevals/api/models.py +204 -0
  7. agentevals/api/otlp_app.py +25 -0
  8. agentevals/api/otlp_routes.py +383 -0
  9. agentevals/api/routes.py +554 -0
  10. agentevals/api/streaming_routes.py +373 -0
  11. agentevals/builtin_metrics.py +234 -0
  12. agentevals/cli.py +643 -0
  13. agentevals/config.py +108 -0
  14. agentevals/converter.py +328 -0
  15. agentevals/custom_evaluators.py +468 -0
  16. agentevals/eval_config_loader.py +147 -0
  17. agentevals/evaluator/__init__.py +24 -0
  18. agentevals/evaluator/resolver.py +70 -0
  19. agentevals/evaluator/sources.py +293 -0
  20. agentevals/evaluator/templates.py +224 -0
  21. agentevals/extraction.py +444 -0
  22. agentevals/genai_converter.py +538 -0
  23. agentevals/loader/__init__.py +7 -0
  24. agentevals/loader/base.py +53 -0
  25. agentevals/loader/jaeger.py +112 -0
  26. agentevals/loader/otlp.py +193 -0
  27. agentevals/mcp_server.py +236 -0
  28. agentevals/output.py +204 -0
  29. agentevals/runner.py +310 -0
  30. agentevals/sdk.py +433 -0
  31. agentevals/streaming/__init__.py +120 -0
  32. agentevals/streaming/incremental_processor.py +337 -0
  33. agentevals/streaming/processor.py +285 -0
  34. agentevals/streaming/session.py +36 -0
  35. agentevals/streaming/ws_server.py +806 -0
  36. agentevals/trace_attrs.py +32 -0
  37. agentevals/trace_metrics.py +126 -0
  38. agentevals/utils/__init__.py +0 -0
  39. agentevals/utils/genai_messages.py +142 -0
  40. agentevals/utils/log_buffer.py +43 -0
  41. agentevals/utils/log_enrichment.py +187 -0
  42. agentevals_cli-0.5.2.dist-info/METADATA +22 -0
  43. agentevals_cli-0.5.2.dist-info/RECORD +46 -0
  44. agentevals_cli-0.5.2.dist-info/WHEEL +4 -0
  45. agentevals_cli-0.5.2.dist-info/entry_points.txt +2 -0
  46. agentevals_cli-0.5.2.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,70 @@
1
+ """Resolve remote evaluator references to local cached files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+
8
+ from .sources import EvaluatorSource, get_sources
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ _DEFAULT_CACHE_DIR = Path.home() / ".cache" / "agentevals" / "evaluators"
13
+
14
+
15
class EvaluatorResolver:
    """Downloads and caches remote evaluators, converting them to local paths.

    Sources are registered under their ``source_name``. ``resolve`` looks the
    source up by the definition's ``source`` field, downloads the evaluator
    into ``<cache_dir>/<source>/<ref>`` if it is not already there, and
    returns a ``CodeEvaluatorDef`` that points at the cached file.
    """

    def __init__(self, cache_dir: Path | None = None):
        """Create a resolver that caches under *cache_dir* (or the module default)."""
        self._cache_dir = cache_dir or _DEFAULT_CACHE_DIR
        self._sources: dict[str, EvaluatorSource] = {}

    def register_source(self, source: EvaluatorSource) -> None:
        """Register *source* under its ``source_name``, replacing any previous entry."""
        self._sources[source.source_name] = source

    def _cache_path(self, source_name: str, ref: str) -> Path:
        """Return the cache location for *ref*, refusing anything outside the cache.

        ``ref`` comes from configuration, so ``..`` segments or an absolute
        path would otherwise let a download be written (or an arbitrary
        existing file be treated as "cached") outside the per-source cache
        directory.

        Raises:
            ValueError: if *ref* does not name a file strictly inside
                ``<cache_dir>/<source_name>``.
        """
        source_root = self._cache_dir / source_name
        dest = source_root / ref

        resolved_root = source_root.resolve()
        resolved_dest = dest.resolve()
        try:
            resolved_dest.relative_to(resolved_root)
        except ValueError:
            raise ValueError(
                f"Evaluator ref '{ref}' resolves outside the cache directory {resolved_root}"
            ) from None
        # An empty or "." ref would resolve to the directory itself.
        if resolved_dest == resolved_root:
            raise ValueError(f"Evaluator ref '{ref}' does not name a file")
        return dest

    async def resolve(self, evaluator_def) -> "CodeEvaluatorDef":  # noqa: F821
        """Download a remote evaluator and return a CodeEvaluatorDef pointing to the cached file.

        Raises:
            TypeError: if *evaluator_def* is not a ``RemoteEvaluatorDef``.
            ValueError: if the source is not registered, or the ref would
                escape the cache directory.
        """
        from ..config import CodeEvaluatorDef, RemoteEvaluatorDef

        if not isinstance(evaluator_def, RemoteEvaluatorDef):
            raise TypeError(f"Expected RemoteEvaluatorDef, got {type(evaluator_def).__name__}")

        source = self._sources.get(evaluator_def.source)
        if source is None:
            raise ValueError(
                f"Unknown evaluator source '{evaluator_def.source}'. Available: {sorted(self._sources.keys())}"
            )

        dest = self._cache_path(evaluator_def.source, evaluator_def.ref)
        if not dest.exists():
            logger.info(
                "Downloading evaluator '%s' from %s (ref: %s)",
                evaluator_def.name,
                evaluator_def.source,
                evaluator_def.ref,
            )
            await source.fetch_evaluator(evaluator_def.ref, dest)
        else:
            logger.debug("Using cached evaluator '%s' at %s", evaluator_def.name, dest)

        return CodeEvaluatorDef(
            name=evaluator_def.name,
            path=str(dest),
            threshold=evaluator_def.threshold,
            timeout=evaluator_def.timeout,
            config=evaluator_def.config,
            executor=evaluator_def.executor,
        )
58
+
59
+
60
+ _default_resolver: EvaluatorResolver | None = None
61
+
62
+
63
def get_default_resolver() -> EvaluatorResolver:
    """Return the process-wide resolver, building it on first use.

    The first call creates an ``EvaluatorResolver`` with the default cache
    directory and registers every source returned by ``get_sources()``;
    later calls return that same instance.
    """
    global _default_resolver
    if _default_resolver is not None:
        return _default_resolver

    _default_resolver = resolver = EvaluatorResolver()
    for src in get_sources():
        resolver.register_source(src)
    return resolver
@@ -0,0 +1,293 @@
1
+ """Evaluator source backends: discover and fetch evaluators from various registries."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import abc
6
+ import json
7
+ import logging
8
+ import os
9
+ import time
10
+ from dataclasses import asdict, dataclass, field
11
+ from pathlib import Path
12
+
13
+ import yaml
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ _DEFAULT_REPO = "agentevals-dev/evaluators"
18
+ _DEFAULT_BRANCH = "main"
19
+ _DEFAULT_INDEX = "index.yaml"
20
+
21
+
22
@dataclass
class EvaluatorInfo:
    """Source-agnostic description of a single evaluator.

    Only ``name``, ``description`` and ``source`` are required; the
    remaining fields default to ``None`` (or an empty list for ``tags``).
    """

    # Required fields.
    name: str
    description: str
    source: str  # sources in this module set this to their ``source_name``

    # Optional metadata.
    language: str | None = None
    ref: str | None = None  # index ``path`` entry for the GitHub and file sources
    tags: list[str] = field(default_factory=list)  # fresh list per instance
    author: str | None = None
    last_updated: str | None = None
34
+
35
+
36
class EvaluatorSource(abc.ABC):
    """Abstract registry backend able to enumerate and download evaluators.

    Concrete sources must provide ``source_name``, ``list_evaluators`` and
    ``fetch_evaluator``.
    """

    @property
    @abc.abstractmethod
    def source_name(self) -> str:
        """Short identifier under which this source is registered."""

    @abc.abstractmethod
    async def list_evaluators(self) -> list[EvaluatorInfo]:
        """Return metadata for the evaluators this source offers."""

    @abc.abstractmethod
    async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
        """Download the evaluator identified by *ref* and write it to *dest*.

        Returns the path to the downloaded file.
        """
52
+
53
+
54
+ _CACHE_TTL_SECONDS = 86400 # 24 hours
55
+
56
+
57
def _cache_dir() -> Path:
    """Return the agentevals cache directory, creating it if necessary.

    Uses ``$XDG_CACHE_HOME/agentevals`` when ``XDG_CACHE_HOME`` is set to an
    absolute path, otherwise ``~/.cache/agentevals``. The XDG Base Directory
    specification says an empty or relative value must be treated as unset;
    honouring it literally would put the cache under the current working
    directory.
    """
    xdg_home = os.environ.get("XDG_CACHE_HOME", "")
    if xdg_home and os.path.isabs(xdg_home):
        base = Path(xdg_home)
    else:
        base = Path.home() / ".cache"
    d = base / "agentevals"
    d.mkdir(parents=True, exist_ok=True)
    return d
62
+
63
+
64
def _read_cache(key: str, ttl: int = _CACHE_TTL_SECONDS) -> list[EvaluatorInfo] | None:
    """Load the cached evaluator list stored under *key*.

    Returns ``None`` when the cache file is missing, older than *ttl*
    seconds, or cannot be read or parsed for any reason.
    """
    path = _cache_dir() / f"{key}.json"
    if path.exists():
        try:
            payload = json.loads(path.read_text())
            age = time.time() - payload.get("ts", 0)
            if age > ttl:
                return None
            return [EvaluatorInfo(**record) for record in payload["evaluators"]]
        except Exception:
            # Best-effort cache: unreadable or malformed data counts as a miss.
            pass
    return None
75
+
76
+
77
def _write_cache(key: str, evaluators: list[EvaluatorInfo]) -> None:
    """Persist *evaluators* under *key* together with the current timestamp.

    Caching is best-effort: any failure (unwritable directory, value that
    cannot be serialised, ...) is logged at debug level and otherwise
    ignored so that listing evaluators never fails because of the cache.
    """
    try:
        cache_file = _cache_dir() / f"{key}.json"
        payload = {
            "ts": time.time(),
            "evaluators": [asdict(g) for g in evaluators],
        }
        cache_file.write_text(json.dumps(payload))
    except Exception:
        # Previously swallowed silently; keep it non-fatal but leave a trace.
        logger.debug("Could not write evaluator cache '%s'", key, exc_info=True)
90
+
91
+
92
class BuiltinEvaluatorSource(EvaluatorSource):
    """Exposes ADK's built-in metric registry through the evaluator source interface."""

    @property
    def source_name(self) -> str:
        """Name under which this source is registered."""
        return "builtin"

    async def list_evaluators(self) -> list[EvaluatorInfo]:
        """Return the built-in metrics, using the on-disk cache when it is fresh.

        On a cache miss the ADK registry is loaded in a worker thread and the
        result is written back to the cache before being returned.
        """
        import asyncio
        import warnings

        from_cache = _read_cache("builtin")
        if from_cache is not None:
            return from_cache

        def _load() -> list[EvaluatorInfo]:
            collected: list[EvaluatorInfo] = []
            try:
                # Silence warnings emitted while importing the registry.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    from google.adk.evaluation.metric_evaluator_registry import (
                        DEFAULT_METRIC_EVALUATOR_REGISTRY,
                    )

                for metric in DEFAULT_METRIC_EVALUATOR_REGISTRY.get_registered_metrics():
                    info = EvaluatorInfo(
                        name=metric.metric_name,
                        description=metric.description or "No description",
                        source=self.source_name,
                        language=None,
                        ref=None,
                    )
                    collected.append(info)
            except ImportError:
                # Registry not importable: fall back to the bare metric names.
                from google.adk.evaluation.eval_metrics import PrebuiltMetrics

                for prebuilt in PrebuiltMetrics:
                    info = EvaluatorInfo(
                        name=prebuilt.value,
                        description="(install google-adk[eval] for full details)",
                        source=self.source_name,
                    )
                    collected.append(info)
            return collected

        loaded = await asyncio.to_thread(_load)
        _write_cache("builtin", loaded)
        return loaded

    async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
        """Always raises: built-in evaluators are not distributed as files."""
        raise NotImplementedError("Built-in evaluators are part of ADK and cannot be fetched as files.")
145
+
146
+
147
class GitHubEvaluatorSource(EvaluatorSource):
    """Fetches evaluators from a GitHub repository with a CI-generated index.yaml.

    Repository and branch fall back to the ``AGENTEVALS_EVALUATOR_REPO`` and
    ``AGENTEVALS_EVALUATOR_BRANCH`` environment variables and then to the
    module defaults. The token falls back to ``AGENTEVALS_GITHUB_TOKEN`` and
    then ``GITHUB_TOKEN``; without one, requests are sent unauthenticated.
    """

    def __init__(
        self,
        repo: str | None = None,
        branch: str | None = None,
        index_path: str | None = None,
        token: str | None = None,
    ):
        self._repo = repo or os.environ.get("AGENTEVALS_EVALUATOR_REPO", _DEFAULT_REPO)
        self._branch = branch or os.environ.get("AGENTEVALS_EVALUATOR_BRANCH", _DEFAULT_BRANCH)
        self._index_path = index_path or _DEFAULT_INDEX
        self._token = token or os.environ.get("AGENTEVALS_GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")

    @property
    def source_name(self) -> str:
        """Name under which this source is registered."""
        return "github"

    def _raw_url(self, path: str) -> str:
        """Build the raw.githubusercontent.com URL for *path* in the configured repo/branch."""
        return f"https://raw.githubusercontent.com/{self._repo}/{self._branch}/{path}"

    def _headers(self) -> dict[str, str]:
        """Return the authorization header when a token is configured, else no headers."""
        if self._token:
            return {"Authorization": f"token {self._token}"}
        return {}

    async def list_evaluators(self) -> list[EvaluatorInfo]:
        """Download and parse the repository index.

        Returns an empty list (after logging a warning) when the index cannot
        be fetched, is not valid YAML, or does not have the expected shape.
        Individual entries that are not mappings are skipped.
        """
        import httpx

        url = self._raw_url(self._index_path)
        logger.debug("Fetching evaluator index from %s", url)

        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(url, headers=self._headers(), timeout=15)
                resp.raise_for_status()
        except httpx.HTTPError as exc:
            logger.warning("Failed to fetch evaluator index from %s: %s", url, exc)
            return []

        # A malformed index is handled like an unreachable one instead of
        # letting the parser error escape.
        try:
            data = yaml.safe_load(resp.text)
        except yaml.YAMLError as exc:
            logger.warning("Evaluator index at %s is not valid YAML: %s", url, exc)
            return []
        if not isinstance(data, dict):
            logger.warning("Evaluator index at %s is not a valid YAML mapping", url)
            return []

        # ``evaluators:`` with no value parses as None; treat it as empty.
        entries = data.get("evaluators") or []
        if not isinstance(entries, list):
            logger.warning("Evaluator index at %s has a non-list 'evaluators' section", url)
            return []

        infos: list[EvaluatorInfo] = []
        for entry in entries:
            if not isinstance(entry, dict):
                logger.warning("Skipping malformed evaluator entry in %s: %r", url, entry)
                continue
            infos.append(
                EvaluatorInfo(
                    name=entry.get("name", "unknown"),
                    description=entry.get("description", ""),
                    source=self.source_name,
                    language=entry.get("language"),
                    ref=entry.get("path"),
                    # ``tags:`` with no value parses as None.
                    tags=entry.get("tags") or [],
                    author=entry.get("author"),
                    last_updated=entry.get("lastUpdated"),
                )
            )
        return infos

    async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
        """Download the file at repository path *ref* and write it to *dest*.

        Parent directories of *dest* are created as needed. HTTP errors
        raised by ``httpx`` propagate to the caller.

        Returns:
            *dest*.
        """
        import httpx

        url = self._raw_url(ref)
        logger.info("Downloading evaluator from %s", url)

        async with httpx.AsyncClient() as client:
            resp = await client.get(url, headers=self._headers(), timeout=30)
            resp.raise_for_status()

        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(resp.text, encoding="utf-8")  # noqa: ASYNC240
        return dest
222
+
223
+
224
class FileEvaluatorSource(EvaluatorSource):
    """Reads evaluators from a local index.yaml file (same schema as GitHubEvaluatorSource).

    Useful for testing and local development. Not registered in the default
    source list — instantiate directly when needed.
    """

    def __init__(self, path: Path):
        """Remember the index location.

        Raises:
            FileNotFoundError: if *path* does not exist.
        """
        self._path = Path(path)
        if not self._path.exists():
            raise FileNotFoundError(f"Evaluator index file not found: {self._path}")

    @property
    def source_name(self) -> str:
        """Name under which this source is registered."""
        return "file"

    async def list_evaluators(self) -> list[EvaluatorInfo]:
        """Parse the local index file.

        Returns an empty list (after logging a warning) when the file is not
        valid YAML or does not have the expected shape. Individual entries
        that are not mappings are skipped.
        """
        try:
            data = yaml.safe_load(self._path.read_text(encoding="utf-8"))
        except yaml.YAMLError as exc:
            logger.warning("Evaluator index at %s is not valid YAML: %s", self._path, exc)
            return []
        if not isinstance(data, dict):
            logger.warning("Evaluator index at %s is not a valid YAML mapping", self._path)
            return []

        # ``evaluators:`` with no value parses as None; treat it as empty.
        entries = data.get("evaluators") or []
        if not isinstance(entries, list):
            logger.warning("Evaluator index at %s has a non-list 'evaluators' section", self._path)
            return []

        infos: list[EvaluatorInfo] = []
        for entry in entries:
            if not isinstance(entry, dict):
                logger.warning("Skipping malformed evaluator entry in %s: %r", self._path, entry)
                continue
            infos.append(
                EvaluatorInfo(
                    name=entry.get("name", "unknown"),
                    description=entry.get("description", ""),
                    source=self.source_name,
                    language=entry.get("language"),
                    ref=entry.get("path"),
                    # ``tags:`` with no value parses as None.
                    tags=entry.get("tags") or [],
                    author=entry.get("author"),
                    last_updated=entry.get("lastUpdated"),
                )
            )
        return infos

    async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
        """Copy the evaluator at *ref* (relative to the index's directory) to *dest*.

        Parent directories of *dest* are created as needed.

        Returns:
            *dest*.

        Raises:
            FileNotFoundError: if the referenced file does not exist.
        """
        src = (self._path.parent / ref).resolve()
        if not src.exists():
            raise FileNotFoundError(f"Evaluator file not found: {src} (ref: {ref}, index: {self._path})")
        import shutil

        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dest)
        return dest
271
+
272
+
273
+ # ---------------------------------------------------------------------------
274
+ # Source registry
275
+ # ---------------------------------------------------------------------------
276
+
277
+ _SOURCES: list[EvaluatorSource] | None = None
278
+
279
+
280
def get_sources() -> list[EvaluatorSource]:
    """Return the shared list of evaluator sources, creating it on first call.

    The initial list holds one ``BuiltinEvaluatorSource`` and one
    ``GitHubEvaluatorSource``; the same list object is returned every time.
    """
    global _SOURCES
    if _SOURCES is not None:
        return _SOURCES

    _SOURCES = [BuiltinEvaluatorSource(), GitHubEvaluatorSource()]
    return _SOURCES
289
+
290
+
291
def register_source(source: EvaluatorSource) -> None:
    """Append *source* to the shared source list returned by ``get_sources()``."""
    registry = get_sources()
    registry.append(source)
@@ -0,0 +1,224 @@
1
+ """Evaluator scaffolding templates and the scaffold_evaluator function."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from string import Template
7
+
8
+ import yaml
9
+
10
+ PYTHON_TEMPLATE = Template('''\
11
+ """Custom evaluator: ${name}
12
+
13
+ Usage in eval_config.yaml:
14
+
15
+ evaluators:
16
+ - name: ${name}
17
+ type: code
18
+ path: ./${name}/${name}.py
19
+ threshold: 0.5
20
+ """
21
+
22
+ from agentevals_evaluator_sdk import evaluator, EvalInput, EvalResult
23
+
24
+
25
+ @evaluator
26
+ def ${name}(input: EvalInput) -> EvalResult:
27
+ scores: list[float] = []
28
+
29
+ for inv in input.invocations:
30
+ score = 1.0
31
+
32
+ if not inv.final_response:
33
+ score = 0.0
34
+ scores.append(score)
35
+ continue
36
+
37
+ # TODO: implement your scoring logic here
38
+
39
+ scores.append(max(0.0, score))
40
+
41
+ overall = sum(scores) / len(scores) if scores else 0.0
42
+ return EvalResult(
43
+ score=overall,
44
+ per_invocation_scores=scores,
45
+ )
46
+
47
+
48
+ if __name__ == "__main__":
49
+ ${name}.run()
50
+ ''')
51
+
52
+
53
+ JAVASCRIPT_TEMPLATE = Template("""\
54
+ /**
55
+ * Custom evaluator: ${name}
56
+ *
57
+ * Usage in eval_config.yaml:
58
+ *
59
+ * evaluators:
60
+ * - name: ${name}
61
+ * type: code
62
+ * path: ./${name}/${name}.js
63
+ * threshold: 0.5
64
+ */
65
+
66
+ const input = JSON.parse(require("fs").readFileSync("/dev/stdin", "utf8"));
67
+
68
+ const scores = [];
69
+
70
+ for (const inv of input.invocations) {
71
+ let score = 1.0;
72
+
73
+ if (!inv.final_response) {
74
+ scores.push(0.0);
75
+ continue;
76
+ }
77
+
78
+ // TODO: implement your scoring logic here
79
+
80
+ scores.push(Math.max(0.0, score));
81
+ }
82
+
83
+ const overall = scores.length > 0
84
+ ? scores.reduce((a, b) => a + b, 0) / scores.length
85
+ : 0.0;
86
+
87
+ console.log(JSON.stringify({
88
+ score: overall,
89
+ per_invocation_scores: scores,
90
+ }));
91
+ """)
92
+
93
+
94
+ TYPESCRIPT_TEMPLATE = Template("""\
95
+ /**
96
+ * Custom evaluator: ${name}
97
+ *
98
+ * Usage in eval_config.yaml:
99
+ *
100
+ * evaluators:
101
+ * - name: ${name}
102
+ * type: code
103
+ * path: ./${name}/${name}.ts
104
+ * threshold: 0.5
105
+ */
106
+
107
+ import * as fs from "fs";
108
+
109
+ interface IntermediateSteps {
110
+ tool_calls: { name: string; args: Record<string, unknown> }[];
111
+ tool_responses: { name: string; output: string }[];
112
+ }
113
+
114
+ interface Invocation {
115
+ invocation_id: string;
116
+ user_content: string;
117
+ final_response: string | null;
118
+ intermediate_steps: IntermediateSteps;
119
+ }
120
+
121
+ interface EvalInput {
122
+ protocol_version: string;
123
+ metric_name: string;
124
+ threshold: number;
125
+ config: Record<string, unknown>;
126
+ invocations: Invocation[];
127
+ expected_invocations: Invocation[] | null;
128
+ }
129
+
130
+ const input: EvalInput = JSON.parse(fs.readFileSync("/dev/stdin", "utf8"));
131
+
132
+ const scores: number[] = [];
133
+
134
+ for (const inv of input.invocations) {
135
+ let score = 1.0;
136
+
137
+ if (!inv.final_response) {
138
+ scores.push(0.0);
139
+ continue;
140
+ }
141
+
142
+ // TODO: implement your scoring logic here
143
+
144
+ scores.push(Math.max(0.0, score));
145
+ }
146
+
147
+ const overall = scores.length > 0
148
+ ? scores.reduce((a, b) => a + b, 0) / scores.length
149
+ : 0.0;
150
+
151
+ console.log(JSON.stringify({
152
+ score: overall,
153
+ per_invocation_scores: scores,
154
+ }));
155
+ """)
156
+
157
+
158
+ _EXTENSION_TO_TEMPLATE: dict[str, Template] = {
159
+ ".py": PYTHON_TEMPLATE,
160
+ ".js": JAVASCRIPT_TEMPLATE,
161
+ ".ts": TYPESCRIPT_TEMPLATE,
162
+ }
163
+
164
+ _RUNTIME_ALIAS_TO_EXT: dict[str, str] = {
165
+ "py": ".py",
166
+ "python": ".py",
167
+ "js": ".js",
168
+ "javascript": ".js",
169
+ "ts": ".ts",
170
+ "typescript": ".ts",
171
+ }
172
+
173
+ _EXT_TO_LANGUAGE: dict[str, str] = {
174
+ ".py": "python",
175
+ ".js": "javascript",
176
+ ".ts": "typescript",
177
+ }
178
+
179
+
180
def scaffold_evaluator(
    name: str,
    output_dir: Path | None = None,
    runtime: str | None = None,
) -> Path:
    """Create a new evaluator directory with code file and evaluator.yaml manifest.

    The language is taken from a recognised extension on *name* (``.py``,
    ``.js``, ``.ts``) if present, otherwise from *runtime*, otherwise Python.
    The evaluator name is *name* without its directory part and final suffix.

    Args:
        name: Evaluator name, optionally carrying a file extension.
        output_dir: Directory in which to create ``<name>/``; defaults to
            the current working directory.
        runtime: Language alias used when *name* has no recognised extension.

    Returns:
        The path to the created directory.

    Raises:
        ValueError: if the name is empty, if *runtime* is not a known alias,
            or if a Python evaluator is requested with a name that is not a
            valid Python identifier (the template uses it as a function name).
    """
    import keyword

    output_dir = output_dir or Path.cwd()

    raw_path = Path(name)
    suffix = raw_path.suffix.lower()
    evaluator_name = raw_path.stem

    # An empty name would write ".py" and evaluator.yaml straight into output_dir.
    if not evaluator_name:
        raise ValueError("Evaluator name must not be empty")

    if suffix and suffix in _EXTENSION_TO_TEMPLATE:
        ext = suffix
    elif runtime:
        ext = _RUNTIME_ALIAS_TO_EXT.get(runtime.lower())
        if ext is None:
            raise ValueError(f"Unknown runtime '{runtime}'. Supported: {sorted(_RUNTIME_ALIAS_TO_EXT.keys())}")
    else:
        ext = ".py"

    # The Python template emits ``def <name>(...)``; fail before writing
    # anything rather than produce a file that cannot be parsed.
    if ext == ".py" and (not evaluator_name.isidentifier() or keyword.iskeyword(evaluator_name)):
        raise ValueError(
            f"Evaluator name '{evaluator_name}' is not a valid Python identifier; "
            "use letters, digits and underscores only"
        )

    template = _EXTENSION_TO_TEMPLATE[ext]
    language = _EXT_TO_LANGUAGE[ext]

    evaluator_dir = output_dir / evaluator_name
    evaluator_dir.mkdir(parents=True, exist_ok=True)

    code_file = evaluator_dir / f"{evaluator_name}{ext}"
    code_file.write_text(template.substitute(name=evaluator_name), encoding="utf-8")

    manifest = {
        "name": evaluator_name,
        "description": f"TODO: describe what {evaluator_name} evaluates",
        "language": language,
        "entrypoint": f"{evaluator_name}{ext}",
        "tags": [],
        "author": "",
    }
    manifest_file = evaluator_dir / "evaluator.yaml"
    manifest_file.write_text(yaml.dump(manifest, sort_keys=False), encoding="utf-8")

    return evaluator_dir