bat-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- add/__init__.py +3 -0
- add/client.py +16 -0
- bat_cli-0.1.0.dist-info/METADATA +231 -0
- bat_cli-0.1.0.dist-info/RECORD +47 -0
- bat_cli-0.1.0.dist-info/WHEEL +5 -0
- bat_cli-0.1.0.dist-info/entry_points.txt +2 -0
- bat_cli-0.1.0.dist-info/top_level.txt +8 -0
- build/__init__.py +3 -0
- build/build.py +79 -0
- cli.py +260 -0
- create/__init__.py +3 -0
- create/agent.py +312 -0
- create/templates/agent/.dockerignore +3 -0
- create/templates/agent/.env.template +4 -0
- create/templates/agent/.python-version +1 -0
- create/templates/agent/Dockerfile +37 -0
- create/templates/agent/Makefile +34 -0
- create/templates/agent/README.md +1 -0
- create/templates/agent/__main__.py +2 -0
- create/templates/agent/agent.json.template +12 -0
- create/templates/agent/agent.spec +45 -0
- create/templates/agent/config.yaml +1 -0
- create/templates/agent/llm_client.py.template +36 -0
- create/templates/agent/pyproject.toml.template +9 -0
- create/templates/agent/src/__init__.py +0 -0
- create/templates/agent/src/graph.py +50 -0
- create/templates/agent/src/llm_clients/__init__.py +0 -0
- create/templates/agent/tests/__init__.py +0 -0
- eval/__init__.py +1 -0
- eval/commands.py +562 -0
- eval/engine/__init__.py +1 -0
- eval/engine/adapter.py +251 -0
- eval/engine/bench_runner.py +149 -0
- eval/engine/contracts.py +115 -0
- eval/engine/eval_config.py +294 -0
- eval/engine/evaluator.py +85 -0
- eval/engine/metrics/__init__.py +1 -0
- eval/engine/metrics/llm_evaluators.py +383 -0
- eval/engine/metrics/metrics.py +135 -0
- eval/engine/metrics/qualitative_helpers.py +64 -0
- eval/engine/orchestrator.py +157 -0
- eval/engine/plotter.py +347 -0
- image_defaults.py +80 -0
- push/__init__.py +3 -0
- push/push.py +58 -0
- set/__init__.py +3 -0
- set/env.py +50 -0
eval/commands.py
ADDED
|
@@ -0,0 +1,562 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import contextlib
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
import socket
|
|
9
|
+
import subprocess
|
|
10
|
+
import time
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Iterator, Mapping
|
|
13
|
+
from urllib.parse import urlparse
|
|
14
|
+
|
|
15
|
+
import typer
|
|
16
|
+
from dotenv import dotenv_values
|
|
17
|
+
|
|
18
|
+
from .engine.contracts import JudgeSpec
|
|
19
|
+
from .engine.orchestrator import run_evaluation
|
|
20
|
+
from .engine.eval_config import default_eval_yaml, default_tasks_json, load_eval_config
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Lookup table: provider name -> the environment variable its SDK reads for
# the API key. Keys are lower-case; _inject_judge_api_key lower-cases the
# judge provider before looking it up and treats an absent provider as one
# that needs no key.
_PROVIDER_API_KEY_ENV: dict[str, str] = dict(
    openai="OPENAI_API_KEY",
    anthropic="ANTHROPIC_API_KEY",
    azure="AZURE_OPENAI_API_KEY",
    cohere="COHERE_API_KEY",
    mistral="MISTRAL_API_KEY",
    groq="GROQ_API_KEY",
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _inject_judge_api_key(judge: JudgeSpec, agent_root: Path, env: dict[str, str]) -> None:
    """Resolve the judge's API key and inject it into ``env`` (mutated in place).

    The key is stored under the provider's standard variable name taken from
    ``_PROVIDER_API_KEY_ENV``. Providers that are not in that table are
    treated as needing no key and ``env`` is left untouched.

    If judge.api_key_env is set, the agent's .env file is the ONLY source: the CLI
    reads that variable name from <agent_root>/.env and uses it. Nothing else is
    consulted (not the shell, not other files). A missing file or a missing/empty
    variable only produces a warning on stderr; nothing is injected.

    If judge.api_key_env is NOT set, fall back to:
      1. Key already present in env (exported in the shell)
      2. Key found in the agent's .env file under the provider's standard name

    Args:
        judge: Judge specification; ``provider`` and ``api_key_env`` are read.
        agent_root: Directory expected to contain the agent's ``.env`` file.
        env: Environment mapping that receives the resolved key.
    """
    api_key_var = _PROVIDER_API_KEY_ENV.get(judge.provider.lower())
    if api_key_var is None:
        return  # local/no-key provider (e.g. ollama)

    agent_env_file = agent_root / ".env"

    if judge.api_key_env:
        if not agent_env_file.exists():
            typer.secho(
                f"Warning: judge.api_key_env='{judge.api_key_env}' was set but no .env file "
                f"exists at {agent_env_file}; the judge will likely fail when called.",
                fg=typer.colors.YELLOW,
                err=True,
            )
            return
        raw_value = dotenv_values(agent_env_file).get(judge.api_key_env)
        value = (raw_value or "").strip()
        if not value:
            typer.secho(
                f"Warning: judge.api_key_env='{judge.api_key_env}' was set but the variable is "
                f"missing or empty in {agent_env_file}; the judge will likely fail when called.",
                fg=typer.colors.YELLOW,
                err=True,
            )
            return
        env[api_key_var] = value
        return

    if api_key_var in env:
        return  # already available from the shell

    if agent_env_file.exists():
        fallback_value = dotenv_values(agent_env_file).get(api_key_var)
        # dotenv_values maps a key declared without "=value" to None; copying
        # that into env would later fail when the mapping is applied to
        # os.environ, which only accepts strings.
        if fallback_value is not None:
            env[api_key_var] = fallback_value
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Environment-variable name: a letter or underscore followed by letters,
# digits, or underscores. Used with fullmatch() to validate override keys.
_ENV_VAR_PATTERN = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
# A value that is nothing but a single "$NAME" reference; group 1 is NAME.
_SIMPLE_ENV_REF = re.compile(r"^\$([A-Za-z_][A-Za-z0-9_]*)$")
# A "${NAME}" reference that may appear anywhere inside a value; group 1 is NAME.
_BRACED_ENV_REF = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _validate_agent_root(agent_root: Path) -> None:
    """Check that ``agent_root`` contains the files that mark an agent root.

    The directory must contain ``config.yaml``, ``agent.json`` and
    ``pyproject.toml``.

    Raises:
        typer.BadParameter: If any of those files is missing; the message
            lists the missing names.
    """
    absent: list[str] = []
    for file_name in ("config.yaml", "agent.json", "pyproject.toml"):
        marker = agent_root / file_name
        if not marker.exists():
            absent.append(str(marker.relative_to(agent_root)))

    if not absent:
        return

    missing_text = ", ".join(absent)
    raise typer.BadParameter(
        f"Current directory does not look like an agent root. Missing: {missing_text}. "
        "Please add this files or run this command from the root of an existing agent.",
    )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@contextlib.contextmanager
def _temporary_env(overrides: Mapping[str, str]) -> Iterator[None]:
    """Apply ``overrides`` to ``os.environ`` for the duration of the ``with`` body.

    On exit (normal or via exception) every variable that was touched is put
    back to its previous value, or removed if it did not exist before.
    """
    saved: dict[str, str | None] = {}
    try:
        for name in overrides:
            # Remember the prior state before overwriting so it can be restored.
            saved[name] = os.environ.get(name)
            os.environ[name] = overrides[name]
        yield
    finally:
        for name, previous in saved.items():
            if previous is not None:
                os.environ[name] = previous
            else:
                os.environ.pop(name, None)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _run_eval_orchestrator(
    *,
    agent_url: str,
    model_provider: str,
    model: str,
    dataset: Path,
    output_dir: Path,
    task_id: str,
    k: int,
    run_name: str,
    qualitative: bool,
    env: Mapping[str, str],
) -> None:
    """Run ``run_evaluation`` to completion with ``env`` applied to ``os.environ``.

    The environment overrides are active only while the evaluation runs and
    are reverted afterwards by ``_temporary_env``. Scoring is always enabled;
    qualitative evaluation follows the ``qualitative`` flag.
    """
    with _temporary_env(env):
        evaluation = run_evaluation(
            agent_url=agent_url,
            model=model,
            model_provider=model_provider,
            input_path=dataset,
            run_name=run_name,
            task_id=task_id,
            enable_scoring=True,
            enable_qualitative_eval=qualitative,
            k=k,
            out_dir=str(output_dir),
        )
        asyncio.run(evaluation)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _find_agent_python(agent_root: Path) -> Path | None:
    """Return the interpreter inside ``<agent_root>/.venv``, or ``None`` if absent.

    ``.venv/bin/python`` is checked first, then ``.venv/Scripts/python.exe``.
    """
    venv_dir = agent_root / ".venv"
    layouts = (("bin", "python"), ("Scripts", "python.exe"))
    interpreters = (venv_dir.joinpath(*parts) for parts in layouts)
    return next((path for path in interpreters if path.exists()), None)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _resolve_env_value(
    raw_value: str,
    env: Mapping[str, str],
    *,
    section_name: str,
    env_key: str,
) -> str:
    """Expand environment references in ``raw_value`` using ``env``.

    Two forms are recognised: a value that is exactly ``$NAME`` (surrounding
    whitespace ignored), which is replaced by the value of ``NAME``; and any
    number of ``${NAME}`` references embedded in the value, each replaced in
    place. Text without references is returned unchanged.

    Args:
        raw_value: The configured value, possibly containing references.
        env: Mapping the references are looked up in.
        section_name: Config section name, used only in error messages.
        env_key: Variable being defined, used only in error messages.

    Raises:
        typer.BadParameter: If any referenced variable is not in ``env``.
    """
    whole_reference = _SIMPLE_ENV_REF.fullmatch(raw_value.strip())
    if whole_reference is not None:
        ref_name = whole_reference.group(1)
        looked_up = env.get(ref_name)
        if looked_up is not None:
            return looked_up
        raise typer.BadParameter(
            f"{section_name}.env.{env_key} references missing environment variable: {ref_name}"
        )

    # Collect every unresolved ${NAME} first so they are all reported at once.
    unresolved = {
        found.group(1)
        for found in _BRACED_ENV_REF.finditer(raw_value)
        if env.get(found.group(1)) is None
    }
    if unresolved:
        refs = ", ".join(sorted(unresolved))
        raise typer.BadParameter(
            f"{section_name}.env.{env_key} references missing environment variable(s): {refs}"
        )

    return _BRACED_ENV_REF.sub(lambda found: env[found.group(1)], raw_value)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _apply_env_overrides(
    env: dict[str, str],
    overrides: dict[str, str],
    *,
    section_name: str,
) -> None:
    """Resolve each override and store it in ``env`` (mutated in place).

    Keys are stripped of surrounding whitespace; keys that are empty after
    stripping are skipped. Overrides are applied in iteration order, so a
    later value may reference a variable set by an earlier one.

    Raises:
        typer.BadParameter: If a key is not a valid variable name, or if a
            value references a variable missing from ``env``.
    """
    for raw_key, raw_value in overrides.items():
        name = raw_key.strip()
        if name == "":
            continue
        if _ENV_VAR_PATTERN.fullmatch(name) is None:
            raise typer.BadParameter(
                f"{section_name}.env contains invalid variable name: {name}"
            )
        resolved = _resolve_env_value(
            raw_value,
            env,
            section_name=section_name,
            env_key=name,
        )
        env[name] = resolved
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _parse_agent_url(agent_url: str) -> tuple[str, int, str]:
    """Split an agent URL into ``(hostname, port, base_url)``.

    ``hostname`` is the bare host as reported by ``urlparse`` (no brackets for
    IPv6 literals), suitable for opening a socket. ``port`` is the explicit
    port, or 443/80 for https/http when none is given. ``base_url`` is
    ``scheme://host`` without port or path; IPv6 hosts are re-bracketed so the
    result is a well-formed URL.

    Raises:
        typer.BadParameter: If the URL cannot be parsed, lacks a scheme or
            host, has an invalid port, or has no port and a scheme other than
            http/https.
    """
    cleaned = agent_url.strip()
    full_url_hint = "evaluation.agent_url must be a full URL, for example: http://127.0.0.1:9900"

    try:
        parsed = urlparse(cleaned)
        hostname = parsed.hostname
    except ValueError as exc:
        # urlparse rejects e.g. unbalanced IPv6 brackets with ValueError.
        raise typer.BadParameter(full_url_hint) from exc

    if not parsed.scheme or not hostname:
        raise typer.BadParameter(full_url_hint)

    try:
        port = parsed.port
    except ValueError as exc:
        # Accessing .port raises for a non-numeric or out-of-range port.
        raise typer.BadParameter(
            f"evaluation.agent_url has an invalid port: {cleaned}"
        ) from exc

    if port is None:
        if parsed.scheme == "https":
            port = 443
        elif parsed.scheme == "http":
            port = 80
        else:
            raise typer.BadParameter("evaluation.agent_url must use http or https")

    # An IPv6 literal must be bracketed inside a URL ("http://[::1]").
    url_host = f"[{hostname}]" if ":" in hostname else hostname
    base_url = f"{parsed.scheme}://{url_host}"
    return hostname, port, base_url
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _wait_for_agent_port(
    agent_url: str,
    timeout_s: int,
    process: subprocess.Popen | None,
) -> None:
    """Block until a TCP connection to the agent's host/port succeeds.

    The port is probed repeatedly (1 second connect timeout, 0.2 second pause
    between attempts) until it accepts a connection or ``timeout_s`` seconds
    have elapsed. At least one attempt is always made, even when
    ``timeout_s`` is zero or negative.

    Args:
        agent_url: URL whose host and port are probed.
        timeout_s: Maximum time to keep retrying, in seconds.
        process: The agent process started by the CLI, or ``None`` when the
            agent is managed externally. When given, its early exit aborts
            the wait.

    Raises:
        typer.BadParameter: If the process exits before the port opens, or if
            the port does not open within the timeout.
    """
    host, port, _ = _parse_agent_url(agent_url)
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout_s

    while True:
        if process is not None and process.poll() is not None:
            raise typer.BadParameter(
                f"Agent process exited before becoming ready (exit code: {process.returncode})."
            )
        try:
            with socket.create_connection((host, port), timeout=1.0):
                return
        except OSError:
            pass  # not listening yet; retry until the deadline

        if time.monotonic() >= deadline:
            break
        time.sleep(0.2)

    raise typer.BadParameter(
        f"Agent did not become ready at {agent_url} within {timeout_s} seconds."
    )
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _start_agent_process(agent_root: Path, env: dict[str, str]) -> subprocess.Popen:
    """Launch the agent with ``uv run .`` from ``agent_root`` and return the process.

    Args:
        agent_root: Working directory for the child process.
        env: Complete environment passed to the child process.

    Raises:
        typer.BadParameter: If the ``uv`` executable cannot be found.
    """
    command = ["uv", "run", "."]
    try:
        agent = subprocess.Popen(command, cwd=agent_root, env=env)
    except FileNotFoundError as exc:
        raise typer.BadParameter(
            "Cannot execute 'uv run .'. Ensure uv is installed and available in PATH."
        ) from exc
    return agent
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _stop_agent_process(process: subprocess.Popen, timeout_s: int) -> None:
    """Stop ``process`` if it is still running.

    The process is first asked to terminate; if it has not exited within
    ``timeout_s`` seconds it is killed and waited for again with the same
    timeout. A process that has already exited is left alone.
    """
    if process.poll() is None:
        process.terminate()
        try:
            process.wait(timeout=timeout_s)
        except subprocess.TimeoutExpired:
            # Graceful shutdown took too long; force it.
            process.kill()
            process.wait(timeout=timeout_s)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def eval_init(
    force: bool = typer.Option(
        False,
        "--force",
        "-f",
        help="Overwrite eval/eval.yaml and eval/input/tasks.json if they already exist.",
    ),
) -> None:
    # Scaffold the evaluation layout under ./eval of the current agent root:
    # eval/input/, eval/output/, eval/eval.yaml and eval/input/tasks.json.
    # Existing files are kept unless --force is given.
    agent_root = Path.cwd()
    _validate_agent_root(agent_root)

    eval_dir = agent_root / "eval"
    input_dir = eval_dir / "input"
    for directory in (input_dir, eval_dir / "output"):
        directory.mkdir(parents=True, exist_ok=True)

    # Each target is paired with the factory for its default content; the
    # factory is only invoked when the file is actually (re)written.
    scaffold = (
        (eval_dir / "eval.yaml", default_eval_yaml),
        (input_dir / "tasks.json", default_tasks_json),
    )
    for target, render_default in scaffold:
        if force or not target.exists():
            target.write_text(render_default(), encoding="utf-8")
            typer.secho(f"Created {target}", fg=typer.colors.GREEN)
        else:
            typer.secho(
                f"{target} already exists. Use --force to overwrite.",
                fg=typer.colors.YELLOW,
            )

    typer.secho("Evaluation scaffold ready in eval/", fg=typer.colors.GREEN)
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _print_eval_show(cfg) -> None:
    """Print a coloured summary of an evaluation configuration.

    Shows the dataset, ``k``, whether qualitative evaluation is enabled, the
    numbered list of models, and the judge model (or "not configured" when
    ``cfg.judge`` is ``None``).
    """
    judge_model = "not configured"
    if cfg.judge is not None:
        judge_model = f"{cfg.judge.provider}:{cfg.judge.model}"

    rule = "============================"

    def _labelled(label: str, value: object, colour: str) -> None:
        # Bold coloured label followed by the plain value on the same line.
        typer.secho(label, fg=colour, bold=True, nl=False)
        typer.echo(f" : {value}")

    typer.secho(rule, fg=typer.colors.BLUE)
    typer.secho(" EVALUATION CONFIGURATION", fg=typer.colors.BLUE, bold=True)
    typer.secho(rule, fg=typer.colors.BLUE)

    _labelled("Dataset", cfg.dataset, typer.colors.BRIGHT_BLUE)
    _labelled("k", cfg.k, typer.colors.BRIGHT_BLUE)
    _labelled("Qualitative", "yes" if cfg.qualitative else "no", typer.colors.BRIGHT_BLUE)

    typer.secho("", nl=True)
    typer.secho("Models:", fg=typer.colors.CYAN, bold=True)
    position = 0
    for model in cfg.models:
        position += 1
        typer.echo(f" [{position}] {model.provider}:{model.model}")

    typer.secho("", nl=True)
    _labelled("Judge model", judge_model, typer.colors.MAGENTA)
    typer.secho(rule, fg=typer.colors.BLUE)
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def eval_show() -> None:
    # Load ./eval/eval.yaml from the current agent root and print its summary.
    # Configuration errors (ValueError from the loader) are re-raised as
    # typer.BadParameter so the CLI reports them as usage errors.
    agent_root = Path.cwd()
    _validate_agent_root(agent_root)

    config_file = agent_root.joinpath("eval", "eval.yaml")
    if not config_file.exists():
        raise typer.BadParameter("Missing ./eval/eval.yaml. Run 'bat eval init' first.")

    try:
        cfg = load_eval_config(agent_root, config_file)
    except ValueError as exc:
        raise typer.BadParameter(str(exc)) from exc
    else:
        _print_eval_show(cfg)
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
# Judge prompt names forwarded to the runner as JUDGE_PROMPT_<NAME> variables.
_JUDGE_PROMPT_KEYS = ("relevance", "task_completion", "hallucination", "tool_call")

_JUDGE_REQUIRED_MESSAGE = (
    "When evaluation.qualitative is true, judge.provider and judge.model are required"
)


def _build_server_env(model_cfg, idx: int, port: int, base_url: str) -> dict[str, str]:
    """Build the environment for the agent process serving one model.

    Starts from a copy of ``os.environ``, sets MODEL_PROVIDER, MODEL, PORT and
    URL, sets or clears BASE_URL, then applies the model's ``env`` overrides.
    """
    server_env = os.environ.copy()
    server_env["MODEL_PROVIDER"] = model_cfg.provider
    server_env["MODEL"] = model_cfg.model
    server_env["PORT"] = str(port)
    server_env["URL"] = base_url

    if model_cfg.base_url:
        server_env["BASE_URL"] = model_cfg.base_url
    else:
        # Do not let a BASE_URL inherited from the shell leak into this model.
        server_env.pop("BASE_URL", None)

    _apply_env_overrides(
        server_env,
        model_cfg.env,
        section_name=f"models[{idx}]",
    )
    return server_env


def _build_runner_env(cfg, agent_root: Path, server_env: Mapping[str, str]) -> dict[str, str]:
    """Build the environment for the evaluation runner from ``server_env``.

    With qualitative evaluation enabled, the JUDGE_* variables, judge prompts,
    judge API key and judge ``env`` overrides are added. Otherwise every
    JUDGE_* variable is removed so inherited values cannot take effect.
    """
    runner_env = dict(server_env)

    if not cfg.qualitative:
        runner_env.pop("JUDGE_PROVIDER", None)
        runner_env.pop("JUDGE_MODEL", None)
        runner_env.pop("JUDGE_BASE_URL", None)
        for prompt_key in _JUDGE_PROMPT_KEYS:
            runner_env.pop(f"JUDGE_PROMPT_{prompt_key.upper()}", None)
        return runner_env

    judge = cfg.judge
    if judge is None:
        raise typer.BadParameter(_JUDGE_REQUIRED_MESSAGE)

    runner_env["JUDGE_PROVIDER"] = judge.provider
    runner_env["JUDGE_MODEL"] = judge.model
    if judge.base_url:
        runner_env["JUDGE_BASE_URL"] = judge.base_url
    else:
        runner_env.pop("JUDGE_BASE_URL", None)

    for prompt_key in _JUDGE_PROMPT_KEYS:
        env_name = f"JUDGE_PROMPT_{prompt_key.upper()}"
        text = judge.prompts.get(prompt_key)
        if text:
            runner_env[env_name] = text
        else:
            runner_env.pop(env_name, None)

    _inject_judge_api_key(judge, agent_root, runner_env)

    _apply_env_overrides(
        runner_env,
        judge.env,
        section_name="judge",
    )
    return runner_env


def eval_run(
    no_start_agent: bool = typer.Option(
        False,
        "--no-start-agent",
        help=(
            "Do not start (or stop) the agent process. Assume it is already running at the "
            "agent_url declared in eval.yaml. Useful when the CLI cannot exec 'uv run .' itself "
            "(e.g. when running from a strictly-confined snap)."
        ),
    ),
) -> None:
    # Run the evaluation described by ./eval/eval.yaml once per configured
    # model. Unless --no-start-agent is given, the agent is started before
    # each model's run and stopped afterwards, even when the run fails.
    agent_root = Path.cwd()
    _validate_agent_root(agent_root)

    eval_yaml_path = agent_root / "eval" / "eval.yaml"
    if not eval_yaml_path.exists():
        raise typer.BadParameter("Missing ./eval/eval.yaml. Run 'bat eval init' first.")

    try:
        cfg = load_eval_config(agent_root, eval_yaml_path)
    except ValueError as exc:
        raise typer.BadParameter(str(exc)) from exc

    if not cfg.dataset.exists():
        raise typer.BadParameter(f"Dataset not found: {cfg.dataset}")

    cfg.output_dir.mkdir(parents=True, exist_ok=True)

    if not no_start_agent and _find_agent_python(agent_root) is None:
        raise typer.BadParameter(
            "No agent python found at .venv/bin/python. Create the agent virtual environment first."
        )

    # Fail fast: this check does not depend on the model, so report it before
    # any agent process is spawned and waited for.
    if cfg.qualitative and cfg.judge is None:
        raise typer.BadParameter(_JUDGE_REQUIRED_MESSAGE)

    task_id = time.strftime("%Y%m%d_%H%M%S")
    typer.secho(
        f"Running evaluation with {len(cfg.models)} model(s). task_id={task_id}",
        fg=typer.colors.CYAN,
    )

    _, parsed_port, base_url = _parse_agent_url(cfg.agent_url)

    for idx, model_cfg in enumerate(cfg.models):
        typer.secho(f"- {model_cfg.provider}:{model_cfg.model}", fg=typer.colors.CYAN)

        server_env = _build_server_env(model_cfg, idx, parsed_port, base_url)

        if no_start_agent:
            process = None
            typer.secho(
                f" (--no-start-agent) connecting to externally-managed agent at {cfg.agent_url}",
                fg=typer.colors.YELLOW,
            )
        else:
            process = _start_agent_process(agent_root, server_env)

        try:
            _wait_for_agent_port(
                cfg.agent_url,
                timeout_s=cfg.agent_startup_timeout_s,
                process=process,
            )

            runner_env = _build_runner_env(cfg, agent_root, server_env)

            _run_eval_orchestrator(
                agent_url=cfg.agent_url,
                model_provider=model_cfg.provider,
                model=model_cfg.model,
                dataset=cfg.dataset,
                output_dir=cfg.output_dir,
                task_id=task_id,
                k=cfg.k,
                run_name=cfg.run_name,
                qualitative=cfg.qualitative,
                env=runner_env,
            )
        finally:
            # Only stop agents this command started itself.
            if process is not None:
                _stop_agent_process(process, timeout_s=cfg.agent_shutdown_timeout_s)

    typer.secho(
        f"Evaluation completed. Output: {cfg.output_dir / task_id}",
        fg=typer.colors.GREEN,
    )
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def eval_plot(
    folder: Path = typer.Option(
        ...,
        "--folder",
        "-f",
        help="Path to an evaluation output folder. Each sub-folder containing a metrics.json is treated as one run.",
    ),
    filter: str | None = typer.Option(
        None,
        "--filter",
        "-F",
        help="Substring match on task_id. Restricts the per-task charts to tasks whose id contains this substring. Summary charts are not affected.",
    ),
) -> None:
    # Collect metrics.json from every run sub-folder of `folder`, hand them to
    # the plotter, and list the chart files it saved.
    folder = folder.resolve()
    if not folder.is_dir():
        raise typer.BadParameter(f"Folder not found: {folder}")

    # Run name (sub-folder name) -> parsed metrics, in sorted folder order.
    metrics: dict[str, dict] = {}
    for run_dir in sorted(folder.iterdir()):
        if not run_dir.is_dir():
            continue
        metrics_file = run_dir / "metrics.json"
        if not metrics_file.exists():
            continue
        with open(metrics_file, encoding="utf-8") as handle:
            metrics[run_dir.name] = json.load(handle)

    if not metrics:
        raise typer.BadParameter(
            f"No valid evaluation results found in {folder}. "
            "A sub-folder is a valid run only if it contains a metrics.json file."
        )

    run_names = ", ".join(metrics)
    typer.secho(f"Found {len(metrics)} run(s): {run_names}", fg=typer.colors.CYAN)

    if filter:
        typer.secho(
            f"Per-task filter active: only task ids containing '{filter}' will be plotted",
            fg=typer.colors.CYAN,
        )

    # Imported here rather than at module top, as in the original code.
    from .engine.plotter import generate_and_save_plots

    saved = generate_and_save_plots(metrics, folder, task_filter=filter)
    for chart_path in saved:
        typer.secho(f" {chart_path.relative_to(folder)}", fg=typer.colors.GREEN)

    typer.secho(
        f"\nSaved {len(saved)} chart(s) to {folder}",
        fg=typer.colors.GREEN,
        bold=True,
    )
|
eval/engine/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Embedded evaluation engine adapted from Aletheia."""
|