bat-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- add/__init__.py +3 -0
- add/client.py +16 -0
- bat_cli-0.1.0.dist-info/METADATA +231 -0
- bat_cli-0.1.0.dist-info/RECORD +47 -0
- bat_cli-0.1.0.dist-info/WHEEL +5 -0
- bat_cli-0.1.0.dist-info/entry_points.txt +2 -0
- bat_cli-0.1.0.dist-info/top_level.txt +8 -0
- build/__init__.py +3 -0
- build/build.py +79 -0
- cli.py +260 -0
- create/__init__.py +3 -0
- create/agent.py +312 -0
- create/templates/agent/.dockerignore +3 -0
- create/templates/agent/.env.template +4 -0
- create/templates/agent/.python-version +1 -0
- create/templates/agent/Dockerfile +37 -0
- create/templates/agent/Makefile +34 -0
- create/templates/agent/README.md +1 -0
- create/templates/agent/__main__.py +2 -0
- create/templates/agent/agent.json.template +12 -0
- create/templates/agent/agent.spec +45 -0
- create/templates/agent/config.yaml +1 -0
- create/templates/agent/llm_client.py.template +36 -0
- create/templates/agent/pyproject.toml.template +9 -0
- create/templates/agent/src/__init__.py +0 -0
- create/templates/agent/src/graph.py +50 -0
- create/templates/agent/src/llm_clients/__init__.py +0 -0
- create/templates/agent/tests/__init__.py +0 -0
- eval/__init__.py +1 -0
- eval/commands.py +562 -0
- eval/engine/__init__.py +1 -0
- eval/engine/adapter.py +251 -0
- eval/engine/bench_runner.py +149 -0
- eval/engine/contracts.py +115 -0
- eval/engine/eval_config.py +294 -0
- eval/engine/evaluator.py +85 -0
- eval/engine/metrics/__init__.py +1 -0
- eval/engine/metrics/llm_evaluators.py +383 -0
- eval/engine/metrics/metrics.py +135 -0
- eval/engine/metrics/qualitative_helpers.py +64 -0
- eval/engine/orchestrator.py +157 -0
- eval/engine/plotter.py +347 -0
- image_defaults.py +80 -0
- push/__init__.py +3 -0
- push/push.py +58 -0
- set/__init__.py +3 -0
- set/env.py +50 -0
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from .contracts import EvalConfig, JudgeSpec, ModelSpec
|
|
10
|
+
|
|
11
|
+
# Identifier shape accepted for environment-variable names: a letter or
# underscore followed by any number of letters, digits, or underscores.
_ENV_VAR_NAME = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _to_bool(value: Any, *, default: bool) -> bool:
|
|
15
|
+
if value is None:
|
|
16
|
+
return default
|
|
17
|
+
if isinstance(value, bool):
|
|
18
|
+
return value
|
|
19
|
+
if isinstance(value, str):
|
|
20
|
+
return value.strip().lower() in {"1", "true", "yes", "on"}
|
|
21
|
+
return bool(value)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _resolve_path(base_dir: Path, raw_path: str | None, fallback: str) -> Path:
|
|
25
|
+
path_value = raw_path or fallback
|
|
26
|
+
path = Path(path_value)
|
|
27
|
+
if path.is_absolute():
|
|
28
|
+
return path
|
|
29
|
+
return (base_dir / path).resolve()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _to_optional_str(value: Any) -> str | None:
|
|
33
|
+
if value is None:
|
|
34
|
+
return None
|
|
35
|
+
text = str(value).strip()
|
|
36
|
+
return text or None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _to_positive_int(value: Any, *, field_name: str, default: int) -> int:
|
|
40
|
+
raw = default if value is None else value
|
|
41
|
+
try:
|
|
42
|
+
parsed = int(raw)
|
|
43
|
+
except Exception as exc:
|
|
44
|
+
raise ValueError(f"{field_name} must be an integer") from exc
|
|
45
|
+
if parsed < 1:
|
|
46
|
+
raise ValueError(f"{field_name} must be >= 1")
|
|
47
|
+
return parsed
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _split_provider_model(value: str, *, field_name: str) -> tuple[str, str]:
|
|
51
|
+
raw = value.strip()
|
|
52
|
+
if not raw or ":" not in raw:
|
|
53
|
+
raise ValueError(
|
|
54
|
+
f"{field_name} must use '<provider>:<model>' format when provider is omitted"
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
provider, model = raw.split(":", 1)
|
|
58
|
+
provider = provider.strip()
|
|
59
|
+
model = model.strip()
|
|
60
|
+
if not provider or not model:
|
|
61
|
+
raise ValueError(
|
|
62
|
+
f"{field_name} must use '<provider>:<model>' format when provider is omitted"
|
|
63
|
+
)
|
|
64
|
+
return provider, model
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
_JUDGE_PROMPT_KEYS = ("relevance", "task_completion", "hallucination", "tool_call")
|
|
68
|
+
_JUDGE_PROMPT_MAX_LEN = 1000
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _parse_judge_prompts(raw: Any) -> dict[str, str]:
|
|
72
|
+
if raw is None:
|
|
73
|
+
return {}
|
|
74
|
+
if not isinstance(raw, dict):
|
|
75
|
+
raise ValueError("judge.prompts must be a mapping")
|
|
76
|
+
|
|
77
|
+
unknown = set(raw) - set(_JUDGE_PROMPT_KEYS)
|
|
78
|
+
if unknown:
|
|
79
|
+
raise ValueError(
|
|
80
|
+
f"judge.prompts has unknown key(s) {sorted(unknown)}; "
|
|
81
|
+
f"allowed: {list(_JUDGE_PROMPT_KEYS)}"
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
out: dict[str, str] = {}
|
|
85
|
+
for key in _JUDGE_PROMPT_KEYS:
|
|
86
|
+
value = raw.get(key)
|
|
87
|
+
if value is None:
|
|
88
|
+
continue
|
|
89
|
+
if not isinstance(value, str):
|
|
90
|
+
raise ValueError(f"judge.prompts.{key} must be a string")
|
|
91
|
+
text = value.strip()
|
|
92
|
+
if not text:
|
|
93
|
+
continue
|
|
94
|
+
if len(text) > _JUDGE_PROMPT_MAX_LEN:
|
|
95
|
+
raise ValueError(
|
|
96
|
+
f"judge.prompts.{key} exceeds the {_JUDGE_PROMPT_MAX_LEN}-character limit "
|
|
97
|
+
f"(got {len(text)})"
|
|
98
|
+
)
|
|
99
|
+
out[key] = text
|
|
100
|
+
return out
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _parse_env_map(raw: Any, *, section_name: str) -> dict[str, str]:
|
|
104
|
+
if raw is None:
|
|
105
|
+
return {}
|
|
106
|
+
if not isinstance(raw, dict):
|
|
107
|
+
raise ValueError(f"{section_name}.env must be a mapping of environment variables")
|
|
108
|
+
|
|
109
|
+
parsed: dict[str, str] = {}
|
|
110
|
+
for key, value in raw.items():
|
|
111
|
+
env_key = str(key).strip()
|
|
112
|
+
if not env_key or value is None:
|
|
113
|
+
continue
|
|
114
|
+
parsed[env_key] = str(value)
|
|
115
|
+
return parsed
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _parse_model_spec(item: Any, *, section_name: str) -> ModelSpec:
    """Build a ``ModelSpec`` from one entry of the models list.

    Args:
        item: Either a ``'<provider>:<model>'`` string or a mapping with
            ``provider``, ``model`` and optional ``base_url`` / ``env`` keys.
        section_name: Label for this entry, used in error messages.

    Returns:
        The parsed ``ModelSpec``.

    Raises:
        ValueError: If ``item`` has an unsupported type, or provider/model
            cannot be determined.
    """
    # Shorthand form: the whole entry is a single "provider:model" string.
    if isinstance(item, str):
        short_provider, short_model = _split_provider_model(item, field_name=section_name)
        return ModelSpec(provider=short_provider, model=short_model)

    if not isinstance(item, dict):
        raise ValueError(f"{section_name} must be either a mapping or '<provider>:<model>' string")

    provider = _to_optional_str(item.get("provider"))
    model = _to_optional_str(item.get("model"))
    base_url = _to_optional_str(item.get("base_url"))
    env = _parse_env_map(item.get("env"), section_name=section_name)

    # A mapping may omit ``provider`` and fold it into ``model`` instead.
    provider_in_model = bool(model) and not provider and ":" in model
    if provider_in_model:
        provider, model = _split_provider_model(model, field_name=f"{section_name}.model")

    if not (provider and model):
        raise ValueError(
            f"{section_name} must define at least one valid provider and model (or model as '<provider>:<model>')"
        )

    return ModelSpec(provider=provider, model=model, base_url=base_url, env=env)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _parse_judge_spec(item: Any) -> JudgeSpec | None:
    """Build a ``JudgeSpec`` from the ``judge`` section, if one is configured.

    Args:
        item: ``None``, a ``'<provider>:<model>'`` string, or a mapping with
            ``provider``, ``model``, ``base_url``, ``api_key_env``, ``env``
            and ``prompts`` keys.

    Returns:
        The parsed ``JudgeSpec``, or ``None`` when the section is absent or
        every recognised field in the mapping is empty.

    Raises:
        ValueError: If ``item`` has an unsupported type, ``api_key_env`` is
            not a valid environment variable name, or provider/model cannot
            be determined.
    """
    if item is None:
        return None

    # Shorthand form: the whole section is a single "provider:model" string.
    if isinstance(item, str):
        short_provider, short_model = _split_provider_model(item, field_name="judge")
        return JudgeSpec(provider=short_provider, model=short_model)

    if not isinstance(item, dict):
        raise ValueError("judge must be either a mapping or '<provider>:<model>' string")

    provider = _to_optional_str(item.get("provider"))
    model = _to_optional_str(item.get("model"))
    base_url = _to_optional_str(item.get("base_url"))
    api_key_env = _to_optional_str(item.get("api_key_env"))
    env = _parse_env_map(item.get("env"), section_name="judge")
    prompts = _parse_judge_prompts(item.get("prompts"))

    # A mapping with nothing usable in it counts as "no judge configured".
    has_content = provider or model or base_url or api_key_env or env or prompts
    if not has_content:
        return None

    if api_key_env and _ENV_VAR_NAME.fullmatch(api_key_env) is None:
        raise ValueError(f"judge.api_key_env is not a valid environment variable name: {api_key_env}")

    # ``provider`` may be folded into ``model`` as "provider:model".
    provider_in_model = bool(model) and not provider and ":" in model
    if provider_in_model:
        provider, model = _split_provider_model(model, field_name="judge.model")

    if not (provider and model):
        raise ValueError(
            "judge must define provider and model (or model as '<provider>:<model>')"
        )

    return JudgeSpec(
        provider=provider,
        model=model,
        base_url=base_url,
        api_key_env=api_key_env,
        env=env,
        prompts=prompts,
    )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def load_eval_config(agent_root: Path, config_path: Path) -> EvalConfig:
    """Read the evaluation YAML file and turn it into an ``EvalConfig``.

    Args:
        agent_root: Directory that relative ``dataset`` / ``output_dir``
            paths are resolved against.
        config_path: YAML file to load (read as UTF-8).

    Returns:
        The populated ``EvalConfig``. Missing settings fall back to:
        dataset ``eval/input/tasks.json``, output_dir ``eval/output``,
        agent_url ``http://127.0.0.1:9900``, startup timeout 45, shutdown
        timeout 10, ``k`` 1, ``qualitative`` false, run_name ``benchmark``.

    Raises:
        ValueError: If the top level, ``evaluation`` or ``models`` has the
            wrong shape, no models are configured, a numeric field is not a
            positive integer, or ``evaluation.qualitative`` is true without
            a usable ``judge`` section.
    """
    raw = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
    if not isinstance(raw, dict):
        raise ValueError("eval.yaml must define a mapping at top level")

    evaluation_section = raw.get("evaluation") or {}
    if not isinstance(evaluation_section, dict):
        raise ValueError("evaluation section must be a mapping")

    models_raw = raw.get("models") or []
    if not isinstance(models_raw, list):
        raise ValueError("models section must be a list")

    models: list[ModelSpec] = [
        _parse_model_spec(item, section_name=f"models[{idx}]")
        for idx, item in enumerate(models_raw)
    ]
    if not models:
        raise ValueError("No valid models configured in eval/eval.yaml")

    dataset = _resolve_path(agent_root, evaluation_section.get("dataset"), "eval/input/tasks.json")
    output_dir = _resolve_path(agent_root, evaluation_section.get("output_dir"), "eval/output")
    agent_url = _to_optional_str(evaluation_section.get("agent_url")) or "http://127.0.0.1:9900"

    agent_startup_timeout_s = _to_positive_int(
        evaluation_section.get("agent_startup_timeout_s"),
        field_name="evaluation.agent_startup_timeout_s",
        default=45,
    )
    agent_shutdown_timeout_s = _to_positive_int(
        evaluation_section.get("agent_shutdown_timeout_s"),
        field_name="evaluation.agent_shutdown_timeout_s",
        default=10,
    )

    # Parsed like the other numeric fields so that ``k: null`` falls back to
    # the default and bad input reports the field name, instead of leaking a
    # bare TypeError/ValueError from int().
    k = _to_positive_int(
        evaluation_section.get("k"),
        field_name="evaluation.k",
        default=1,
    )

    qualitative = _to_bool(evaluation_section.get("qualitative"), default=False)
    run_name = _to_optional_str(evaluation_section.get("run_name")) or "benchmark"

    judge = _parse_judge_spec(raw.get("judge"))
    if qualitative and judge is None:
        raise ValueError(
            "When evaluation.qualitative is true, set judge.provider and judge.model in eval/eval.yaml"
        )

    return EvalConfig(
        dataset=dataset,
        output_dir=output_dir,
        agent_url=agent_url,
        agent_startup_timeout_s=agent_startup_timeout_s,
        agent_shutdown_timeout_s=agent_shutdown_timeout_s,
        k=k,
        qualitative=qualitative,
        run_name=run_name,
        models=models,
        judge=judge,
    )
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def default_eval_yaml() -> str:
    """Return the starter evaluation YAML as text.

    The template has an ``evaluation`` section, a sample ``judge`` section
    and two sample ``models`` entries, and ends with a newline.
    """
    template_lines = (
        "evaluation:",
        "  dataset: eval/input/tasks.json",
        "  output_dir: eval/output",
        "  agent_url: http://127.0.0.1:9900",
        "  agent_startup_timeout_s: 45",
        "  agent_shutdown_timeout_s: 10",
        "  k: 1",
        "  qualitative: false",
        "",
        "judge:",
        "  provider: ollama",
        "  model: local-judge-model",
        "  base_url: http://localhost:11434",
        "  # api_key_env: BAT_JUDGE_API_KEY  # name of the env var holding the judge's API key",
        "",
        "models:",
        "  - provider: openai",
        "    model: your-model-name",
        "  - provider: ollama",
        "    model: your-local-model",
        "    base_url: http://localhost:11434",
    )
    # Every line, including the last one, is newline-terminated.
    return "\n".join(template_lines) + "\n"
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def default_tasks_json() -> str:
    """Return the starter tasks file as JSON text.

    The document is a list holding a single ``smoke_test`` task with one
    turn, an ``expected`` object and a ``meta`` object; it ends with a
    newline.
    """
    template_lines = (
        '[',
        '  {',
        '    "id": "smoke_test",',
        '    "turns": [',
        '      "Describe what you can do in one short paragraph."',
        '    ],',
        '    "expected": {',
        '      "status": "completed",',
        '      "expected_outcome": "The agent describes its capabilities clearly in one short paragraph."',
        '    },',
        '    "meta": {',
        '      "category": "smoke"',
        '    }',
        '  }',
        ']',
    )
    # Every line, including the closing bracket, is newline-terminated.
    return "\n".join(template_lines) + "\n"
|
eval/engine/evaluator.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from .contracts import EpisodeVerdict, ExpectedToolCall, TaskExpected
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _is_subset(expected: Any, actual: Any) -> bool:
|
|
10
|
+
if isinstance(expected, dict):
|
|
11
|
+
if not isinstance(actual, dict):
|
|
12
|
+
return False
|
|
13
|
+
for key, value in expected.items():
|
|
14
|
+
if key not in actual or not _is_subset(value, actual[key]):
|
|
15
|
+
return False
|
|
16
|
+
return True
|
|
17
|
+
|
|
18
|
+
if isinstance(expected, list):
|
|
19
|
+
if not isinstance(actual, list) or len(expected) > len(actual):
|
|
20
|
+
return False
|
|
21
|
+
used = [False] * len(actual)
|
|
22
|
+
for expected_item in expected:
|
|
23
|
+
matched = False
|
|
24
|
+
for idx, actual_item in enumerate(actual):
|
|
25
|
+
if used[idx]:
|
|
26
|
+
continue
|
|
27
|
+
if _is_subset(expected_item, actual_item):
|
|
28
|
+
used[idx] = True
|
|
29
|
+
matched = True
|
|
30
|
+
break
|
|
31
|
+
if not matched:
|
|
32
|
+
return False
|
|
33
|
+
return True
|
|
34
|
+
|
|
35
|
+
return expected == actual
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _count_matches(expected: ExpectedToolCall, observed: list[dict[str, Any]]) -> int:
    """Count the observed tool calls that satisfy an expected call.

    A call counts when its ``"name"`` equals ``expected.name`` and
    ``expected.args_subset`` is a subset (per ``_is_subset``) of its
    ``"args"``; an ``"args"`` value that is not a dict is treated as ``{}``.

    Args:
        expected: The expected tool call.
        observed: Recorded tool calls, each a dict.

    Returns:
        Number of matching calls.
    """

    def _satisfies(call: dict[str, Any]) -> bool:
        if call.get("name") != expected.name:
            return False
        supplied = call.get("args")
        effective_args = supplied if isinstance(supplied, dict) else {}
        return _is_subset(expected.args_subset, effective_args)

    return sum(1 for call in observed if _satisfies(call))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class EpisodeEvaluator:
    """Checks one episode's outcome against a task's expectations."""

    def evaluate(
        self,
        status: str,
        output_text: str,
        tool_calls: list[dict[str, Any]],
        expected: TaskExpected,
    ) -> EpisodeVerdict:
        """Run every configured check and combine the results into a verdict.

        Checks, in order: the final status (only when ``expected.status`` is
        set), each phrase in ``expected.output_must_contain``, and each entry
        in ``expected.tool_calls`` (called at least ``times`` times).

        Args:
            status: Status reported for the episode.
            output_text: Text searched for the required phrases.
            tool_calls: Observed tool calls.
            expected: Expectations for the task.

        Returns:
            An ``EpisodeVerdict`` that passes only if every check passed (or
            there were none), with the per-check reasons joined by ``"; "``.
        """
        results: list[tuple[bool, str]] = []

        # 1) Final status.
        if expected.status is not None:
            if status == expected.status:
                results.append((True, f"status: '{status}'"))
            else:
                results.append(
                    (False, f"status: got '{status}', expected '{expected.status}'")
                )

        # 2) Required phrases; the label carries an index only when there is
        #    more than one phrase.
        required = expected.output_must_contain or []
        indexed = len(required) > 1
        for position, phrase in enumerate(required):
            label = f"output[{position}]" if indexed else "output"
            found = phrase in output_text
            verb = "contains" if found else "missing"
            results.append((found, f"{label}: {verb} '{phrase}'"))

        # 3) Expected tool calls, each needing a minimum number of matches.
        for wanted in expected.tool_calls:
            hits = _count_matches(wanted, tool_calls)
            enough = hits >= wanted.times
            summary = f"tool_call:{wanted.name}: called {hits}×"
            if not enough:
                summary = f"{summary}, expected ≥{wanted.times}×"
            results.append((enough, summary))

        # all() over an empty sequence is True, so "no checks" passes.
        passed = all(outcome for outcome, _ in results)
        explanation = "; ".join(text for _, text in results)
        return EpisodeVerdict(passed=passed, reason=explanation)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Metrics package for embedded eval engine."""
|