bat-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. add/__init__.py +3 -0
  2. add/client.py +16 -0
  3. bat_cli-0.1.0.dist-info/METADATA +231 -0
  4. bat_cli-0.1.0.dist-info/RECORD +47 -0
  5. bat_cli-0.1.0.dist-info/WHEEL +5 -0
  6. bat_cli-0.1.0.dist-info/entry_points.txt +2 -0
  7. bat_cli-0.1.0.dist-info/top_level.txt +8 -0
  8. build/__init__.py +3 -0
  9. build/build.py +79 -0
  10. cli.py +260 -0
  11. create/__init__.py +3 -0
  12. create/agent.py +312 -0
  13. create/templates/agent/.dockerignore +3 -0
  14. create/templates/agent/.env.template +4 -0
  15. create/templates/agent/.python-version +1 -0
  16. create/templates/agent/Dockerfile +37 -0
  17. create/templates/agent/Makefile +34 -0
  18. create/templates/agent/README.md +1 -0
  19. create/templates/agent/__main__.py +2 -0
  20. create/templates/agent/agent.json.template +12 -0
  21. create/templates/agent/agent.spec +45 -0
  22. create/templates/agent/config.yaml +1 -0
  23. create/templates/agent/llm_client.py.template +36 -0
  24. create/templates/agent/pyproject.toml.template +9 -0
  25. create/templates/agent/src/__init__.py +0 -0
  26. create/templates/agent/src/graph.py +50 -0
  27. create/templates/agent/src/llm_clients/__init__.py +0 -0
  28. create/templates/agent/tests/__init__.py +0 -0
  29. eval/__init__.py +1 -0
  30. eval/commands.py +562 -0
  31. eval/engine/__init__.py +1 -0
  32. eval/engine/adapter.py +251 -0
  33. eval/engine/bench_runner.py +149 -0
  34. eval/engine/contracts.py +115 -0
  35. eval/engine/eval_config.py +294 -0
  36. eval/engine/evaluator.py +85 -0
  37. eval/engine/metrics/__init__.py +1 -0
  38. eval/engine/metrics/llm_evaluators.py +383 -0
  39. eval/engine/metrics/metrics.py +135 -0
  40. eval/engine/metrics/qualitative_helpers.py +64 -0
  41. eval/engine/orchestrator.py +157 -0
  42. eval/engine/plotter.py +347 -0
  43. image_defaults.py +80 -0
  44. push/__init__.py +3 -0
  45. push/push.py +58 -0
  46. set/__init__.py +3 -0
  47. set/env.py +50 -0
@@ -0,0 +1,294 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import yaml
8
+
9
+ from .contracts import EvalConfig, JudgeSpec, ModelSpec
10
+
11
# Shape of a valid environment-variable name: a letter or underscore first, then letters, digits or underscores.
+ _ENV_VAR_NAME = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
12
+
13
+
14
+ def _to_bool(value: Any, *, default: bool) -> bool:
15
+ if value is None:
16
+ return default
17
+ if isinstance(value, bool):
18
+ return value
19
+ if isinstance(value, str):
20
+ return value.strip().lower() in {"1", "true", "yes", "on"}
21
+ return bool(value)
22
+
23
+
24
+ def _resolve_path(base_dir: Path, raw_path: str | None, fallback: str) -> Path:
25
+ path_value = raw_path or fallback
26
+ path = Path(path_value)
27
+ if path.is_absolute():
28
+ return path
29
+ return (base_dir / path).resolve()
30
+
31
+
32
+ def _to_optional_str(value: Any) -> str | None:
33
+ if value is None:
34
+ return None
35
+ text = str(value).strip()
36
+ return text or None
37
+
38
+
39
+ def _to_positive_int(value: Any, *, field_name: str, default: int) -> int:
40
+ raw = default if value is None else value
41
+ try:
42
+ parsed = int(raw)
43
+ except Exception as exc:
44
+ raise ValueError(f"{field_name} must be an integer") from exc
45
+ if parsed < 1:
46
+ raise ValueError(f"{field_name} must be >= 1")
47
+ return parsed
48
+
49
+
50
+ def _split_provider_model(value: str, *, field_name: str) -> tuple[str, str]:
51
+ raw = value.strip()
52
+ if not raw or ":" not in raw:
53
+ raise ValueError(
54
+ f"{field_name} must use '<provider>:<model>' format when provider is omitted"
55
+ )
56
+
57
+ provider, model = raw.split(":", 1)
58
+ provider = provider.strip()
59
+ model = model.strip()
60
+ if not provider or not model:
61
+ raise ValueError(
62
+ f"{field_name} must use '<provider>:<model>' format when provider is omitted"
63
+ )
64
+ return provider, model
65
+
66
+
67
+ _JUDGE_PROMPT_KEYS = ("relevance", "task_completion", "hallucination", "tool_call")
68
+ _JUDGE_PROMPT_MAX_LEN = 1000
69
+
70
+
71
+ def _parse_judge_prompts(raw: Any) -> dict[str, str]:
72
+ if raw is None:
73
+ return {}
74
+ if not isinstance(raw, dict):
75
+ raise ValueError("judge.prompts must be a mapping")
76
+
77
+ unknown = set(raw) - set(_JUDGE_PROMPT_KEYS)
78
+ if unknown:
79
+ raise ValueError(
80
+ f"judge.prompts has unknown key(s) {sorted(unknown)}; "
81
+ f"allowed: {list(_JUDGE_PROMPT_KEYS)}"
82
+ )
83
+
84
+ out: dict[str, str] = {}
85
+ for key in _JUDGE_PROMPT_KEYS:
86
+ value = raw.get(key)
87
+ if value is None:
88
+ continue
89
+ if not isinstance(value, str):
90
+ raise ValueError(f"judge.prompts.{key} must be a string")
91
+ text = value.strip()
92
+ if not text:
93
+ continue
94
+ if len(text) > _JUDGE_PROMPT_MAX_LEN:
95
+ raise ValueError(
96
+ f"judge.prompts.{key} exceeds the {_JUDGE_PROMPT_MAX_LEN}-character limit "
97
+ f"(got {len(text)})"
98
+ )
99
+ out[key] = text
100
+ return out
101
+
102
+
103
+ def _parse_env_map(raw: Any, *, section_name: str) -> dict[str, str]:
104
+ if raw is None:
105
+ return {}
106
+ if not isinstance(raw, dict):
107
+ raise ValueError(f"{section_name}.env must be a mapping of environment variables")
108
+
109
+ parsed: dict[str, str] = {}
110
+ for key, value in raw.items():
111
+ env_key = str(key).strip()
112
+ if not env_key or value is None:
113
+ continue
114
+ parsed[env_key] = str(value)
115
+ return parsed
116
+
117
+
118
def _parse_model_spec(item: Any, *, section_name: str) -> ModelSpec:
    """Build a ``ModelSpec`` from one entry of the ``models`` list.

    An entry is either a ``'<provider>:<model>'`` string or a mapping with
    ``provider``, ``model``, ``base_url`` and ``env`` keys. In the mapping
    form the provider may be omitted when ``model`` carries the
    ``'<provider>:<model>'`` shorthand.

    Raises:
        ValueError: if the entry has the wrong type or no provider/model
            pair can be derived from it.
    """
    if isinstance(item, str):
        short_provider, short_model = _split_provider_model(item, field_name=section_name)
        return ModelSpec(provider=short_provider, model=short_model)

    if not isinstance(item, dict):
        raise ValueError(f"{section_name} must be either a mapping or '<provider>:<model>' string")

    provider, model, base_url = (
        _to_optional_str(item.get(field)) for field in ("provider", "model", "base_url")
    )
    env = _parse_env_map(item.get("env"), section_name=section_name)

    # Shorthand inside the mapping: only ``model`` given, written as provider:model.
    if not provider and model and ":" in model:
        provider, model = _split_provider_model(model, field_name=f"{section_name}.model")

    if provider and model:
        return ModelSpec(provider=provider, model=model, base_url=base_url, env=env)

    raise ValueError(
        f"{section_name} must define at least one valid provider and model (or model as '<provider>:<model>')"
    )
145
+
146
+
147
def _parse_judge_spec(item: Any) -> JudgeSpec | None:
    """Build the optional ``JudgeSpec`` from the top-level ``judge`` value.

    Returns ``None`` when the value is absent or is a mapping in which every
    recognised field is empty. A string is read as ``'<provider>:<model>'``;
    a mapping may set ``provider``, ``model``, ``base_url``, ``api_key_env``,
    ``env`` and ``prompts``.

    Raises:
        ValueError: on a wrong type, an ``api_key_env`` that does not match
            ``_ENV_VAR_NAME``, or a missing provider/model.
    """
    if item is None:
        return None

    if isinstance(item, str):
        short_provider, short_model = _split_provider_model(item, field_name="judge")
        return JudgeSpec(provider=short_provider, model=short_model)

    if not isinstance(item, dict):
        raise ValueError("judge must be either a mapping or '<provider>:<model>' string")

    provider, model, base_url, api_key_env = (
        _to_optional_str(item.get(field))
        for field in ("provider", "model", "base_url", "api_key_env")
    )
    env = _parse_env_map(item.get("env"), section_name="judge")
    prompts = _parse_judge_prompts(item.get("prompts"))

    # A judge block with nothing filled in counts as "no judge configured".
    if not (provider or model or base_url or api_key_env or env or prompts):
        return None

    # api_key_env is either None or a non-empty string at this point.
    if api_key_env is not None and _ENV_VAR_NAME.fullmatch(api_key_env) is None:
        raise ValueError(f"judge.api_key_env is not a valid environment variable name: {api_key_env}")

    if not provider and model and ":" in model:
        provider, model = _split_provider_model(model, field_name="judge.model")

    if provider and model:
        return JudgeSpec(
            provider=provider,
            model=model,
            base_url=base_url,
            api_key_env=api_key_env,
            env=env,
            prompts=prompts,
        )

    raise ValueError(
        "judge must define provider and model (or model as '<provider>:<model>')"
    )
187
+
188
+
189
def load_eval_config(agent_root: Path, config_path: Path) -> EvalConfig:
    """Read the eval YAML file at ``config_path`` and return a validated ``EvalConfig``.

    Args:
        agent_root: Directory that relative ``dataset`` / ``output_dir``
            values are resolved against.
        config_path: YAML file to load (read as UTF-8).

    Returns:
        An ``EvalConfig`` built from the ``evaluation``, ``models`` and
        ``judge`` sections, with defaults filled in for omitted settings.

    Raises:
        ValueError: if a section has the wrong type, no model is configured,
            a numeric setting is not an integer >= 1, or ``qualitative`` is
            enabled without a judge.
    """
    raw = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
    if not isinstance(raw, dict):
        raise ValueError("eval.yaml must define a mapping at top level")

    evaluation_section = raw.get("evaluation") or {}
    if not isinstance(evaluation_section, dict):
        raise ValueError("evaluation section must be a mapping")

    models_raw = raw.get("models") or []
    if not isinstance(models_raw, list):
        raise ValueError("models section must be a list")

    models: list[ModelSpec] = [
        _parse_model_spec(item, section_name=f"models[{idx}]")
        for idx, item in enumerate(models_raw)
    ]
    if not models:
        raise ValueError("No valid models configured in eval/eval.yaml")

    dataset = _resolve_path(agent_root, evaluation_section.get("dataset"), "eval/input/tasks.json")
    output_dir = _resolve_path(agent_root, evaluation_section.get("output_dir"), "eval/output")
    agent_url = _to_optional_str(evaluation_section.get("agent_url")) or "http://127.0.0.1:9900"

    agent_startup_timeout_s = _to_positive_int(
        evaluation_section.get("agent_startup_timeout_s"),
        field_name="evaluation.agent_startup_timeout_s",
        default=45,
    )
    agent_shutdown_timeout_s = _to_positive_int(
        evaluation_section.get("agent_shutdown_timeout_s"),
        field_name="evaluation.agent_shutdown_timeout_s",
        default=10,
    )

    # Validate k the same way as the timeouts, so that an explicit ``k: null``
    # falls back to 1 and a non-numeric value is reported with the field name
    # instead of a bare int() error.
    k = _to_positive_int(
        evaluation_section.get("k"),
        field_name="evaluation.k",
        default=1,
    )

    qualitative = _to_bool(evaluation_section.get("qualitative"), default=False)
    run_name = _to_optional_str(evaluation_section.get("run_name")) or "benchmark"

    judge = _parse_judge_spec(raw.get("judge"))
    if qualitative and judge is None:
        raise ValueError(
            "When evaluation.qualitative is true, set judge.provider and judge.model in eval/eval.yaml"
        )

    return EvalConfig(
        dataset=dataset,
        output_dir=output_dir,
        agent_url=agent_url,
        agent_startup_timeout_s=agent_startup_timeout_s,
        agent_shutdown_timeout_s=agent_shutdown_timeout_s,
        k=k,
        qualitative=qualitative,
        run_name=run_name,
        models=models,
        judge=judge,
    )
249
+
250
+
251
def default_eval_yaml() -> str:
    """Return the text of a starter ``eval.yaml`` template.

    The template has an ``evaluation`` section, a ``judge`` section and two
    example ``models`` entries. Nested keys are indented two spaces, and the
    continuation keys of each list entry four, so every ``model`` /
    ``base_url`` line stays inside its ``- provider`` item.
    """
    return (
        "evaluation:\n"
        "  dataset: eval/input/tasks.json\n"
        "  output_dir: eval/output\n"
        "  agent_url: http://127.0.0.1:9900\n"
        "  agent_startup_timeout_s: 45\n"
        "  agent_shutdown_timeout_s: 10\n"
        "  k: 1\n"
        "  qualitative: false\n"
        "\n"
        "judge:\n"
        "  provider: ollama\n"
        "  model: local-judge-model\n"
        "  base_url: http://localhost:11434\n"
        "  # api_key_env: BAT_JUDGE_API_KEY # name of the env var holding the judge's API key\n"
        "\n"
        "models:\n"
        "  - provider: openai\n"
        "    model: your-model-name\n"
        "  - provider: ollama\n"
        "    model: your-local-model\n"
        "    base_url: http://localhost:11434\n"
    )
275
+
276
+
277
def default_tasks_json() -> str:
    """Return the text of a starter tasks file: a JSON array with one smoke-test task."""
    lines = [
        '[',
        ' {',
        ' "id": "smoke_test",',
        ' "turns": [',
        ' "Describe what you can do in one short paragraph."',
        ' ],',
        ' "expected": {',
        ' "status": "completed",',
        ' "expected_outcome": "The agent describes its capabilities clearly in one short paragraph."',
        ' },',
        ' "meta": {',
        ' "category": "smoke"',
        ' }',
        ' }',
        ']',
    ]
    # Every line, including the last, is newline-terminated.
    return "\n".join(lines) + "\n"
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import Any
5
+
6
+ from .contracts import EpisodeVerdict, ExpectedToolCall, TaskExpected
7
+
8
+
9
+ def _is_subset(expected: Any, actual: Any) -> bool:
10
+ if isinstance(expected, dict):
11
+ if not isinstance(actual, dict):
12
+ return False
13
+ for key, value in expected.items():
14
+ if key not in actual or not _is_subset(value, actual[key]):
15
+ return False
16
+ return True
17
+
18
+ if isinstance(expected, list):
19
+ if not isinstance(actual, list) or len(expected) > len(actual):
20
+ return False
21
+ used = [False] * len(actual)
22
+ for expected_item in expected:
23
+ matched = False
24
+ for idx, actual_item in enumerate(actual):
25
+ if used[idx]:
26
+ continue
27
+ if _is_subset(expected_item, actual_item):
28
+ used[idx] = True
29
+ matched = True
30
+ break
31
+ if not matched:
32
+ return False
33
+ return True
34
+
35
+ return expected == actual
36
+
37
+
38
def _count_matches(expected: ExpectedToolCall, observed: list[dict[str, Any]]) -> int:
    """Count the observed tool calls that satisfy ``expected``.

    A call counts when its ``"name"`` equals ``expected.name`` and
    ``expected.args_subset`` is contained in its ``"args"``. Non-dict args
    are treated as an empty dict.
    """

    def _satisfies(call: dict[str, Any]) -> bool:
        if call.get("name") != expected.name:
            return False
        raw_args = call.get("args")
        call_args = raw_args if isinstance(raw_args, dict) else {}
        return _is_subset(expected.args_subset, call_args)

    return sum(1 for call in observed if _satisfies(call))
47
+
48
+
49
class EpisodeEvaluator:
    """Checks one finished episode against the expectations declared for its task."""

    def evaluate(
        self,
        status: str,
        output_text: str,
        tool_calls: list[dict[str, Any]],
        expected: TaskExpected,
    ) -> EpisodeVerdict:
        """Run every applicable check and fold the results into a verdict.

        Checks, in order: final status (when ``expected.status`` is set),
        each phrase of ``expected.output_must_contain`` as a substring of
        ``output_text``, and each expected tool call matched at least
        ``times`` times. The verdict passes when no check failed (and also
        when there were no checks); its reason joins the per-check messages
        with ``"; "``.
        """
        outcomes: list[tuple[bool, str]] = []

        if expected.status is not None:
            if status == expected.status:
                outcomes.append((True, f"status: '{status}'"))
            else:
                outcomes.append(
                    (False, f"status: got '{status}', expected '{expected.status}'")
                )

        required = expected.output_must_contain or []
        # Phrases are only numbered in the message when there are several.
        numbered = len(required) > 1
        for position, phrase in enumerate(required):
            tag = f"output[{position}]" if numbered else "output"
            found = phrase in output_text
            verb = "contains" if found else "missing"
            outcomes.append((found, f"{tag}: {verb} '{phrase}'"))

        for wanted in expected.tool_calls:
            seen = _count_matches(wanted, tool_calls)
            enough = seen >= wanted.times
            message = f"tool_call:{wanted.name}: called {seen}×"
            if not enough:
                message += f", expected ≥{wanted.times}×"
            outcomes.append((enough, message))

        # all() over an empty sequence is True, so "no checks" passes.
        passed = all(flag for flag, _ in outcomes)
        summary = "; ".join(text for _, text in outcomes)
        return EpisodeVerdict(passed=passed, reason=summary)
@@ -0,0 +1 @@
1
+ """Metrics package for embedded eval engine."""