bat-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. add/__init__.py +3 -0
  2. add/client.py +16 -0
  3. bat_cli-0.1.0.dist-info/METADATA +231 -0
  4. bat_cli-0.1.0.dist-info/RECORD +47 -0
  5. bat_cli-0.1.0.dist-info/WHEEL +5 -0
  6. bat_cli-0.1.0.dist-info/entry_points.txt +2 -0
  7. bat_cli-0.1.0.dist-info/top_level.txt +8 -0
  8. build/__init__.py +3 -0
  9. build/build.py +79 -0
  10. cli.py +260 -0
  11. create/__init__.py +3 -0
  12. create/agent.py +312 -0
  13. create/templates/agent/.dockerignore +3 -0
  14. create/templates/agent/.env.template +4 -0
  15. create/templates/agent/.python-version +1 -0
  16. create/templates/agent/Dockerfile +37 -0
  17. create/templates/agent/Makefile +34 -0
  18. create/templates/agent/README.md +1 -0
  19. create/templates/agent/__main__.py +2 -0
  20. create/templates/agent/agent.json.template +12 -0
  21. create/templates/agent/agent.spec +45 -0
  22. create/templates/agent/config.yaml +1 -0
  23. create/templates/agent/llm_client.py.template +36 -0
  24. create/templates/agent/pyproject.toml.template +9 -0
  25. create/templates/agent/src/__init__.py +0 -0
  26. create/templates/agent/src/graph.py +50 -0
  27. create/templates/agent/src/llm_clients/__init__.py +0 -0
  28. create/templates/agent/tests/__init__.py +0 -0
  29. eval/__init__.py +1 -0
  30. eval/commands.py +562 -0
  31. eval/engine/__init__.py +1 -0
  32. eval/engine/adapter.py +251 -0
  33. eval/engine/bench_runner.py +149 -0
  34. eval/engine/contracts.py +115 -0
  35. eval/engine/eval_config.py +294 -0
  36. eval/engine/evaluator.py +85 -0
  37. eval/engine/metrics/__init__.py +1 -0
  38. eval/engine/metrics/llm_evaluators.py +383 -0
  39. eval/engine/metrics/metrics.py +135 -0
  40. eval/engine/metrics/qualitative_helpers.py +64 -0
  41. eval/engine/orchestrator.py +157 -0
  42. eval/engine/plotter.py +347 -0
  43. image_defaults.py +80 -0
  44. push/__init__.py +3 -0
  45. push/push.py +58 -0
  46. set/__init__.py +3 -0
  47. set/env.py +50 -0
@@ -0,0 +1,157 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import asyncio
5
+ import json
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ from .adapter import BatA2AAdapter
10
+ from .bench_runner import BenchRunner, RunConfig
11
+ from .contracts import EpisodeResult, TaskSpec
12
+ from .metrics.llm_evaluators import evaluate_episode_quality
13
+ from .metrics.metrics import summarize_episode_metrics
14
+ from .metrics.qualitative_helpers import build_context_from_events, build_expected_desc, build_user_facts_summary
15
+
16
+
17
logger = logging.getLogger(__name__)


def load_tasks(path: str | Path) -> list[TaskSpec]:
    """Load and validate a task dataset stored as a JSON array.

    Args:
        path: Location of a UTF-8 JSON file whose top-level value is an array
            of task objects.

    Returns:
        One ``TaskSpec`` per array element, in file order.

    Raises:
        ValueError: If the file cannot be read, is not valid JSON, is not a
            JSON array, or an element fails ``TaskSpec.model_validate``. The
            underlying error is chained as ``__cause__``.
    """
    dataset_path = Path(path)
    try:
        content = dataset_path.read_text(encoding="utf-8").strip()
        objects = json.loads(content)
        if not isinstance(objects, list):
            raise ValueError(f"Expected a JSON array of task objects in {dataset_path}")
        return [TaskSpec.model_validate(obj) for obj in objects]
    except Exception as exc:
        # Every failure mode is reported with the same message; the specific
        # reason (including the "Expected a JSON array" error) stays on __cause__.
        raise ValueError(f"Dataset not formatted correctly in {dataset_path}") from exc
30
+
31
+
32
+ _QUALITATIVE_CONCURRENCY = 8
33
+
34
+
35
+ async def _evaluate_qualitative(results: list[EpisodeResult], tasks_by_id: dict[str, TaskSpec]) -> None:
36
+ sem = asyncio.Semaphore(_QUALITATIVE_CONCURRENCY)
37
+
38
+ async def _score(episode: EpisodeResult) -> None:
39
+ task = tasks_by_id.get(episode.task_id)
40
+ if task is None:
41
+ return
42
+ logger.info(f"Evaluating qualitative scores for episode {episode.task_id}")
43
+ query = " -> ".join(task.turns)
44
+ raw_events = [event.model_dump() for event in episode.trace.events]
45
+ context = build_context_from_events(raw_events)
46
+ user_facts = build_user_facts_summary(raw_events)
47
+ tool_calls = json.dumps(episode.trace.tool_calls, ensure_ascii=False, indent=2)
48
+ expected_desc = build_expected_desc(
49
+ status=task.expected.status,
50
+ expected_outcome=task.expected.expected_outcome,
51
+ output_must_contain=task.expected.output_must_contain,
52
+ expected_tool_calls=task.expected.tool_calls or None,
53
+ )
54
+
55
+ async with sem:
56
+ episode.qualitative_scores = await asyncio.to_thread(
57
+ evaluate_episode_quality,
58
+ query,
59
+ episode.final_output,
60
+ episode.final_status,
61
+ context,
62
+ expected_desc,
63
+ tool_calls,
64
+ bool(task.expected.tool_calls),
65
+ user_facts,
66
+ )
67
+
68
+ await asyncio.gather(*(_score(ep) for ep in results))
69
+
70
async def run_evaluation(
    agent_url: str,
    model: str,
    model_provider: str,
    input_path: Path,
    run_name: str = "benchmark",
    task_id: str = "",
    enable_scoring: bool = True,
    enable_qualitative_eval: bool = False,
    k: int = 1,
    out_dir: str = "output",
) -> None:
    """Run the benchmark for a dataset and write its artifacts.

    Loads the tasks from ``input_path``, runs them through a ``BenchRunner``
    backed by a ``BatA2AAdapter`` for ``agent_url``, optionally adds
    qualitative scores, writes the run summary and, when ``enable_scoring`` is
    true, writes ``metrics.json`` into the runner's ``run_dir``.

    Raises:
        ValueError: Propagated from ``load_tasks`` when the dataset is invalid.
    """
    tasks = load_tasks(input_path)
    tasks_by_id = {spec.id: spec for spec in tasks}

    adapter = BatA2AAdapter(agent_url=agent_url)
    run_config = RunConfig(
        run_name=run_name,
        out_dir=out_dir,
        k=k,
        model=f"{model_provider}:{model}",
        task_id=task_id,
    )
    bench_runner = BenchRunner(adapter=adapter, config=run_config)

    logger.info(f"Running evaluation on dataset: {input_path}")
    results = await bench_runner.run(tasks)
    logger.info(f"Evaluation complete. Collected {len(results)} result(s)")

    if enable_qualitative_eval:
        logger.info("Running qualitative evaluation...")
        await _evaluate_qualitative(results, tasks_by_id)
        # NOTE(review): indentation was lost in the extracted source; this
        # persist call is assumed to belong to the qualitative branch — confirm.
        bench_runner.persist_results(results)

    bench_runner.write_summary(results)

    if enable_scoring:
        metrics = summarize_episode_metrics(results, k=k)
        if bench_runner.run_dir:
            metrics_file = bench_runner.run_dir / "metrics.json"
            metrics_file.write_text(
                json.dumps(metrics, indent=2, ensure_ascii=False),
                encoding="utf-8",
            )

    # Both the scoring and the non-scoring path report the artifact location
    # only when the runner exposes a run directory.
    if bench_runner.run_dir:
        logger.info(f"Artifacts written to: {bench_runner.run_dir}")
121
+
122
+
123
def main() -> int:
    """Parse command-line arguments and run one evaluation.

    Returns:
        ``0`` after ``run_evaluation`` completes. Argument errors exit through
        ``argparse`` before anything is run.
    """
    parser = argparse.ArgumentParser(description="Run A2A evaluation in the agent environment")
    mandatory_flags = (
        "--dataset",
        "--output-dir",
        "--agent-url",
        "--model-provider",
        "--model",
        "--task-id",
    )
    for flag in mandatory_flags:
        parser.add_argument(flag, required=True)
    parser.add_argument("--run-name", default="benchmark")
    parser.add_argument("--k", type=int, default=1)
    parser.add_argument("--qualitative", action="store_true")
    options = parser.parse_args()

    # Both paths are made absolute before the evaluation starts.
    dataset_file = Path(options.dataset).resolve()
    artifacts_root = Path(options.output_dir).resolve()

    evaluation = run_evaluation(
        agent_url=options.agent_url,
        model=options.model,
        model_provider=options.model_provider,
        input_path=dataset_file,
        run_name=options.run_name,
        task_id=options.task_id,
        enable_scoring=True,
        enable_qualitative_eval=options.qualitative,
        k=options.k,
        out_dir=str(artifacts_root),
    )
    asyncio.run(evaluation)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
eval/engine/plotter.py ADDED
@@ -0,0 +1,347 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Dict, List, Tuple
6
+
7
+ import matplotlib
8
+ matplotlib.use("Agg")
9
+ import matplotlib.pyplot as plt
10
+
11
+
12
+ def _extract_base_task_id(task_id: str) -> str:
13
+ return re.sub(r"__try\d+$", "", task_id)
14
+
15
+
16
+ def _group_episodes_by_task(per_episode: List[dict]) -> Dict[str, List[dict]]:
17
+ grouped: Dict[str, List[dict]] = {}
18
+ for episode in per_episode:
19
+ base_id = _extract_base_task_id(episode["task_id"])
20
+ grouped.setdefault(base_id, []).append(episode)
21
+ return grouped
22
+
23
+
24
+ def _average_episodes(episodes: List[dict]) -> dict:
25
+ if len(episodes) == 1:
26
+ return episodes[0]
27
+
28
+ n = len(episodes)
29
+ avg_time = sum(ep["time"]["wall_ms"] for ep in episodes) / n
30
+ avg_prompt = sum(ep["tokens"]["prompt_tokens"] for ep in episodes) / n
31
+ avg_completion = sum(ep["tokens"]["completion_tokens"] for ep in episodes) / n
32
+ avg_total = sum(ep["tokens"]["total_tokens"] for ep in episodes) / n
33
+ success_rate = sum(ep["success"] for ep in episodes) / n
34
+
35
+ result = {
36
+ "task_id": _extract_base_task_id(episodes[0]["task_id"]),
37
+ "status": episodes[0]["status"],
38
+ "success": success_rate >= 0.5,
39
+ "success_rate": success_rate,
40
+ "attempts": n,
41
+ "time": {"wall_ms": avg_time},
42
+ "tokens": {
43
+ "prompt_tokens": avg_prompt,
44
+ "completion_tokens": avg_completion,
45
+ "total_tokens": avg_total,
46
+ },
47
+ }
48
+
49
+ qual_episodes = [ep for ep in episodes if "qualitative" in ep]
50
+ if qual_episodes:
51
+ relevance_vals = [ep["qualitative"]["response_relevance"] for ep in qual_episodes if ep["qualitative"].get("response_relevance") is not None]
52
+ completion_vals = [ep["qualitative"]["task_completion_quality"] for ep in qual_episodes if ep["qualitative"].get("task_completion_quality") is not None]
53
+ hallucination_vals = [ep["qualitative"]["hallucination_score"] for ep in qual_episodes if ep["qualitative"].get("hallucination_score") is not None]
54
+
55
+ result["qualitative"] = {
56
+ "response_relevance": sum(relevance_vals) / len(relevance_vals) if relevance_vals else 0,
57
+ "task_completion_quality": sum(completion_vals) / len(completion_vals) if completion_vals else 0,
58
+ "hallucination_score": sum(hallucination_vals) / len(hallucination_vals) if hallucination_vals else 0,
59
+ }
60
+
61
+ return result
62
+
63
+
64
+ def _get_per_episode_averages(metrics_data: dict) -> List[dict]:
65
+ per_episode = metrics_data.get("per_episode", [])
66
+ if not per_episode:
67
+ return []
68
+ grouped = _group_episodes_by_task(per_episode)
69
+ return [_average_episodes(episodes) for episodes in grouped.values()]
70
+
71
+
72
+ def _plot_comparison(metrics: Dict[str, dict]) -> List[Tuple[str, plt.Figure]]:
73
+ """Build summary comparison charts across runs. Returns (name, figure) pairs."""
74
+ if not metrics:
75
+ return []
76
+
77
+ run_names = list(metrics.keys())
78
+ display_names = list(run_names)
79
+
80
+ has_qualitative = any("qualitative" in m.get("summary", {}) for m in metrics.values())
81
+
82
+ times, prompt_tokens, completion_tokens, total_tokens = [], [], [], []
83
+ relevance_scores, completion_quality_scores, hallucination_scores = [], [], []
84
+
85
+ for name in run_names:
86
+ summary = metrics[name].get("summary", {})
87
+ times.append(summary.get("time", {}).get("total_wall_ms", 0) / 1000)
88
+ tokens = summary.get("tokens", {})
89
+ prompt_tokens.append(tokens.get("prompt_tokens_total", 0))
90
+ completion_tokens.append(tokens.get("completion_tokens_total", 0))
91
+ total_tokens.append(tokens.get("total_tokens_total", 0))
92
+ qual = summary.get("qualitative", {})
93
+ relevance_scores.append(qual.get("response_relevance", {}).get("avg", None))
94
+ completion_quality_scores.append(qual.get("task_completion_quality", {}).get("avg", None))
95
+ hallucination_scores.append(qual.get("hallucination_score", {}).get("avg", None))
96
+
97
+ figures: List[Tuple[str, plt.Figure]] = []
98
+
99
+ # 1. Total execution time
100
+ fig1, ax1 = plt.subplots(figsize=(10, 6))
101
+ bars1 = ax1.bar(range(len(run_names)), times, color="steelblue", alpha=0.7)
102
+ ax1.set_xlabel("Run")
103
+ ax1.set_ylabel("Time (seconds)")
104
+ ax1.set_title("Total Execution Time", fontsize=14, fontweight="bold")
105
+ ax1.set_xticks(range(len(run_names)))
106
+ ax1.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
107
+ ax1.grid(axis="y", alpha=0.3)
108
+ for bar, val in zip(bars1, times):
109
+ ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:.1f}s", ha="center", va="bottom", fontsize=8)
110
+ fig1.tight_layout()
111
+ figures.append(("execution_time", fig1))
112
+
113
+ # 2. Time vs tokens mirror chart
114
+ fig2, ax2 = plt.subplots(figsize=(max(10, len(run_names) * 1.2), 7))
115
+ x2 = range(len(run_names))
116
+ max_time = max(times) if max(times) > 0 else 1
117
+ max_tok = max(total_tokens) if max(total_tokens) > 0 else 1
118
+ times_norm = [t / max_time for t in times]
119
+ tokens_norm = [-t / max_tok for t in total_tokens]
120
+ ax2.bar(x2, times_norm, color="steelblue", alpha=0.75, label="Execution Time")
121
+ ax2.bar(x2, tokens_norm, color="darkorange", alpha=0.75, label="Total Tokens")
122
+ ax2.axhline(0, color="black", linewidth=0.8)
123
+ for xi, t_n, t_val, tok_n, tok_val in zip(x2, times_norm, times, tokens_norm, total_tokens):
124
+ ax2.text(xi, t_n + 0.02, f"{t_val:.1f}s", ha="center", va="bottom", fontsize=8, color="steelblue")
125
+ ax2.text(xi, tok_n - 0.02, f"{tok_val:,}", ha="center", va="top", fontsize=8, color="darkorange")
126
+ ax2.set_xticks(x2)
127
+ ax2.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
128
+ ax2.set_yticks([-1, -0.5, 0, 0.5, 1])
129
+ ax2.set_yticklabels([f"max\n({max_tok:,} tok)", "50%", "0", "50%", f"max\n({max_time:.1f}s)"], fontsize=8)
130
+ ax2.set_title("Execution Time ↑ vs Total Tokens ↓", fontsize=14, fontweight="bold")
131
+ ax2.legend(fontsize=9)
132
+ ax2.grid(axis="y", alpha=0.2)
133
+ fig2.tight_layout()
134
+ figures.append(("time_vs_total_tokens", fig2))
135
+
136
+ # 3. Token usage stacked
137
+ fig3, ax3 = plt.subplots(figsize=(10, 6))
138
+ x3 = range(len(run_names))
139
+ ax3.bar(x3, prompt_tokens, label="Prompt Tokens", color="cornflowerblue", alpha=0.8)
140
+ ax3.bar(x3, completion_tokens, bottom=prompt_tokens, label="Completion Tokens", color="lightcoral", alpha=0.8)
141
+ ax3.set_xlabel("Run")
142
+ ax3.set_ylabel("Token Count")
143
+ ax3.set_title("Token Usage (Prompt vs Completion)", fontsize=14, fontweight="bold")
144
+ ax3.set_xticks(x3)
145
+ ax3.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
146
+ ax3.legend()
147
+ ax3.grid(axis="y", alpha=0.3)
148
+ fig3.tight_layout()
149
+ figures.append(("token_usage", fig3))
150
+
151
+ # 4. Total tokens
152
+ fig4, ax4 = plt.subplots(figsize=(10, 6))
153
+ bars4 = ax4.bar(range(len(run_names)), total_tokens, color="mediumpurple", alpha=0.7)
154
+ ax4.set_xlabel("Run")
155
+ ax4.set_ylabel("Total Tokens")
156
+ ax4.set_title("Total Tokens", fontsize=14, fontweight="bold")
157
+ ax4.set_xticks(range(len(run_names)))
158
+ ax4.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
159
+ ax4.grid(axis="y", alpha=0.3)
160
+ for bar, val in zip(bars4, total_tokens):
161
+ ax4.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:,}", ha="center", va="bottom", fontsize=8)
162
+ fig4.tight_layout()
163
+ figures.append(("total_tokens", fig4))
164
+
165
+ if not has_qualitative:
166
+ return figures
167
+
168
+ # 5. Combined qualitative
169
+ fig5, ax5 = plt.subplots(figsize=(10, 6))
170
+ x_pos = range(len(run_names))
171
+ width = 0.25
172
+ rel_vals = [v if v is not None else 0 for v in relevance_scores]
173
+ comp_vals = [v if v is not None else 0 for v in completion_quality_scores]
174
+ hall_vals = [v if v is not None else 0 for v in hallucination_scores]
175
+ ax5.bar([i - width for i in x_pos], rel_vals, width, label="Response Relevance", color="lightblue", alpha=0.8)
176
+ ax5.bar(x_pos, comp_vals, width, label="Task Completion", color="lightgreen", alpha=0.8)
177
+ ax5.bar([i + width for i in x_pos], hall_vals, width, label="Groundedness", color="khaki", alpha=0.8)
178
+ ax5.set_xlabel("Run")
179
+ ax5.set_ylabel("Score (0-1)")
180
+ ax5.set_title("Qualitative Metrics (LLM Judge)", fontsize=14, fontweight="bold")
181
+ ax5.set_xticks(x_pos)
182
+ ax5.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
183
+ ax5.set_ylim(0, 1.1)
184
+ ax5.legend(fontsize=8)
185
+ ax5.grid(axis="y", alpha=0.3)
186
+ ax5.axhline(y=0.7, color="orange", linestyle="--", alpha=0.3)
187
+ fig5.tight_layout()
188
+ figures.append(("qualitative_metrics", fig5))
189
+
190
+ # 6–8. Individual qualitative charts
191
+ for metric_name, vals, title in [
192
+ ("response_relevance", rel_vals, "Response Relevance"),
193
+ ("task_completion_quality", comp_vals, "Task Completion Quality"),
194
+ ("hallucination_score", hall_vals, "Groundedness (Hallucination Score)"),
195
+ ]:
196
+ fig, ax = plt.subplots(figsize=(10, 6))
197
+ colors = ["green" if s >= 0.8 else "orange" if s >= 0.6 else "red" for s in vals]
198
+ bars = ax.bar(range(len(run_names)), vals, color=colors, alpha=0.7)
199
+ ax.set_xlabel("Run")
200
+ ax.set_ylabel("Score (0-1)")
201
+ ax.set_title(title, fontsize=14, fontweight="bold")
202
+ ax.set_xticks(range(len(run_names)))
203
+ ax.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
204
+ ax.set_ylim(0, 1.1)
205
+ ax.grid(axis="y", alpha=0.3)
206
+ ax.axhline(y=0.8, color="green", linestyle="--", alpha=0.3, linewidth=1)
207
+ ax.axhline(y=0.6, color="orange", linestyle="--", alpha=0.3, linewidth=1)
208
+ for bar, val in zip(bars, vals):
209
+ if val > 0:
210
+ ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:.3f}", ha="center", va="bottom", fontsize=8)
211
+ fig.tight_layout()
212
+ figures.append((metric_name, fig))
213
+
214
+ return figures
215
+
216
+
217
+ def _plot_per_episode_comparison(
218
+ metrics: Dict[str, dict],
219
+ task_filter: str | None = None,
220
+ ) -> List[Tuple[str, plt.Figure]]:
221
+ """Build per-task charts comparing all runs. Returns (name, figure) pairs.
222
+
223
+ If ``task_filter`` is provided, only tasks whose id contains the substring
224
+ are plotted.
225
+ """
226
+ if not metrics:
227
+ return []
228
+
229
+ tasks_by_model: Dict[str, Dict[str, dict]] = {}
230
+ all_task_ids: set = set()
231
+
232
+ for run_name, data in metrics.items():
233
+ episodes = _get_per_episode_averages(data)
234
+ tasks_by_model[run_name] = {ep["task_id"]: ep for ep in episodes}
235
+ all_task_ids.update(ep["task_id"] for ep in episodes)
236
+
237
+ if not all_task_ids:
238
+ return []
239
+
240
+ if task_filter:
241
+ all_task_ids = {tid for tid in all_task_ids if task_filter in tid}
242
+ if not all_task_ids:
243
+ return []
244
+
245
+ sorted_tasks = sorted(all_task_ids)
246
+ model_names = list(metrics.keys())
247
+ has_qualitative = any(
248
+ any("qualitative" in ep for ep in _get_per_episode_averages(data))
249
+ for data in metrics.values()
250
+ )
251
+
252
+ figures: List[Tuple[str, plt.Figure]] = []
253
+
254
+ for task_id in sorted_tasks:
255
+ task_data = []
256
+ available_models = []
257
+ for run_name in model_names:
258
+ if task_id in tasks_by_model[run_name]:
259
+ task_data.append(tasks_by_model[run_name][task_id])
260
+ available_models.append(run_name)
261
+
262
+ if not task_data:
263
+ continue
264
+
265
+ display_names = list(available_models)
266
+ n_plots = 3 if has_qualitative else 2
267
+ fig, axes = plt.subplots(n_plots, 1, figsize=(max(10, len(task_data) * 0.8), 4 * n_plots))
268
+ if n_plots == 1:
269
+ axes = [axes]
270
+
271
+ fig.suptitle(f"Task: {task_id}", fontsize=14, fontweight="bold")
272
+
273
+ # Time
274
+ times = [ep["time"]["wall_ms"] / 1000 for ep in task_data]
275
+ axes[0].bar(range(len(display_names)), times, color="steelblue", alpha=0.7)
276
+ axes[0].set_ylabel("Time (s)", fontsize=11)
277
+ axes[0].set_title("Execution Time by Model", fontsize=12, fontweight="bold")
278
+ axes[0].set_xticks(range(len(display_names)))
279
+ axes[0].set_xticklabels(display_names, rotation=45, ha="right", fontsize=10)
280
+ axes[0].grid(axis="y", alpha=0.3)
281
+ for bar, val in zip(axes[0].patches, times):
282
+ axes[0].text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:.1f}s", ha="center", va="bottom", fontsize=9)
283
+
284
+ # Tokens
285
+ prompt_tokens = [ep["tokens"]["prompt_tokens"] for ep in task_data]
286
+ completion_tokens = [ep["tokens"]["completion_tokens"] for ep in task_data]
287
+ x = range(len(display_names))
288
+ axes[1].bar(x, prompt_tokens, label="Prompt", color="cornflowerblue", alpha=0.8)
289
+ axes[1].bar(x, completion_tokens, bottom=prompt_tokens, label="Completion", color="lightcoral", alpha=0.8)
290
+ axes[1].set_ylabel("Tokens", fontsize=11)
291
+ axes[1].set_title("Token Usage by Model", fontsize=12, fontweight="bold")
292
+ axes[1].set_xticks(x)
293
+ axes[1].set_xticklabels(display_names, rotation=45, ha="right", fontsize=10)
294
+ axes[1].legend(fontsize=9)
295
+ axes[1].grid(axis="y", alpha=0.3)
296
+
297
+ # Qualitative
298
+ if has_qualitative:
299
+ width = 0.25
300
+ x_pos = range(len(display_names))
301
+ relevance = [ep.get("qualitative", {}).get("response_relevance", 0) for ep in task_data]
302
+ completion_q = [ep.get("qualitative", {}).get("task_completion_quality", 0) for ep in task_data]
303
+ hallucination = [ep.get("qualitative", {}).get("hallucination_score", 0) for ep in task_data]
304
+ axes[2].bar([i - width for i in x_pos], relevance, width, label="Relevance", color="lightblue", alpha=0.8)
305
+ axes[2].bar(x_pos, completion_q, width, label="Completion", color="lightgreen", alpha=0.8)
306
+ axes[2].bar([i + width for i in x_pos], hallucination, width, label="Groundedness", color="khaki", alpha=0.8)
307
+ axes[2].set_ylabel("Score", fontsize=11)
308
+ axes[2].set_title("Qualitative Metrics by Model", fontsize=12, fontweight="bold")
309
+ axes[2].set_xticks(x_pos)
310
+ axes[2].set_xticklabels(display_names, rotation=45, ha="right", fontsize=10)
311
+ axes[2].set_ylim(0, 1.1)
312
+ axes[2].legend(fontsize=9)
313
+ axes[2].grid(axis="y", alpha=0.3)
314
+ axes[2].axhline(y=0.7, color="orange", linestyle="--", alpha=0.3)
315
+
316
+ fig.tight_layout()
317
+ safe_task = task_id.replace(":", "-").replace("/", "-").replace(" ", "_")
318
+ figures.append((f"per_task_{safe_task}", fig))
319
+
320
+ return figures
321
+
322
+
323
+ def _save_figures(figures: List[Tuple[str, plt.Figure]], output_dir: Path) -> List[Path]:
324
+ output_dir.mkdir(parents=True, exist_ok=True)
325
+ saved: List[Path] = []
326
+ for name, fig in figures:
327
+ path = output_dir / f"metrics_{name}.png"
328
+ fig.savefig(path, dpi=150, bbox_inches="tight")
329
+ plt.close(fig)
330
+ saved.append(path)
331
+ return saved
332
+
333
+
334
def generate_and_save_plots(
    metrics: Dict[str, dict],
    output_dir: Path,
    task_filter: str | None = None,
) -> List[Path]:
    """Render every chart for ``metrics`` and save them under ``output_dir``.

    ``task_filter`` is passed only to the per-task charts, where it is matched
    as a substring of ``task_id``; the summary/comparison charts are built
    from the unfiltered metrics.

    Returns:
        Paths of the saved images, summary charts first.
    """
    summary_charts = _plot_comparison(metrics)
    per_task_charts = _plot_per_episode_comparison(metrics, task_filter=task_filter)
    return _save_figures(summary_charts + per_task_charts, output_dir)
image_defaults.py ADDED
@@ -0,0 +1,80 @@
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+
5
+
6
# Environment variable names consulted for the image repository and registry,
# and the registry used when nothing else is configured.
REPO_ENV_VAR = "BAT_DOCKER_REPO"
REGISTRY_ENV_VAR = "BAT_DOCKER_REGISTRY"
DEFAULT_REGISTRY = "default_registry"


def default_repo_name(context_dir: Path) -> str:
    """Derive the fallback repository path from the directory name.

    The directory name is lower-cased, every run of characters outside
    ``a-z0-9`` becomes a single ``-``, and leading/trailing dashes are
    dropped. If nothing is left, ``agent`` is used.
    """
    slug = re.sub(r"[^a-z0-9]+", "-", context_dir.name.lower()).strip("-")
    if not slug:
        slug = "agent"
    return f"default-repository/{slug}"
14
+
15
+
16
+ def _read_dotenv_value(key: str) -> str | None:
17
+ dotenv_path = Path.cwd() / ".env"
18
+ if not dotenv_path.is_file():
19
+ return None
20
+
21
+ try:
22
+ content = dotenv_path.read_text(encoding="utf-8")
23
+ except OSError:
24
+ return None
25
+
26
+ match = re.search(
27
+ rf"^\s*(?:export\s+)?{re.escape(key)}\s*=\s*(.*?)\s*$",
28
+ content,
29
+ flags=re.MULTILINE,
30
+ )
31
+ if not match:
32
+ return None
33
+
34
+ value = match.group(1).strip()
35
+ if value.startswith("#"):
36
+ return None
37
+
38
+ if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
39
+ value = value[1:-1].strip()
40
+
41
+ if value:
42
+ return value
43
+
44
+ return None
45
+
46
+
47
+ def _resolve_image_value(
48
+ cli_value: str | None,
49
+ *,
50
+ env_key: str,
51
+ default_value: str,
52
+ ) -> str:
53
+ if cli_value:
54
+ return cli_value
55
+
56
+ env_value = os.environ.get(env_key, "").strip()
57
+ if env_value:
58
+ return env_value
59
+
60
+ dotenv_value = _read_dotenv_value(env_key)
61
+ if dotenv_value:
62
+ return dotenv_value
63
+
64
+ return default_value
65
+
66
+
67
def resolve_repo_name(context_dir: Path, repo: str | None) -> str:
    """Resolve the image repository path.

    Uses ``repo`` when given, otherwise the ``BAT_DOCKER_REPO`` setting, and
    falls back to the name derived from ``context_dir``.
    """
    fallback = default_repo_name(context_dir)
    return _resolve_image_value(repo, env_key=REPO_ENV_VAR, default_value=fallback)
73
+
74
+
75
def resolve_registry(context_dir: Path, registry: str | None) -> str:
    """Resolve the Docker registry.

    Uses ``registry`` when given, otherwise the ``BAT_DOCKER_REGISTRY``
    setting, and falls back to ``DEFAULT_REGISTRY``. ``context_dir`` is
    accepted but not consulted by this function.
    """
    lookup = {"env_key": REGISTRY_ENV_VAR, "default_value": DEFAULT_REGISTRY}
    return _resolve_image_value(registry, **lookup)
push/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .push import push_image
2
+
3
+ __all__ = ["push_image"]
push/push.py ADDED
@@ -0,0 +1,58 @@
1
+ import subprocess
2
+ from pathlib import Path
3
+
4
+ import typer
5
+
6
+ from image_defaults import resolve_registry, resolve_repo_name
7
+
8
+
9
def push_image(
    context: Path = typer.Option(
        Path("."),
        "--context",
        "-C",
        help="Directory used to infer the default repository name.",
    ),
    docker_registry: str | None = typer.Option(
        None,
        "--docker-registry",
        help=(
            "Docker registry hostname. Precedence: --docker-registry > "
            "BAT_DOCKER_REGISTRY env var (or .env in current directory) > default_registry."
        ),
    ),
    repo: str | None = typer.Option(
        None,
        "--repo",
        help=(
            "Image repository path. Precedence: --repo > BAT_DOCKER_REPO env var "
            "(or .env in current directory) > default-repository/<project-name>."
        ),
    ),
    tag: str = typer.Option(
        "latest",
        "--tag",
        help="Image tag.",
    ),
) -> None:
    """Push ``<registry>/<repo>:<tag>`` with ``docker push``.

    The registry and repository are resolved through ``resolve_registry`` and
    ``resolve_repo_name``. The command runs with the resolved context
    directory as its working directory.

    Raises:
        typer.Exit: With code 1 when the context directory does not exist or
            the docker executable cannot be found, and with docker's exit code
            (or 1) when the push itself fails.
    """
    context_dir = context.resolve()
    if not context_dir.is_dir():
        typer.secho(f"Context directory not found: {context_dir}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    registry_name = resolve_registry(context_dir, docker_registry)
    repo_name = resolve_repo_name(context_dir, repo)
    image = f"{registry_name}/{repo_name}:{tag}"

    typer.echo(f"Pushing Docker image: {image}")
    try:
        # Argument list (no shell), so the image reference is passed verbatim.
        subprocess.run(["docker", "push", image], check=True, cwd=context_dir)
    except FileNotFoundError as exc:
        typer.secho("Docker executable not found in PATH.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from exc
    except subprocess.CalledProcessError as exc:
        typer.secho("Docker push failed.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=exc.returncode or 1) from exc
    else:
        typer.secho(f"Docker image pushed successfully: {image}", fg=typer.colors.GREEN)
set/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .env import set_env_values
2
+
3
+ __all__ = ["set_env_values"]