bat-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- add/__init__.py +3 -0
- add/client.py +16 -0
- bat_cli-0.1.0.dist-info/METADATA +231 -0
- bat_cli-0.1.0.dist-info/RECORD +47 -0
- bat_cli-0.1.0.dist-info/WHEEL +5 -0
- bat_cli-0.1.0.dist-info/entry_points.txt +2 -0
- bat_cli-0.1.0.dist-info/top_level.txt +8 -0
- build/__init__.py +3 -0
- build/build.py +79 -0
- cli.py +260 -0
- create/__init__.py +3 -0
- create/agent.py +312 -0
- create/templates/agent/.dockerignore +3 -0
- create/templates/agent/.env.template +4 -0
- create/templates/agent/.python-version +1 -0
- create/templates/agent/Dockerfile +37 -0
- create/templates/agent/Makefile +34 -0
- create/templates/agent/README.md +1 -0
- create/templates/agent/__main__.py +2 -0
- create/templates/agent/agent.json.template +12 -0
- create/templates/agent/agent.spec +45 -0
- create/templates/agent/config.yaml +1 -0
- create/templates/agent/llm_client.py.template +36 -0
- create/templates/agent/pyproject.toml.template +9 -0
- create/templates/agent/src/__init__.py +0 -0
- create/templates/agent/src/graph.py +50 -0
- create/templates/agent/src/llm_clients/__init__.py +0 -0
- create/templates/agent/tests/__init__.py +0 -0
- eval/__init__.py +1 -0
- eval/commands.py +562 -0
- eval/engine/__init__.py +1 -0
- eval/engine/adapter.py +251 -0
- eval/engine/bench_runner.py +149 -0
- eval/engine/contracts.py +115 -0
- eval/engine/eval_config.py +294 -0
- eval/engine/evaluator.py +85 -0
- eval/engine/metrics/__init__.py +1 -0
- eval/engine/metrics/llm_evaluators.py +383 -0
- eval/engine/metrics/metrics.py +135 -0
- eval/engine/metrics/qualitative_helpers.py +64 -0
- eval/engine/orchestrator.py +157 -0
- eval/engine/plotter.py +347 -0
- image_defaults.py +80 -0
- push/__init__.py +3 -0
- push/push.py +58 -0
- set/__init__.py +3 -0
- set/env.py +50 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import asyncio
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from .adapter import BatA2AAdapter
|
|
10
|
+
from .bench_runner import BenchRunner, RunConfig
|
|
11
|
+
from .contracts import EpisodeResult, TaskSpec
|
|
12
|
+
from .metrics.llm_evaluators import evaluate_episode_quality
|
|
13
|
+
from .metrics.metrics import summarize_episode_metrics
|
|
14
|
+
from .metrics.qualitative_helpers import build_context_from_events, build_expected_desc, build_user_facts_summary
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def load_tasks(path: str | Path) -> list[TaskSpec]:
    """Load and validate the task dataset stored at ``path``.

    The file must contain a UTF-8 encoded JSON array; every element is
    validated with ``TaskSpec.model_validate``.

    Args:
        path: Location of the JSON dataset.

    Returns:
        One ``TaskSpec`` per array element, in file order.

    Raises:
        ValueError: For any failure while reading, parsing or validating the
            dataset (unreadable file, invalid JSON, non-array top level,
            invalid task object). The underlying error is chained as
            ``__cause__``.
    """
    dataset_path = Path(path)
    try:
        content = dataset_path.read_text(encoding="utf-8").strip()
        objects = json.loads(content)
        if not isinstance(objects, list):
            # Deliberately raised inside the ``try``: it is wrapped below like
            # every other failure, so callers see a single error message shape.
            raise ValueError(f"Expected a JSON array of task objects in {dataset_path}")
        return [TaskSpec.model_validate(obj) for obj in objects]
    except Exception as exc:
        raise ValueError(f"Dataset not formatted correctly in {dataset_path}") from exc
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Size of the semaphore in _evaluate_qualitative: the maximum number of
# qualitative (LLM-judge) evaluations allowed to run at the same time.
_QUALITATIVE_CONCURRENCY = 8
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def _evaluate_qualitative(results: list[EpisodeResult], tasks_by_id: dict[str, TaskSpec]) -> None:
    """Attach qualitative scores to every episode whose task is known.

    ``evaluate_episode_quality`` is run in a worker thread for each episode
    (via ``asyncio.to_thread``), with at most ``_QUALITATIVE_CONCURRENCY``
    calls in flight. Its return value is stored on
    ``episode.qualitative_scores``. Episodes whose ``task_id`` is missing from
    ``tasks_by_id`` are left untouched.

    Args:
        results: Episodes to score; mutated in place.
        tasks_by_id: Task specifications keyed by task id.
    """
    gate = asyncio.Semaphore(_QUALITATIVE_CONCURRENCY)

    async def _score(episode: EpisodeResult) -> None:
        spec = tasks_by_id.get(episode.task_id)
        if spec is None:
            return

        logger.info(f"Evaluating qualitative scores for episode {episode.task_id}")

        # The evaluator inputs are built before the semaphore is taken, so
        # only the evaluator call itself is rate limited.
        joined_turns = " -> ".join(spec.turns)
        dumped_events = [event.model_dump() for event in episode.trace.events]
        event_context = build_context_from_events(dumped_events)
        facts_summary = build_user_facts_summary(dumped_events)
        serialized_calls = json.dumps(episode.trace.tool_calls, ensure_ascii=False, indent=2)
        expectation_text = build_expected_desc(
            status=spec.expected.status,
            expected_outcome=spec.expected.expected_outcome,
            output_must_contain=spec.expected.output_must_contain,
            expected_tool_calls=spec.expected.tool_calls or None,
        )

        async with gate:
            scores = await asyncio.to_thread(
                evaluate_episode_quality,
                joined_turns,
                episode.final_output,
                episode.final_status,
                event_context,
                expectation_text,
                serialized_calls,
                bool(spec.expected.tool_calls),
                facts_summary,
            )
            episode.qualitative_scores = scores

    pending = [_score(episode) for episode in results]
    await asyncio.gather(*pending)
|
|
69
|
+
|
|
70
|
+
async def run_evaluation(
    agent_url: str,
    model: str,
    model_provider: str,
    input_path: Path,
    run_name: str = "benchmark",
    task_id: str = "",
    enable_scoring: bool = True,
    enable_qualitative_eval: bool = False,
    k: int = 1,
    out_dir: str = "output",
) -> None:
    """Run the benchmark against an agent and write the resulting artifacts.

    Steps: load the dataset, run every task through ``BenchRunner``,
    optionally add qualitative scores, write the run summary and, when
    scoring is enabled, write ``metrics.json`` into the run directory.

    Args:
        agent_url: URL handed to ``BatA2AAdapter``.
        model: Model name; combined with ``model_provider`` as
            ``"<provider>:<model>"`` in the run configuration.
        model_provider: Provider part of the model identifier.
        input_path: Path of the JSON task dataset.
        run_name: Name passed to ``RunConfig``.
        task_id: Task id passed to ``RunConfig``.
        enable_scoring: When false, stop after the summary is written.
        enable_qualitative_eval: When true, run the qualitative evaluation
            and persist the results again afterwards.
        k: Passed to ``RunConfig`` and to ``summarize_episode_metrics``.
        out_dir: Output directory passed to ``RunConfig``.

    Raises:
        ValueError: Propagated from ``load_tasks`` when the dataset is invalid.
    """
    # NOTE(review): indentation was lost in the extracted source. The nesting
    # of persist_results() (inside the qualitative branch) and of the final
    # log call (inside the run_dir check) follows the original blank-line
    # grouping -- confirm against the real file.
    tasks = load_tasks(input_path)
    tasks_by_id = {spec.id: spec for spec in tasks}

    adapter = BatA2AAdapter(agent_url=agent_url)
    run_config = RunConfig(
        run_name=run_name,
        out_dir=out_dir,
        k=k,
        model=f"{model_provider}:{model}",
        task_id=task_id,
    )
    bench_runner = BenchRunner(adapter=adapter, config=run_config)

    logger.info(f"Running evaluation on dataset: {input_path}")
    results = await bench_runner.run(tasks)
    logger.info(f"Evaluation complete. Collected {len(results)} result(s)")

    if enable_qualitative_eval:
        logger.info("Running qualitative evaluation...")
        await _evaluate_qualitative(results, tasks_by_id)
        # Persist again so the stored results include the qualitative scores.
        bench_runner.persist_results(results)

    bench_runner.write_summary(results)

    if not enable_scoring:
        if bench_runner.run_dir:
            logger.info(f"Artifacts written to: {bench_runner.run_dir}")
        return

    metrics = summarize_episode_metrics(results, k=k)
    if bench_runner.run_dir:
        metrics_file = bench_runner.run_dir / "metrics.json"
        metrics_file.write_text(
            json.dumps(metrics, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        logger.info(f"Artifacts written to: {bench_runner.run_dir}")
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def main() -> int:
    """Parse command-line arguments and run one evaluation.

    Scoring is always enabled on this entry point; qualitative evaluation is
    enabled with ``--qualitative``.

    Returns:
        ``0`` once the evaluation coroutine has completed.
    """
    parser = argparse.ArgumentParser(description="Run A2A evaluation in the agent environment")

    # Mandatory string options, registered in the original order so the
    # generated help text is unchanged.
    for required_flag in (
        "--dataset",
        "--output-dir",
        "--agent-url",
        "--model-provider",
        "--model",
        "--task-id",
    ):
        parser.add_argument(required_flag, required=True)
    parser.add_argument("--run-name", default="benchmark")
    parser.add_argument("--k", type=int, default=1)
    parser.add_argument("--qualitative", action="store_true")
    args = parser.parse_args()

    dataset = Path(args.dataset).resolve()
    output_dir = Path(args.output_dir).resolve()

    evaluation = run_evaluation(
        agent_url=args.agent_url,
        model=args.model,
        model_provider=args.model_provider,
        input_path=dataset,
        run_name=args.run_name,
        task_id=args.task_id,
        enable_scoring=True,
        enable_qualitative_eval=args.qualitative,
        k=args.k,
        out_dir=str(output_dir),
    )
    asyncio.run(evaluation)
    return 0
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
eval/engine/plotter.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, List, Tuple
|
|
6
|
+
|
|
7
|
+
import matplotlib
|
|
8
|
+
matplotlib.use("Agg")
|
|
9
|
+
import matplotlib.pyplot as plt
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _extract_base_task_id(task_id: str) -> str:
|
|
13
|
+
return re.sub(r"__try\d+$", "", task_id)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _group_episodes_by_task(per_episode: List[dict]) -> Dict[str, List[dict]]:
|
|
17
|
+
grouped: Dict[str, List[dict]] = {}
|
|
18
|
+
for episode in per_episode:
|
|
19
|
+
base_id = _extract_base_task_id(episode["task_id"])
|
|
20
|
+
grouped.setdefault(base_id, []).append(episode)
|
|
21
|
+
return grouped
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _average_episodes(episodes: List[dict]) -> dict:
|
|
25
|
+
if len(episodes) == 1:
|
|
26
|
+
return episodes[0]
|
|
27
|
+
|
|
28
|
+
n = len(episodes)
|
|
29
|
+
avg_time = sum(ep["time"]["wall_ms"] for ep in episodes) / n
|
|
30
|
+
avg_prompt = sum(ep["tokens"]["prompt_tokens"] for ep in episodes) / n
|
|
31
|
+
avg_completion = sum(ep["tokens"]["completion_tokens"] for ep in episodes) / n
|
|
32
|
+
avg_total = sum(ep["tokens"]["total_tokens"] for ep in episodes) / n
|
|
33
|
+
success_rate = sum(ep["success"] for ep in episodes) / n
|
|
34
|
+
|
|
35
|
+
result = {
|
|
36
|
+
"task_id": _extract_base_task_id(episodes[0]["task_id"]),
|
|
37
|
+
"status": episodes[0]["status"],
|
|
38
|
+
"success": success_rate >= 0.5,
|
|
39
|
+
"success_rate": success_rate,
|
|
40
|
+
"attempts": n,
|
|
41
|
+
"time": {"wall_ms": avg_time},
|
|
42
|
+
"tokens": {
|
|
43
|
+
"prompt_tokens": avg_prompt,
|
|
44
|
+
"completion_tokens": avg_completion,
|
|
45
|
+
"total_tokens": avg_total,
|
|
46
|
+
},
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
qual_episodes = [ep for ep in episodes if "qualitative" in ep]
|
|
50
|
+
if qual_episodes:
|
|
51
|
+
relevance_vals = [ep["qualitative"]["response_relevance"] for ep in qual_episodes if ep["qualitative"].get("response_relevance") is not None]
|
|
52
|
+
completion_vals = [ep["qualitative"]["task_completion_quality"] for ep in qual_episodes if ep["qualitative"].get("task_completion_quality") is not None]
|
|
53
|
+
hallucination_vals = [ep["qualitative"]["hallucination_score"] for ep in qual_episodes if ep["qualitative"].get("hallucination_score") is not None]
|
|
54
|
+
|
|
55
|
+
result["qualitative"] = {
|
|
56
|
+
"response_relevance": sum(relevance_vals) / len(relevance_vals) if relevance_vals else 0,
|
|
57
|
+
"task_completion_quality": sum(completion_vals) / len(completion_vals) if completion_vals else 0,
|
|
58
|
+
"hallucination_score": sum(hallucination_vals) / len(hallucination_vals) if hallucination_vals else 0,
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
return result
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _get_per_episode_averages(metrics_data: dict) -> List[dict]:
|
|
65
|
+
per_episode = metrics_data.get("per_episode", [])
|
|
66
|
+
if not per_episode:
|
|
67
|
+
return []
|
|
68
|
+
grouped = _group_episodes_by_task(per_episode)
|
|
69
|
+
return [_average_episodes(episodes) for episodes in grouped.values()]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _plot_comparison(metrics: Dict[str, dict]) -> List[Tuple[str, plt.Figure]]:
|
|
73
|
+
"""Build summary comparison charts across runs. Returns (name, figure) pairs."""
|
|
74
|
+
if not metrics:
|
|
75
|
+
return []
|
|
76
|
+
|
|
77
|
+
run_names = list(metrics.keys())
|
|
78
|
+
display_names = list(run_names)
|
|
79
|
+
|
|
80
|
+
has_qualitative = any("qualitative" in m.get("summary", {}) for m in metrics.values())
|
|
81
|
+
|
|
82
|
+
times, prompt_tokens, completion_tokens, total_tokens = [], [], [], []
|
|
83
|
+
relevance_scores, completion_quality_scores, hallucination_scores = [], [], []
|
|
84
|
+
|
|
85
|
+
for name in run_names:
|
|
86
|
+
summary = metrics[name].get("summary", {})
|
|
87
|
+
times.append(summary.get("time", {}).get("total_wall_ms", 0) / 1000)
|
|
88
|
+
tokens = summary.get("tokens", {})
|
|
89
|
+
prompt_tokens.append(tokens.get("prompt_tokens_total", 0))
|
|
90
|
+
completion_tokens.append(tokens.get("completion_tokens_total", 0))
|
|
91
|
+
total_tokens.append(tokens.get("total_tokens_total", 0))
|
|
92
|
+
qual = summary.get("qualitative", {})
|
|
93
|
+
relevance_scores.append(qual.get("response_relevance", {}).get("avg", None))
|
|
94
|
+
completion_quality_scores.append(qual.get("task_completion_quality", {}).get("avg", None))
|
|
95
|
+
hallucination_scores.append(qual.get("hallucination_score", {}).get("avg", None))
|
|
96
|
+
|
|
97
|
+
figures: List[Tuple[str, plt.Figure]] = []
|
|
98
|
+
|
|
99
|
+
# 1. Total execution time
|
|
100
|
+
fig1, ax1 = plt.subplots(figsize=(10, 6))
|
|
101
|
+
bars1 = ax1.bar(range(len(run_names)), times, color="steelblue", alpha=0.7)
|
|
102
|
+
ax1.set_xlabel("Run")
|
|
103
|
+
ax1.set_ylabel("Time (seconds)")
|
|
104
|
+
ax1.set_title("Total Execution Time", fontsize=14, fontweight="bold")
|
|
105
|
+
ax1.set_xticks(range(len(run_names)))
|
|
106
|
+
ax1.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
|
|
107
|
+
ax1.grid(axis="y", alpha=0.3)
|
|
108
|
+
for bar, val in zip(bars1, times):
|
|
109
|
+
ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:.1f}s", ha="center", va="bottom", fontsize=8)
|
|
110
|
+
fig1.tight_layout()
|
|
111
|
+
figures.append(("execution_time", fig1))
|
|
112
|
+
|
|
113
|
+
# 2. Time vs tokens mirror chart
|
|
114
|
+
fig2, ax2 = plt.subplots(figsize=(max(10, len(run_names) * 1.2), 7))
|
|
115
|
+
x2 = range(len(run_names))
|
|
116
|
+
max_time = max(times) if max(times) > 0 else 1
|
|
117
|
+
max_tok = max(total_tokens) if max(total_tokens) > 0 else 1
|
|
118
|
+
times_norm = [t / max_time for t in times]
|
|
119
|
+
tokens_norm = [-t / max_tok for t in total_tokens]
|
|
120
|
+
ax2.bar(x2, times_norm, color="steelblue", alpha=0.75, label="Execution Time")
|
|
121
|
+
ax2.bar(x2, tokens_norm, color="darkorange", alpha=0.75, label="Total Tokens")
|
|
122
|
+
ax2.axhline(0, color="black", linewidth=0.8)
|
|
123
|
+
for xi, t_n, t_val, tok_n, tok_val in zip(x2, times_norm, times, tokens_norm, total_tokens):
|
|
124
|
+
ax2.text(xi, t_n + 0.02, f"{t_val:.1f}s", ha="center", va="bottom", fontsize=8, color="steelblue")
|
|
125
|
+
ax2.text(xi, tok_n - 0.02, f"{tok_val:,}", ha="center", va="top", fontsize=8, color="darkorange")
|
|
126
|
+
ax2.set_xticks(x2)
|
|
127
|
+
ax2.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
|
|
128
|
+
ax2.set_yticks([-1, -0.5, 0, 0.5, 1])
|
|
129
|
+
ax2.set_yticklabels([f"max\n({max_tok:,} tok)", "50%", "0", "50%", f"max\n({max_time:.1f}s)"], fontsize=8)
|
|
130
|
+
ax2.set_title("Execution Time ↑ vs Total Tokens ↓", fontsize=14, fontweight="bold")
|
|
131
|
+
ax2.legend(fontsize=9)
|
|
132
|
+
ax2.grid(axis="y", alpha=0.2)
|
|
133
|
+
fig2.tight_layout()
|
|
134
|
+
figures.append(("time_vs_total_tokens", fig2))
|
|
135
|
+
|
|
136
|
+
# 3. Token usage stacked
|
|
137
|
+
fig3, ax3 = plt.subplots(figsize=(10, 6))
|
|
138
|
+
x3 = range(len(run_names))
|
|
139
|
+
ax3.bar(x3, prompt_tokens, label="Prompt Tokens", color="cornflowerblue", alpha=0.8)
|
|
140
|
+
ax3.bar(x3, completion_tokens, bottom=prompt_tokens, label="Completion Tokens", color="lightcoral", alpha=0.8)
|
|
141
|
+
ax3.set_xlabel("Run")
|
|
142
|
+
ax3.set_ylabel("Token Count")
|
|
143
|
+
ax3.set_title("Token Usage (Prompt vs Completion)", fontsize=14, fontweight="bold")
|
|
144
|
+
ax3.set_xticks(x3)
|
|
145
|
+
ax3.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
|
|
146
|
+
ax3.legend()
|
|
147
|
+
ax3.grid(axis="y", alpha=0.3)
|
|
148
|
+
fig3.tight_layout()
|
|
149
|
+
figures.append(("token_usage", fig3))
|
|
150
|
+
|
|
151
|
+
# 4. Total tokens
|
|
152
|
+
fig4, ax4 = plt.subplots(figsize=(10, 6))
|
|
153
|
+
bars4 = ax4.bar(range(len(run_names)), total_tokens, color="mediumpurple", alpha=0.7)
|
|
154
|
+
ax4.set_xlabel("Run")
|
|
155
|
+
ax4.set_ylabel("Total Tokens")
|
|
156
|
+
ax4.set_title("Total Tokens", fontsize=14, fontweight="bold")
|
|
157
|
+
ax4.set_xticks(range(len(run_names)))
|
|
158
|
+
ax4.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
|
|
159
|
+
ax4.grid(axis="y", alpha=0.3)
|
|
160
|
+
for bar, val in zip(bars4, total_tokens):
|
|
161
|
+
ax4.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:,}", ha="center", va="bottom", fontsize=8)
|
|
162
|
+
fig4.tight_layout()
|
|
163
|
+
figures.append(("total_tokens", fig4))
|
|
164
|
+
|
|
165
|
+
if not has_qualitative:
|
|
166
|
+
return figures
|
|
167
|
+
|
|
168
|
+
# 5. Combined qualitative
|
|
169
|
+
fig5, ax5 = plt.subplots(figsize=(10, 6))
|
|
170
|
+
x_pos = range(len(run_names))
|
|
171
|
+
width = 0.25
|
|
172
|
+
rel_vals = [v if v is not None else 0 for v in relevance_scores]
|
|
173
|
+
comp_vals = [v if v is not None else 0 for v in completion_quality_scores]
|
|
174
|
+
hall_vals = [v if v is not None else 0 for v in hallucination_scores]
|
|
175
|
+
ax5.bar([i - width for i in x_pos], rel_vals, width, label="Response Relevance", color="lightblue", alpha=0.8)
|
|
176
|
+
ax5.bar(x_pos, comp_vals, width, label="Task Completion", color="lightgreen", alpha=0.8)
|
|
177
|
+
ax5.bar([i + width for i in x_pos], hall_vals, width, label="Groundedness", color="khaki", alpha=0.8)
|
|
178
|
+
ax5.set_xlabel("Run")
|
|
179
|
+
ax5.set_ylabel("Score (0-1)")
|
|
180
|
+
ax5.set_title("Qualitative Metrics (LLM Judge)", fontsize=14, fontweight="bold")
|
|
181
|
+
ax5.set_xticks(x_pos)
|
|
182
|
+
ax5.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
|
|
183
|
+
ax5.set_ylim(0, 1.1)
|
|
184
|
+
ax5.legend(fontsize=8)
|
|
185
|
+
ax5.grid(axis="y", alpha=0.3)
|
|
186
|
+
ax5.axhline(y=0.7, color="orange", linestyle="--", alpha=0.3)
|
|
187
|
+
fig5.tight_layout()
|
|
188
|
+
figures.append(("qualitative_metrics", fig5))
|
|
189
|
+
|
|
190
|
+
# 6–8. Individual qualitative charts
|
|
191
|
+
for metric_name, vals, title in [
|
|
192
|
+
("response_relevance", rel_vals, "Response Relevance"),
|
|
193
|
+
("task_completion_quality", comp_vals, "Task Completion Quality"),
|
|
194
|
+
("hallucination_score", hall_vals, "Groundedness (Hallucination Score)"),
|
|
195
|
+
]:
|
|
196
|
+
fig, ax = plt.subplots(figsize=(10, 6))
|
|
197
|
+
colors = ["green" if s >= 0.8 else "orange" if s >= 0.6 else "red" for s in vals]
|
|
198
|
+
bars = ax.bar(range(len(run_names)), vals, color=colors, alpha=0.7)
|
|
199
|
+
ax.set_xlabel("Run")
|
|
200
|
+
ax.set_ylabel("Score (0-1)")
|
|
201
|
+
ax.set_title(title, fontsize=14, fontweight="bold")
|
|
202
|
+
ax.set_xticks(range(len(run_names)))
|
|
203
|
+
ax.set_xticklabels(display_names, rotation=45, ha="right", fontsize=8)
|
|
204
|
+
ax.set_ylim(0, 1.1)
|
|
205
|
+
ax.grid(axis="y", alpha=0.3)
|
|
206
|
+
ax.axhline(y=0.8, color="green", linestyle="--", alpha=0.3, linewidth=1)
|
|
207
|
+
ax.axhline(y=0.6, color="orange", linestyle="--", alpha=0.3, linewidth=1)
|
|
208
|
+
for bar, val in zip(bars, vals):
|
|
209
|
+
if val > 0:
|
|
210
|
+
ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:.3f}", ha="center", va="bottom", fontsize=8)
|
|
211
|
+
fig.tight_layout()
|
|
212
|
+
figures.append((metric_name, fig))
|
|
213
|
+
|
|
214
|
+
return figures
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _plot_per_episode_comparison(
|
|
218
|
+
metrics: Dict[str, dict],
|
|
219
|
+
task_filter: str | None = None,
|
|
220
|
+
) -> List[Tuple[str, plt.Figure]]:
|
|
221
|
+
"""Build per-task charts comparing all runs. Returns (name, figure) pairs.
|
|
222
|
+
|
|
223
|
+
If ``task_filter`` is provided, only tasks whose id contains the substring
|
|
224
|
+
are plotted.
|
|
225
|
+
"""
|
|
226
|
+
if not metrics:
|
|
227
|
+
return []
|
|
228
|
+
|
|
229
|
+
tasks_by_model: Dict[str, Dict[str, dict]] = {}
|
|
230
|
+
all_task_ids: set = set()
|
|
231
|
+
|
|
232
|
+
for run_name, data in metrics.items():
|
|
233
|
+
episodes = _get_per_episode_averages(data)
|
|
234
|
+
tasks_by_model[run_name] = {ep["task_id"]: ep for ep in episodes}
|
|
235
|
+
all_task_ids.update(ep["task_id"] for ep in episodes)
|
|
236
|
+
|
|
237
|
+
if not all_task_ids:
|
|
238
|
+
return []
|
|
239
|
+
|
|
240
|
+
if task_filter:
|
|
241
|
+
all_task_ids = {tid for tid in all_task_ids if task_filter in tid}
|
|
242
|
+
if not all_task_ids:
|
|
243
|
+
return []
|
|
244
|
+
|
|
245
|
+
sorted_tasks = sorted(all_task_ids)
|
|
246
|
+
model_names = list(metrics.keys())
|
|
247
|
+
has_qualitative = any(
|
|
248
|
+
any("qualitative" in ep for ep in _get_per_episode_averages(data))
|
|
249
|
+
for data in metrics.values()
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
figures: List[Tuple[str, plt.Figure]] = []
|
|
253
|
+
|
|
254
|
+
for task_id in sorted_tasks:
|
|
255
|
+
task_data = []
|
|
256
|
+
available_models = []
|
|
257
|
+
for run_name in model_names:
|
|
258
|
+
if task_id in tasks_by_model[run_name]:
|
|
259
|
+
task_data.append(tasks_by_model[run_name][task_id])
|
|
260
|
+
available_models.append(run_name)
|
|
261
|
+
|
|
262
|
+
if not task_data:
|
|
263
|
+
continue
|
|
264
|
+
|
|
265
|
+
display_names = list(available_models)
|
|
266
|
+
n_plots = 3 if has_qualitative else 2
|
|
267
|
+
fig, axes = plt.subplots(n_plots, 1, figsize=(max(10, len(task_data) * 0.8), 4 * n_plots))
|
|
268
|
+
if n_plots == 1:
|
|
269
|
+
axes = [axes]
|
|
270
|
+
|
|
271
|
+
fig.suptitle(f"Task: {task_id}", fontsize=14, fontweight="bold")
|
|
272
|
+
|
|
273
|
+
# Time
|
|
274
|
+
times = [ep["time"]["wall_ms"] / 1000 for ep in task_data]
|
|
275
|
+
axes[0].bar(range(len(display_names)), times, color="steelblue", alpha=0.7)
|
|
276
|
+
axes[0].set_ylabel("Time (s)", fontsize=11)
|
|
277
|
+
axes[0].set_title("Execution Time by Model", fontsize=12, fontweight="bold")
|
|
278
|
+
axes[0].set_xticks(range(len(display_names)))
|
|
279
|
+
axes[0].set_xticklabels(display_names, rotation=45, ha="right", fontsize=10)
|
|
280
|
+
axes[0].grid(axis="y", alpha=0.3)
|
|
281
|
+
for bar, val in zip(axes[0].patches, times):
|
|
282
|
+
axes[0].text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{val:.1f}s", ha="center", va="bottom", fontsize=9)
|
|
283
|
+
|
|
284
|
+
# Tokens
|
|
285
|
+
prompt_tokens = [ep["tokens"]["prompt_tokens"] for ep in task_data]
|
|
286
|
+
completion_tokens = [ep["tokens"]["completion_tokens"] for ep in task_data]
|
|
287
|
+
x = range(len(display_names))
|
|
288
|
+
axes[1].bar(x, prompt_tokens, label="Prompt", color="cornflowerblue", alpha=0.8)
|
|
289
|
+
axes[1].bar(x, completion_tokens, bottom=prompt_tokens, label="Completion", color="lightcoral", alpha=0.8)
|
|
290
|
+
axes[1].set_ylabel("Tokens", fontsize=11)
|
|
291
|
+
axes[1].set_title("Token Usage by Model", fontsize=12, fontweight="bold")
|
|
292
|
+
axes[1].set_xticks(x)
|
|
293
|
+
axes[1].set_xticklabels(display_names, rotation=45, ha="right", fontsize=10)
|
|
294
|
+
axes[1].legend(fontsize=9)
|
|
295
|
+
axes[1].grid(axis="y", alpha=0.3)
|
|
296
|
+
|
|
297
|
+
# Qualitative
|
|
298
|
+
if has_qualitative:
|
|
299
|
+
width = 0.25
|
|
300
|
+
x_pos = range(len(display_names))
|
|
301
|
+
relevance = [ep.get("qualitative", {}).get("response_relevance", 0) for ep in task_data]
|
|
302
|
+
completion_q = [ep.get("qualitative", {}).get("task_completion_quality", 0) for ep in task_data]
|
|
303
|
+
hallucination = [ep.get("qualitative", {}).get("hallucination_score", 0) for ep in task_data]
|
|
304
|
+
axes[2].bar([i - width for i in x_pos], relevance, width, label="Relevance", color="lightblue", alpha=0.8)
|
|
305
|
+
axes[2].bar(x_pos, completion_q, width, label="Completion", color="lightgreen", alpha=0.8)
|
|
306
|
+
axes[2].bar([i + width for i in x_pos], hallucination, width, label="Groundedness", color="khaki", alpha=0.8)
|
|
307
|
+
axes[2].set_ylabel("Score", fontsize=11)
|
|
308
|
+
axes[2].set_title("Qualitative Metrics by Model", fontsize=12, fontweight="bold")
|
|
309
|
+
axes[2].set_xticks(x_pos)
|
|
310
|
+
axes[2].set_xticklabels(display_names, rotation=45, ha="right", fontsize=10)
|
|
311
|
+
axes[2].set_ylim(0, 1.1)
|
|
312
|
+
axes[2].legend(fontsize=9)
|
|
313
|
+
axes[2].grid(axis="y", alpha=0.3)
|
|
314
|
+
axes[2].axhline(y=0.7, color="orange", linestyle="--", alpha=0.3)
|
|
315
|
+
|
|
316
|
+
fig.tight_layout()
|
|
317
|
+
safe_task = task_id.replace(":", "-").replace("/", "-").replace(" ", "_")
|
|
318
|
+
figures.append((f"per_task_{safe_task}", fig))
|
|
319
|
+
|
|
320
|
+
return figures
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _save_figures(figures: List[Tuple[str, plt.Figure]], output_dir: Path) -> List[Path]:
|
|
324
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
325
|
+
saved: List[Path] = []
|
|
326
|
+
for name, fig in figures:
|
|
327
|
+
path = output_dir / f"metrics_{name}.png"
|
|
328
|
+
fig.savefig(path, dpi=150, bbox_inches="tight")
|
|
329
|
+
plt.close(fig)
|
|
330
|
+
saved.append(path)
|
|
331
|
+
return saved
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def generate_and_save_plots(
    metrics: Dict[str, dict],
    output_dir: Path,
    task_filter: str | None = None,
) -> List[Path]:
    """Render every chart for ``metrics`` and save them under ``output_dir``.

    The run-level comparison charts are always built from the full data.
    ``task_filter`` is matched as a substring of ``task_id`` and narrows the
    per-task charts only.

    Args:
        metrics: Mapping of run name to that run's metrics dictionary.
        output_dir: Directory that receives the PNG files.
        task_filter: Optional substring restricting the per-task charts.

    Returns:
        Paths of the saved files: comparison charts first, then per-task
        charts.
    """
    comparison_charts = _plot_comparison(metrics)
    per_task_charts = _plot_per_episode_comparison(metrics, task_filter=task_filter)
    return _save_figures([*comparison_charts, *per_task_charts], output_dir)
|
image_defaults.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Name of the environment / .env variable that supplies the image repository path.
REPO_ENV_VAR = "BAT_DOCKER_REPO"
|
|
7
|
+
# Name of the environment / .env variable that supplies the Docker registry.
REGISTRY_ENV_VAR = "BAT_DOCKER_REGISTRY"
|
|
8
|
+
# Registry used by resolve_registry when neither the CLI value, the
# environment nor the .env file provides one.
DEFAULT_REGISTRY = "default_registry"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def default_repo_name(context_dir: Path) -> str:
    """Derive the default repository path from the directory name.

    The final path component is lower-cased, every run of characters outside
    ``a-z0-9`` becomes a single ``-``, and leading/trailing dashes are
    removed. If nothing is left, ``"agent"`` is used.

    Returns:
        ``"default-repository/<project>"``.
    """
    slug = re.sub(r"[^a-z0-9]+", "-", context_dir.name.lower())
    slug = slug.strip("-")
    if not slug:
        slug = "agent"
    return "default-repository/" + slug
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _read_dotenv_value(key: str) -> str | None:
|
|
17
|
+
dotenv_path = Path.cwd() / ".env"
|
|
18
|
+
if not dotenv_path.is_file():
|
|
19
|
+
return None
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
content = dotenv_path.read_text(encoding="utf-8")
|
|
23
|
+
except OSError:
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
match = re.search(
|
|
27
|
+
rf"^\s*(?:export\s+)?{re.escape(key)}\s*=\s*(.*?)\s*$",
|
|
28
|
+
content,
|
|
29
|
+
flags=re.MULTILINE,
|
|
30
|
+
)
|
|
31
|
+
if not match:
|
|
32
|
+
return None
|
|
33
|
+
|
|
34
|
+
value = match.group(1).strip()
|
|
35
|
+
if value.startswith("#"):
|
|
36
|
+
return None
|
|
37
|
+
|
|
38
|
+
if len(value) >= 2 and value[0] == value[-1] and value[0] in {'"', "'"}:
|
|
39
|
+
value = value[1:-1].strip()
|
|
40
|
+
|
|
41
|
+
if value:
|
|
42
|
+
return value
|
|
43
|
+
|
|
44
|
+
return None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _resolve_image_value(
|
|
48
|
+
cli_value: str | None,
|
|
49
|
+
*,
|
|
50
|
+
env_key: str,
|
|
51
|
+
default_value: str,
|
|
52
|
+
) -> str:
|
|
53
|
+
if cli_value:
|
|
54
|
+
return cli_value
|
|
55
|
+
|
|
56
|
+
env_value = os.environ.get(env_key, "").strip()
|
|
57
|
+
if env_value:
|
|
58
|
+
return env_value
|
|
59
|
+
|
|
60
|
+
dotenv_value = _read_dotenv_value(env_key)
|
|
61
|
+
if dotenv_value:
|
|
62
|
+
return dotenv_value
|
|
63
|
+
|
|
64
|
+
return default_value
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def resolve_repo_name(context_dir: Path, repo: str | None) -> str:
    """Resolve the image repository path.

    Precedence: ``repo``, then ``BAT_DOCKER_REPO`` from the environment or
    ``.env``, then the default derived from ``context_dir``'s name.
    """
    fallback = default_repo_name(context_dir)
    return _resolve_image_value(repo, env_key=REPO_ENV_VAR, default_value=fallback)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def resolve_registry(context_dir: Path, registry: str | None) -> str:
    """Resolve the Docker registry.

    Precedence: ``registry``, then ``BAT_DOCKER_REGISTRY`` from the
    environment or ``.env``, then ``DEFAULT_REGISTRY``. ``context_dir`` is
    accepted but not used by this function.
    """
    resolved = _resolve_image_value(
        registry,
        env_key=REGISTRY_ENV_VAR,
        default_value=DEFAULT_REGISTRY,
    )
    return resolved
|
push/__init__.py
ADDED
push/push.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import typer
|
|
5
|
+
|
|
6
|
+
from image_defaults import resolve_registry, resolve_repo_name
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def push_image(
    context: Path = typer.Option(
        Path("."),
        "--context",
        "-C",
        help="Directory used to infer the default repository name.",
    ),
    docker_registry: str | None = typer.Option(
        None,
        "--docker-registry",
        help=(
            "Docker registry hostname. Precedence: --docker-registry > "
            "BAT_DOCKER_REGISTRY env var (or .env in current directory) > default_registry."
        ),
    ),
    repo: str | None = typer.Option(
        None,
        "--repo",
        help=(
            "Image repository path. Precedence: --repo > BAT_DOCKER_REPO env var "
            "(or .env in current directory) > default-repository/<project-name>."
        ),
    ),
    tag: str = typer.Option(
        "latest",
        "--tag",
        help="Image tag.",
    ),
) -> None:
    """Push ``<registry>/<repo>:<tag>`` by running ``docker push``.

    The registry and repository are resolved from the options, the
    environment, ``.env`` or the defaults; the command runs with the resolved
    context directory as its working directory.

    Raises:
        typer.Exit: With code 1 when the context directory does not exist or
            the ``docker`` executable is not found, or with docker's own exit
            code (1 if that is zero) when the push fails.
    """
    context_dir = context.resolve()
    if not context_dir.is_dir():
        typer.secho(f"Context directory not found: {context_dir}", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1)

    registry_name = resolve_registry(context_dir, docker_registry)
    repo_name = resolve_repo_name(context_dir, repo)
    image = f"{registry_name}/{repo_name}:{tag}"

    typer.echo(f"Pushing Docker image: {image}")
    try:
        # Argument list, no shell: the image reference is passed verbatim.
        subprocess.run(["docker", "push", image], check=True, cwd=context_dir)
    except FileNotFoundError as exc:
        typer.secho("Docker executable not found in PATH.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=1) from exc
    except subprocess.CalledProcessError as exc:
        typer.secho("Docker push failed.", fg=typer.colors.RED, err=True)
        raise typer.Exit(code=exc.returncode or 1) from exc
    else:
        typer.secho(f"Docker image pushed successfully: {image}", fg=typer.colors.GREEN)
|
set/__init__.py
ADDED