astron-eval 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +119 -0
- package/bin/astron-eval.mjs +111 -0
- package/package.json +24 -0
- package/skills/astron-eval/SKILL.md +60 -0
- package/skills/model-evaluation/SKILL.md +180 -0
- package/skills/model-evaluation/assets/dimensions/内容相关性维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/内容精确维度.json +19 -0
- package/skills/model-evaluation/assets/dimensions/准确性维度-个性化规划.json +20 -0
- package/skills/model-evaluation/assets/dimensions/准确性维度-信息分析.json +20 -0
- package/skills/model-evaluation/assets/dimensions/准确性维度-旅游出行.json +20 -0
- package/skills/model-evaluation/assets/dimensions/准确性维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/创意性-吸引性维度.json +21 -0
- package/skills/model-evaluation/assets/dimensions/创新性维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/完整性维度-信息分析.json +20 -0
- package/skills/model-evaluation/assets/dimensions/完整性维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/形式相关性维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/忠诚度维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/指令遵循维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/文本差异度-TER维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/有效性维度-个性化规划.json +20 -0
- package/skills/model-evaluation/assets/dimensions/有效性维度-信息分析.json +20 -0
- package/skills/model-evaluation/assets/dimensions/有效性维度-流程自动化.json +20 -0
- package/skills/model-evaluation/assets/dimensions/有效性维度.json +21 -0
- package/skills/model-evaluation/assets/dimensions/核心元素维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/格式遵循维度.json +19 -0
- package/skills/model-evaluation/assets/dimensions/特色亮点维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/用例级评测维度模板.json +25 -0
- package/skills/model-evaluation/assets/dimensions/相似度-BERTScore维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/相似度-Cosine维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/相似度-ROUGE维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/相关性维度-个性化规划.json +20 -0
- package/skills/model-evaluation/assets/dimensions/相关性维度.json +21 -0
- package/skills/model-evaluation/assets/dimensions/精确性-BLUE维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/精确性-COMET维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/逻辑合理性维度.json +20 -0
- package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-个性化规划.json +20 -0
- package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-信息分析.json +20 -0
- package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-流程自动化.json +20 -0
- package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度.json +21 -0
- package/skills/model-evaluation/assets/eval-judge.json +11 -0
- package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
- package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
- package/skills/model-evaluation/assets/experts/content-match.json +37 -0
- package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
- package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
- package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
- package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
- package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
- package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
- package/skills/model-evaluation/eval-build.md +281 -0
- package/skills/model-evaluation/eval-execute.md +196 -0
- package/skills/model-evaluation/eval-init.md +237 -0
- package/skills/model-evaluation/processes/dimension-process.md +207 -0
- package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
- package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
- package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
- package/skills/model-evaluation/processes/keypoint-process.md +148 -0
- package/skills/model-evaluation/processes/python-env-process.md +113 -0
- package/skills/model-evaluation/references/中间产物说明.md +340 -0
- package/skills/model-evaluation/references/内置模板说明.md +149 -0
- package/skills/model-evaluation/references/脚本定义.md +274 -0
- package/skills/model-evaluation/references/认证服务接口说明.md +271 -0
- package/skills/model-evaluation/references/评测服务接口说明.md +455 -0
- package/skills/model-evaluation/references/评测维度说明.md +171 -0
- package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
- package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
- package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
- package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
- package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
- package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
- package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
- package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
- package/skills/model-evaluation/scripts/eval_auth.py +588 -0
- package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
- package/skills/model-evaluation/scripts/eval_set.py +410 -0
- package/skills/model-evaluation/scripts/eval_task.py +324 -0
- package/skills/model-evaluation/scripts/files/__init__.py +38 -0
- package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
- package/skills/model-evaluation/scripts/files/streaming.py +245 -0
- package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
- package/skills/model-evaluation/scripts/utils/constants.py +101 -0
- package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
- package/skills/model-evaluation/scripts/utils/errors.py +244 -0
- package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
- package/skills/skill-driven-eval/SKILL.md +456 -0
- package/skills/skill-driven-eval/agents/grader.md +144 -0
- package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
- package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
- package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
- package/skills/skill-driven-eval/references/schemas.md +282 -0
- package/skills/skill-driven-eval/scripts/__init__.py +1 -0
- package/skills/skill-driven-eval/scripts/__main__.py +70 -0
- package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
- package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
- package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
package/skills/skill-driven-eval/scripts/aggregate_results.py
@@ -0,0 +1,681 @@
#!/usr/bin/env python3
"""
Aggregate individual run results into model comparison benchmark.

This script supports two workspace layouts:

1. Anonymous run IDs with mapping.json (recommended for blind evaluation):
   <workspace>/
   ├── evals.json
   ├── mapping.json    # Maps run IDs to models
   └── run-001/
       ├── outputs/
       ├── grading.json
       └── timing.json

2. Legacy layout with model names in directories:
   <workspace>/
   ├── evals.json
   └── eval-1/
       ├── eval_metadata.json
       ├── opus/
       │   ├── outputs/
       │   ├── grading.json
       │   └── timing.json
       └── sonnet/
           └── ...

Usage:
    python aggregate_results.py <workspace_dir> [--mapping mapping.json]

Example:
    python aggregate_results.py pdf-eval-workspace/
    python aggregate_results.py pdf-eval-workspace/ --mapping pdf-eval-workspace/mapping.json
"""

import argparse
import json
import math
import sys
from datetime import datetime, timezone
from pathlib import Path


def calculate_stats(values: list[float]) -> dict:
    """Calculate mean, stddev, min, max for a list of values."""
    if not values:
        return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}

    n = len(values)
    mean = sum(values) / n

    if n > 1:
        variance = sum((x - mean) ** 2 for x in values) / (n - 1)
        stddev = math.sqrt(variance)
    else:
        stddev = 0.0

    return {
        "mean": round(mean, 4),
        "stddev": round(stddev, 4),
        "min": round(min(values), 4),
        "max": round(max(values), 4)
    }


def load_evals_config(workspace: Path) -> dict:
    """Load evals.json from workspace."""
    evals_path = workspace / "evals.json"
    if evals_path.exists():
        with open(evals_path) as f:
            return json.load(f)
    return {}


def load_mapping(workspace: Path, mapping_path: Path | None = None) -> dict:
    """Load mapping.json that maps run IDs to model names."""
    if mapping_path:
        path = mapping_path
    else:
        path = workspace / "mapping.json"

    if path.exists():
        with open(path) as f:
            return json.load(f)
    return {}


def load_run_results_with_mapping(workspace: Path, mapping: dict) -> dict[str, list[dict]]:
    """
    Load run results using anonymous run IDs with mapping.

    Returns dict keyed by model name, each containing a list of run results.
    """
    results: dict[str, list[dict]] = {}

    for run_dir in sorted(workspace.glob("run-*")):
        if not run_dir.is_dir():
            continue

        run_id = run_dir.name

        # Get model and eval info from mapping
        run_info = mapping.get(run_id, {})
        model = run_info.get("model")
        eval_id = run_info.get("eval_id")
        eval_name = run_info.get("eval_name", f"Eval {eval_id}")

        if not model:
            print(f"Warning: No model mapping found for {run_id}")
            continue

        if model not in results:
            results[model] = []

        # Load grading.json
        grading_path = run_dir / "grading.json"
        if not grading_path.exists():
            print(f"Warning: grading.json not found in {run_dir}")
            continue

        try:
            with open(grading_path) as f:
                grading = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Warning: Invalid JSON in {grading_path}: {e}")
            continue

        # Load timing.json
        timing_path = run_dir / "timing.json"
        timing_data = {}
        if timing_path.exists():
            try:
                with open(timing_path) as tf:
                    timing_data = json.load(tf)
            except json.JSONDecodeError:
                pass

        # Calculate time in seconds from timing data
        # Support both duration_ms and total_duration_seconds formats
        time_seconds = 0.0
        if "duration_ms" in timing_data:
            time_seconds = timing_data["duration_ms"] / 1000.0
        elif "total_duration_seconds" in timing_data:
            time_seconds = timing_data["total_duration_seconds"]

        # Build result
        result = {
            "run_id": run_id,
            "eval_id": eval_id,
            "eval_name": eval_name,
            "model": model,
            "result": {
                "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0),
                "passed": grading.get("summary", {}).get("passed", 0),
                "failed": grading.get("summary", {}).get("failed", 0),
                "total": grading.get("summary", {}).get("total", 0),
                "time_seconds": time_seconds,
                "tokens": timing_data.get("total_tokens", 0),
                "tool_calls": grading.get("execution_metrics", {}).get("total_tool_calls", 0),
                "errors": grading.get("execution_metrics", {}).get("errors_encountered", 0)
            },
            "expectations": grading.get("expectations", []),
            "issues": grading.get("issues", [])
        }

        results[model].append(result)

    return results


def load_run_results_legacy(workspace: Path) -> dict[str, list[dict]]:
    """
    Load run results using legacy layout with model names in directories.

    Returns dict keyed by model name, each containing a list of run results.
    """
    results: dict[str, list[dict]] = {}

    # Find all eval directories
    for eval_dir in sorted(workspace.glob("eval-*")):
        if not eval_dir.is_dir():
            continue

        # Get eval metadata
        metadata_path = eval_dir / "eval_metadata.json"
        eval_id = None
        eval_name = eval_dir.name

        if metadata_path.exists():
            try:
                with open(metadata_path) as mf:
                    metadata = json.load(mf)
                eval_id = metadata.get("eval_id")
                eval_name = metadata.get("eval_name", eval_dir.name)
            except (json.JSONDecodeError, OSError):
                pass

        if eval_id is None:
            try:
                eval_id = int(eval_dir.name.split("-")[1])
            except (ValueError, IndexError):
                eval_id = 0

        # Find model directories
        for model_dir in sorted(eval_dir.iterdir()):
            if not model_dir.is_dir():
                continue
            if model_dir.name in ["outputs", "inputs"]:
                continue

            model = model_dir.name
            if model not in results:
                results[model] = []

            # Load grading.json
            grading_path = model_dir / "grading.json"
            if not grading_path.exists():
                print(f"Warning: grading.json not found in {model_dir}")
                continue

            try:
                with open(grading_path) as f:
                    grading = json.load(f)
            except json.JSONDecodeError as e:
                print(f"Warning: Invalid JSON in {grading_path}: {e}")
                continue

            # Load timing.json
            timing_path = model_dir / "timing.json"
            timing_data = {}
            if timing_path.exists():
                try:
                    with open(timing_path) as tf:
                        timing_data = json.load(tf)
                except json.JSONDecodeError:
                    pass

            # Build result
            result = {
                "run_id": f"eval-{eval_id}-{model}",
                "eval_id": eval_id,
                "eval_name": eval_name,
                "model": model,
                "result": {
                    "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0),
                    "passed": grading.get("summary", {}).get("passed", 0),
                    "failed": grading.get("summary", {}).get("failed", 0),
                    "total": grading.get("summary", {}).get("total", 0),
                    "time_seconds": timing_data.get("total_duration_seconds", 0.0),
                    "tokens": timing_data.get("total_tokens", 0),
                    "tool_calls": grading.get("execution_metrics", {}).get("total_tool_calls", 0),
                    "errors": grading.get("execution_metrics", {}).get("errors_encountered", 0)
                },
                "expectations": grading.get("expectations", []),
                "issues": grading.get("issues", [])
            }

            results[model].append(result)

    return results


def load_run_results(workspace: Path, mapping: dict) -> dict[str, list[dict]]:
    """
    Load all run results from a workspace.

    Automatically detects layout: anonymous run IDs with mapping.json or legacy.
    """
    # If mapping is provided or exists, use anonymous layout
    if mapping:
        return load_run_results_with_mapping(workspace, mapping)

    # Check for mapping.json
    mapping_path = workspace / "mapping.json"
    if mapping_path.exists():
        mapping = load_mapping(workspace)
        if mapping:
            return load_run_results_with_mapping(workspace, mapping)

    # Fall back to legacy layout
    return load_run_results_legacy(workspace)


def aggregate_model_summary(results: dict[str, list[dict]]) -> dict[str, dict]:
    """Aggregate results into model summaries."""
    model_summary = {}

    for model, runs in results.items():
        if not runs:
            model_summary[model] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            }
            continue

        # Safely extract values, handling missing keys
        pass_rates = []
        times = []
        tokens_list = []

        for r in runs:
            result = r.get("result", {})
            if result.get("pass_rate") is not None:
                pass_rates.append(result["pass_rate"])
            if result.get("time_seconds") is not None:
                times.append(result["time_seconds"])
            if result.get("tokens") is not None:
                tokens_list.append(result["tokens"])

        model_summary[model] = {
            "pass_rate": calculate_stats(pass_rates) if pass_rates else {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
            "time_seconds": calculate_stats(times) if times else {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
            "tokens": calculate_stats(tokens_list) if tokens_list else {"mean": 0, "stddev": 0, "min": 0, "max": 0}
        }

    return model_summary


def calculate_comparison(model_summary: dict[str, dict]) -> dict:
    """Calculate comparison metrics between models."""
    models = list(model_summary.keys())

    if len(models) < 2:
        return {
            "pass_rate_delta": "N/A",
            "time_delta": "N/A",
            "token_delta": "N/A",
            "cost_efficiency": {}
        }

    # Compare first model to second
    primary = model_summary.get(models[0], {})
    baseline = model_summary.get(models[1], {})

    delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0)
    delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0)
    delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0)

    # Calculate cost efficiency: pass_rate * 1000 / (tokens / 1000)
    cost_efficiency = {}
    for model, summary in model_summary.items():
        pass_rate = summary.get("pass_rate", {}).get("mean", 0)
        tokens = summary.get("tokens", {}).get("mean", 0)
        if tokens > 0:
            efficiency = (pass_rate * 1000) / (tokens / 1000)
            cost_efficiency[model] = round(efficiency, 1)
        else:
            cost_efficiency[model] = 0

    return {
        "pass_rate_delta": f"{delta_pass_rate:+.2f}",
        "time_delta": f"{delta_time:+.1f}s",
        "token_delta": f"{delta_tokens:+.0f}",
        "cost_efficiency": cost_efficiency
    }


def generate_recommendations(model_summary: dict[str, dict], comparison: dict) -> list[dict]:
    """Generate usage recommendations based on actual results."""
    recommendations = []
    models = list(model_summary.keys())

    if len(models) < 2:
        return recommendations

    primary = models[0]
    baseline = models[1]

    primary_pr = model_summary[primary]["pass_rate"]["mean"]
    baseline_pr = model_summary[baseline]["pass_rate"]["mean"]

    primary_time = model_summary[primary]["time_seconds"]["mean"]
    baseline_time = model_summary[baseline]["time_seconds"]["mean"]

    pr_diff = primary_pr - baseline_pr

    # Generate recommendations based on actual data patterns

    # High accuracy recommendation
    if pr_diff > 0.1:
        recommendations.append({
            "scenario": "High accuracy requirements",
            "recommended_model": primary,
            "reason": f"{pr_diff*100:.0f}% higher pass rate ({primary_pr*100:.0f}% vs {baseline_pr*100:.0f}%)"
        })

    # Cost efficiency recommendation
    cost_eff = comparison.get("cost_efficiency", {})
    baseline_eff = cost_eff.get(baseline, 0)
    primary_eff = cost_eff.get(primary, 0)

    if baseline_eff > primary_eff and baseline_pr >= 0.7:
        recommendations.append({
            "scenario": "Cost-conscious use",
            "recommended_model": baseline,
            "reason": f"Better cost efficiency ({baseline_eff:.1f} vs {primary_eff:.1f}) with acceptable quality ({baseline_pr*100:.0f}%)"
        })

    # Speed recommendation
    if baseline_time < primary_time * 0.8 and baseline_time > 0:
        speedup = ((primary_time - baseline_time) / primary_time) * 100
        recommendations.append({
            "scenario": "Speed-critical tasks",
            "recommended_model": baseline,
            "reason": f"{speedup:.0f}% faster ({baseline_time:.1f}s vs {primary_time:.1f}s)"
        })

    # Default fallback based on actual results
    if not recommendations:
        if pr_diff > 0:
            recommendations.append({
                "scenario": "General use",
                "recommended_model": primary,
                "reason": f"Higher pass rate ({primary_pr*100:.0f}% vs {baseline_pr*100:.0f}%)"
            })
        else:
            recommendations.append({
                "scenario": "General use",
                "recommended_model": baseline,
                "reason": f"Similar quality ({baseline_pr*100:.0f}% vs {primary_pr*100:.0f}%) with lower cost"
            })

    return recommendations


def generate_notes(model_summary: dict[str, dict], results: dict[str, list[dict]]) -> list[str]:
    """Generate analyst notes from the results."""
    notes = []
    models = list(model_summary.keys())

    if len(models) < 2:
        return notes

    primary = models[0]
    baseline = models[1]

    primary_pr = model_summary[primary]["pass_rate"]["mean"]
    baseline_pr = model_summary[baseline]["pass_rate"]["mean"]

    # Pass rate comparison
    if primary_pr > baseline_pr:
        notes.append(f"{primary} achieves {primary_pr*100:.0f}% pass rate vs {baseline}'s {baseline_pr*100:.0f}%")
    elif baseline_pr > primary_pr:
        notes.append(f"{baseline} achieves {baseline_pr*100:.0f}% pass rate vs {primary}'s {primary_pr*100:.0f}%")
    else:
        notes.append(f"Both models achieve similar pass rate ({primary_pr*100:.0f}%)")

    # Speed comparison
    primary_time = model_summary[primary]["time_seconds"]["mean"]
    baseline_time = model_summary[baseline]["time_seconds"]["mean"]
    if baseline_time < primary_time and baseline_time > 0:
        speedup = ((primary_time - baseline_time) / primary_time) * 100
        notes.append(f"{baseline} is {speedup:.0f}% faster on average")
    elif primary_time < baseline_time and primary_time > 0:
        speedup = ((baseline_time - primary_time) / baseline_time) * 100
        notes.append(f"{primary} is {speedup:.0f}% faster on average")

    # Look for eval-specific patterns
    primary_runs = results.get(primary, [])
    baseline_runs = results.get(baseline, [])

    for pr in primary_runs:
        for br in baseline_runs:
            if pr["eval_id"] == br["eval_id"]:
                pr_rate = pr["result"]["pass_rate"]
                br_rate = br["result"]["pass_rate"]
                if pr_rate < 0.5 and br_rate < 0.5:
                    notes.append(f"Both models struggle with '{pr['eval_name']}'")
                elif pr_rate - br_rate > 0.3:
                    notes.append(f"{primary} significantly outperforms on '{pr['eval_name']}'")
                elif br_rate - pr_rate > 0.3:
                    notes.append(f"{baseline} significantly outperforms on '{pr['eval_name']}'")

    return notes[:5]  # Limit to 5 notes


def generate_benchmark(workspace: Path, mapping: dict) -> dict:
    """Generate complete benchmark.json from workspace results."""
    config = load_evals_config(workspace)
    results = load_run_results(workspace, mapping)
    model_summary = aggregate_model_summary(results)
    comparison = calculate_comparison(model_summary)
    recommendations = generate_recommendations(model_summary, comparison)
    notes = generate_notes(model_summary, results)

    # Flatten runs for benchmark output
    all_runs = []
    for model, runs in results.items():
        all_runs.extend(runs)

    # Sort by eval_id, then model
    all_runs.sort(key=lambda r: (r["eval_id"], r["model"]))

    # Get eval IDs
    eval_ids = sorted(set(r["eval_id"] for r in all_runs))

    benchmark = {
        "metadata": {
            "target_skill": config.get("target_skill", "unknown"),
            "target_skill_path": config.get("target_skill_path", ""),
            "models_compared": list(results.keys()),
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "note": "Results are based on blind evaluation - graders did not know which model produced each output"
        },
        "runs": all_runs,
        "model_summary": model_summary,
        "comparison": comparison,
        "recommendations": recommendations,
        "notes": notes
    }

    return benchmark


def generate_markdown(benchmark: dict) -> str:
    """Generate human-readable benchmark.md from benchmark data."""
    metadata = benchmark["metadata"]
    model_summary = benchmark["model_summary"]
    comparison = benchmark["comparison"]
    recommendations = benchmark["recommendations"]
    notes = benchmark["notes"]

    models = list(model_summary.keys())
    model_a = models[0] if models else "model_a"
    model_b = models[1] if len(models) > 1 else "model_b"

    lines = [
        f"# Model Comparison: {metadata['target_skill']}",
        "",
        f"**Skill**: {metadata['target_skill']}",
        f"**Date**: {metadata['timestamp']}",
        f"**Models**: {', '.join(metadata['models_compared'])}",
        f"**Evals**: {', '.join(map(str, metadata['evals_run']))}",
        "",
        "> **Note**: Results are based on blind evaluation. Graders did not know which model produced each output.",
        "",
        "## Summary",
        "",
        f"| Metric | {model_a} | {model_b} | Delta |",
        "|--------|------------|---------------|-------|",
    ]

    a_summary = model_summary.get(model_a, {})
    b_summary = model_summary.get(model_b, {})

    # Pass rate
    a_pr = a_summary.get("pass_rate", {})
    b_pr = b_summary.get("pass_rate", {})
    lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {comparison.get('pass_rate_delta', '—')} |")

    # Time
    a_time = a_summary.get("time_seconds", {})
    b_time = b_summary.get("time_seconds", {})
    lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {comparison.get('time_delta', '—')} |")

    # Tokens
    a_tokens = a_summary.get("tokens", {})
    b_tokens = b_summary.get("tokens", {})
    lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {comparison.get('token_delta', '—')} |")

    # Cost efficiency
    cost_eff = comparison.get("cost_efficiency", {})
    if cost_eff:
        lines.append(f"| Cost Efficiency | {cost_eff.get(model_a, '—')} | {cost_eff.get(model_b, '—')} | Higher is better |")

    # Recommendations
    if recommendations:
        lines.extend([
            "",
            "## Recommendations",
            "",
            "> These recommendations are derived from actual evaluation results, not pre-conceived assumptions.",
            ""
        ])
        for rec in recommendations:
            lines.append(f"- **{rec['scenario']}**: Use **{rec['recommended_model']}** — {rec['reason']}")

    # Notes
    if notes:
        lines.extend([
            "",
            "## Analysis Notes",
            ""
        ])
        for note in notes:
            lines.append(f"- {note}")

    # Per-eval breakdown
    runs = benchmark.get("runs", [])
    if runs:
        lines.extend([
            "",
            "## Per-Eval Breakdown",
            ""
        ])

        eval_ids = sorted(set(r["eval_id"] for r in runs))
        for eval_id in eval_ids:
            eval_runs = [r for r in runs if r["eval_id"] == eval_id]
            eval_name = eval_runs[0]["eval_name"] if eval_runs else f"Eval {eval_id}"

            lines.append(f"### {eval_name}")
            lines.append("")
            lines.append("| Model | Pass Rate | Time | Tokens |")
            lines.append("|-------|-----------|------|--------|")

            for run in eval_runs:
                r = run["result"]
                lines.append(f"| {run['model']} | {r['pass_rate']*100:.0f}% | {r['time_seconds']:.1f}s | {r['tokens']} |")

            lines.append("")

    return "\n".join(lines)


def main():
    parser = argparse.ArgumentParser(
        description="Aggregate model comparison results into benchmark summary"
    )
    parser.add_argument(
        "workspace",
        type=Path,
        help="Path to the workspace directory"
    )
    parser.add_argument(
        "--mapping", "-m",
        type=Path,
        default=None,
        help="Path to mapping.json (default: <workspace>/mapping.json)"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <workspace>/benchmark.json)"
    )

    args = parser.parse_args()

    if not args.workspace.exists():
        print(f"Directory not found: {args.workspace}")
        sys.exit(1)

    # Load mapping if provided
    mapping = {}
    if args.mapping:
        mapping = load_mapping(args.workspace, args.mapping)

    # Generate benchmark
    benchmark = generate_benchmark(args.workspace, mapping)

    # Determine output paths
    output_json = args.output or (args.workspace / "benchmark.json")
    output_md = output_json.with_suffix(".md")

    # Write benchmark.json
    with open(output_json, "w") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {output_json}")

    # Write benchmark.md
    markdown = generate_markdown(benchmark)
    with open(output_md, "w") as f:
        f.write(markdown)
    print(f"Generated: {output_md}")

    # Print summary
    print(f"\nSummary:")
    for model, summary in benchmark["model_summary"].items():
        pr = summary["pass_rate"]["mean"]
        print(f"  {model}: {pr*100:.1f}% pass rate")

    if benchmark["recommendations"]:
        print(f"\nTop recommendation:")
        rec = benchmark["recommendations"][0]
        print(f"  For {rec['scenario']}: use {rec['recommended_model']}")


if __name__ == "__main__":
    main()
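The module docstring above describes the anonymous-run workspace layout, and the loaders show which JSON keys the script actually reads. Below is a minimal sketch, in Python, of a workspace that would satisfy it; the field names mirror the keys the script reads, while the workspace name, model labels, eval names, and all numbers are illustrative assumptions, not values shipped in the package.

# Sketch: build a tiny two-run workspace matching the documented layout.
# All concrete values are made up for illustration.
import json
from pathlib import Path

workspace = Path("pdf-eval-workspace")  # assumed name, echoing the usage example
workspace.mkdir(exist_ok=True)

# evals.json: skill metadata consumed by load_evals_config() for the header.
(workspace / "evals.json").write_text(json.dumps({
    "target_skill": "pdf-extraction",            # assumption
    "target_skill_path": "skills/pdf-extraction"  # assumption
}, indent=2))

# mapping.json: maps anonymous run IDs back to the model that produced them.
(workspace / "mapping.json").write_text(json.dumps({
    "run-001": {"model": "opus", "eval_id": 1, "eval_name": "Extract tables"},
    "run-002": {"model": "sonnet", "eval_id": 1, "eval_name": "Extract tables"}
}, indent=2))

runs = {
    "run-001": ({"pass_rate": 0.8, "passed": 4, "failed": 1, "total": 5},
                {"duration_ms": 95300, "total_tokens": 18400}),
    "run-002": ({"pass_rate": 0.6, "passed": 3, "failed": 2, "total": 5},
                {"total_duration_seconds": 61.0, "total_tokens": 9700}),
}
for run_id, (summary, timing) in runs.items():
    run_dir = workspace / run_id
    run_dir.mkdir(exist_ok=True)
    # grading.json: summary plus optional execution metrics, expectations, issues.
    (run_dir / "grading.json").write_text(json.dumps({
        "summary": summary,
        "execution_metrics": {"total_tool_calls": 10, "errors_encountered": 0},
        "expectations": [],
        "issues": []
    }, indent=2))
    # timing.json: either duration_ms or total_duration_seconds, plus total_tokens.
    (run_dir / "timing.json").write_text(json.dumps(timing, indent=2))

Running `python aggregate_results.py pdf-eval-workspace/` over such a workspace should write benchmark.json and benchmark.md into the workspace and print a per-model pass-rate summary, as in the script's main().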