astron-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. package/LICENSE +21 -0
  2. package/README.md +119 -0
  3. package/bin/astron-eval.mjs +111 -0
  4. package/package.json +24 -0
  5. package/skills/astron-eval/SKILL.md +60 -0
  6. package/skills/model-evaluation/SKILL.md +180 -0
  7. package/skills/model-evaluation/assets/dimensions/内容相关性维度.json +20 -0
  8. package/skills/model-evaluation/assets/dimensions/内容精确维度.json +19 -0
  9. package/skills/model-evaluation/assets/dimensions/准确性维度-个性化规划.json +20 -0
  10. package/skills/model-evaluation/assets/dimensions/准确性维度-信息分析.json +20 -0
  11. package/skills/model-evaluation/assets/dimensions/准确性维度-旅游出行.json +20 -0
  12. package/skills/model-evaluation/assets/dimensions/准确性维度.json +20 -0
  13. package/skills/model-evaluation/assets/dimensions/创意性-吸引性维度.json +21 -0
  14. package/skills/model-evaluation/assets/dimensions/创新性维度.json +20 -0
  15. package/skills/model-evaluation/assets/dimensions/完整性维度-信息分析.json +20 -0
  16. package/skills/model-evaluation/assets/dimensions/完整性维度.json +20 -0
  17. package/skills/model-evaluation/assets/dimensions/形式相关性维度.json +20 -0
  18. package/skills/model-evaluation/assets/dimensions/忠诚度维度.json +20 -0
  19. package/skills/model-evaluation/assets/dimensions/指令遵循维度.json +20 -0
  20. package/skills/model-evaluation/assets/dimensions/文本差异度-TER维度.json +20 -0
  21. package/skills/model-evaluation/assets/dimensions/有效性维度-个性化规划.json +20 -0
  22. package/skills/model-evaluation/assets/dimensions/有效性维度-信息分析.json +20 -0
  23. package/skills/model-evaluation/assets/dimensions/有效性维度-流程自动化.json +20 -0
  24. package/skills/model-evaluation/assets/dimensions/有效性维度.json +21 -0
  25. package/skills/model-evaluation/assets/dimensions/核心元素维度.json +20 -0
  26. package/skills/model-evaluation/assets/dimensions/格式遵循维度.json +19 -0
  27. package/skills/model-evaluation/assets/dimensions/特色亮点维度.json +20 -0
  28. package/skills/model-evaluation/assets/dimensions/用例级评测维度模板.json +25 -0
  29. package/skills/model-evaluation/assets/dimensions/相似度-BERTScore维度.json +20 -0
  30. package/skills/model-evaluation/assets/dimensions/相似度-Cosine维度.json +20 -0
  31. package/skills/model-evaluation/assets/dimensions/相似度-ROUGE维度.json +20 -0
  32. package/skills/model-evaluation/assets/dimensions/相关性维度-个性化规划.json +20 -0
  33. package/skills/model-evaluation/assets/dimensions/相关性维度.json +21 -0
  34. package/skills/model-evaluation/assets/dimensions/精确性-BLUE维度.json +20 -0
  35. package/skills/model-evaluation/assets/dimensions/精确性-COMET维度.json +20 -0
  36. package/skills/model-evaluation/assets/dimensions/逻辑合理性维度.json +20 -0
  37. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-个性化规划.json +20 -0
  38. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-信息分析.json +20 -0
  39. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度-流程自动化.json +20 -0
  40. package/skills/model-evaluation/assets/dimensions/逻辑连贯性维度.json +21 -0
  41. package/skills/model-evaluation/assets/eval-judge.json +11 -0
  42. package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
  43. package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
  44. package/skills/model-evaluation/assets/experts/content-match.json +37 -0
  45. package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
  46. package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
  47. package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
  48. package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
  49. package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
  50. package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
  51. package/skills/model-evaluation/eval-build.md +281 -0
  52. package/skills/model-evaluation/eval-execute.md +196 -0
  53. package/skills/model-evaluation/eval-init.md +237 -0
  54. package/skills/model-evaluation/processes/dimension-process.md +207 -0
  55. package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
  56. package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
  57. package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
  58. package/skills/model-evaluation/processes/keypoint-process.md +148 -0
  59. package/skills/model-evaluation/processes/python-env-process.md +113 -0
  60. package/skills/model-evaluation/references/中间产物说明.md +340 -0
  61. package/skills/model-evaluation/references/内置模板说明.md +149 -0
  62. package/skills/model-evaluation/references/脚本定义.md +274 -0
  63. package/skills/model-evaluation/references/认证服务接口说明.md +271 -0
  64. package/skills/model-evaluation/references/评测服务接口说明.md +455 -0
  65. package/skills/model-evaluation/references/评测维度说明.md +171 -0
  66. package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
  67. package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
  68. package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
  69. package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
  70. package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
  71. package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
  72. package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
  73. package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
  74. package/skills/model-evaluation/scripts/eval_auth.py +588 -0
  75. package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
  76. package/skills/model-evaluation/scripts/eval_set.py +410 -0
  77. package/skills/model-evaluation/scripts/eval_task.py +324 -0
  78. package/skills/model-evaluation/scripts/files/__init__.py +38 -0
  79. package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
  80. package/skills/model-evaluation/scripts/files/streaming.py +245 -0
  81. package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
  82. package/skills/model-evaluation/scripts/utils/constants.py +101 -0
  83. package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
  84. package/skills/model-evaluation/scripts/utils/errors.py +244 -0
  85. package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
  86. package/skills/skill-driven-eval/SKILL.md +456 -0
  87. package/skills/skill-driven-eval/agents/grader.md +144 -0
  88. package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
  89. package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
  90. package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
  91. package/skills/skill-driven-eval/references/schemas.md +282 -0
  92. package/skills/skill-driven-eval/scripts/__init__.py +1 -0
  93. package/skills/skill-driven-eval/scripts/__main__.py +70 -0
  94. package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
  95. package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
  96. package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
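
package/skills/skill-driven-eval/scripts/aggregate_results.py (file 94, new file, +681 lines):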
@@ -0,0 +1,681 @@
+ #!/usr/bin/env python3
+ """
+ Aggregate individual run results into model comparison benchmark.
+
+ This script supports two workspace layouts:
+
+ 1. Anonymous run IDs with mapping.json (recommended for blind evaluation):
+    <workspace>/
+    ├── evals.json
+    ├── mapping.json        # Maps run IDs to models
+    └── run-001/
+        ├── outputs/
+        ├── grading.json
+        └── timing.json
+
+ 2. Legacy layout with model names in directories:
+    <workspace>/
+    ├── evals.json
+    └── eval-1/
+        ├── eval_metadata.json
+        ├── opus/
+        │   ├── outputs/
+        │   ├── grading.json
+        │   └── timing.json
+        └── sonnet/
+            └── ...
+
+ Usage:
+     python aggregate_results.py <workspace_dir> [--mapping mapping.json]
+
+ Example:
+     python aggregate_results.py pdf-eval-workspace/
+     python aggregate_results.py pdf-eval-workspace/ --mapping pdf-eval-workspace/mapping.json
+ """
+
+ import argparse
+ import json
+ import math
+ import sys
+ from datetime import datetime, timezone
+ from pathlib import Path
+
+
+ def calculate_stats(values: list[float]) -> dict:
+     """Calculate mean, stddev, min, max for a list of values."""
+     if not values:
+         return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}
+
+     n = len(values)
+     mean = sum(values) / n
+
+     if n > 1:
+         variance = sum((x - mean) ** 2 for x in values) / (n - 1)
+         stddev = math.sqrt(variance)
+     else:
+         stddev = 0.0
+
+     return {
+         "mean": round(mean, 4),
+         "stddev": round(stddev, 4),
+         "min": round(min(values), 4),
+         "max": round(max(values), 4)
+     }
+
+
+ def load_evals_config(workspace: Path) -> dict:
+     """Load evals.json from workspace."""
+     evals_path = workspace / "evals.json"
+     if evals_path.exists():
+         with open(evals_path) as f:
+             return json.load(f)
+     return {}
+
+
+ def load_mapping(workspace: Path, mapping_path: Path | None = None) -> dict:
+     """Load mapping.json that maps run IDs to model names."""
+     if mapping_path:
+         path = mapping_path
+     else:
+         path = workspace / "mapping.json"
+
+     if path.exists():
+         with open(path) as f:
+             return json.load(f)
+     return {}
+
+
+ def load_run_results_with_mapping(workspace: Path, mapping: dict) -> dict[str, list[dict]]:
+     """
+     Load run results using anonymous run IDs with mapping.
+
+     Returns dict keyed by model name, each containing a list of run results.
+     """
+     results: dict[str, list[dict]] = {}
+
+     for run_dir in sorted(workspace.glob("run-*")):
+         if not run_dir.is_dir():
+             continue
+
+         run_id = run_dir.name
+
+         # Get model and eval info from mapping
+         run_info = mapping.get(run_id, {})
+         model = run_info.get("model")
+         eval_id = run_info.get("eval_id")
+         eval_name = run_info.get("eval_name", f"Eval {eval_id}")
+
+         if not model:
+             print(f"Warning: No model mapping found for {run_id}")
+             continue
+
+         if model not in results:
+             results[model] = []
+
+         # Load grading.json
+         grading_path = run_dir / "grading.json"
+         if not grading_path.exists():
+             print(f"Warning: grading.json not found in {run_dir}")
+             continue
+
+         try:
+             with open(grading_path) as f:
+                 grading = json.load(f)
+         except json.JSONDecodeError as e:
+             print(f"Warning: Invalid JSON in {grading_path}: {e}")
+             continue
+
+         # Load timing.json
+         timing_path = run_dir / "timing.json"
+         timing_data = {}
+         if timing_path.exists():
+             try:
+                 with open(timing_path) as tf:
+                     timing_data = json.load(tf)
+             except json.JSONDecodeError:
+                 pass
+
+         # Calculate time in seconds from timing data
+         # Support both duration_ms and total_duration_seconds formats
+         time_seconds = 0.0
+         if "duration_ms" in timing_data:
+             time_seconds = timing_data["duration_ms"] / 1000.0
+         elif "total_duration_seconds" in timing_data:
+             time_seconds = timing_data["total_duration_seconds"]
+
+         # Build result
+         result = {
+             "run_id": run_id,
+             "eval_id": eval_id,
+             "eval_name": eval_name,
+             "model": model,
+             "result": {
+                 "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0),
+                 "passed": grading.get("summary", {}).get("passed", 0),
+                 "failed": grading.get("summary", {}).get("failed", 0),
+                 "total": grading.get("summary", {}).get("total", 0),
+                 "time_seconds": time_seconds,
+                 "tokens": timing_data.get("total_tokens", 0),
+                 "tool_calls": grading.get("execution_metrics", {}).get("total_tool_calls", 0),
+                 "errors": grading.get("execution_metrics", {}).get("errors_encountered", 0)
+             },
+             "expectations": grading.get("expectations", []),
+             "issues": grading.get("issues", [])
+         }
+
+         results[model].append(result)
+
+     return results
+
+
+ def load_run_results_legacy(workspace: Path) -> dict[str, list[dict]]:
+     """
+     Load run results using legacy layout with model names in directories.
+
+     Returns dict keyed by model name, each containing a list of run results.
+     """
+     results: dict[str, list[dict]] = {}
+
+     # Find all eval directories
+     for eval_dir in sorted(workspace.glob("eval-*")):
+         if not eval_dir.is_dir():
+             continue
+
+         # Get eval metadata
+         metadata_path = eval_dir / "eval_metadata.json"
+         eval_id = None
+         eval_name = eval_dir.name
+
+         if metadata_path.exists():
+             try:
+                 with open(metadata_path) as mf:
+                     metadata = json.load(mf)
+                 eval_id = metadata.get("eval_id")
+                 eval_name = metadata.get("eval_name", eval_dir.name)
+             except (json.JSONDecodeError, OSError):
+                 pass
+
+         if eval_id is None:
+             try:
+                 eval_id = int(eval_dir.name.split("-")[1])
+             except (ValueError, IndexError):
+                 eval_id = 0
+
+         # Find model directories
+         for model_dir in sorted(eval_dir.iterdir()):
+             if not model_dir.is_dir():
+                 continue
+             if model_dir.name in ["outputs", "inputs"]:
+                 continue
+
+             model = model_dir.name
+             if model not in results:
+                 results[model] = []
+
+             # Load grading.json
+             grading_path = model_dir / "grading.json"
+             if not grading_path.exists():
+                 print(f"Warning: grading.json not found in {model_dir}")
+                 continue
+
+             try:
+                 with open(grading_path) as f:
+                     grading = json.load(f)
+             except json.JSONDecodeError as e:
+                 print(f"Warning: Invalid JSON in {grading_path}: {e}")
+                 continue
+
+             # Load timing.json
+             timing_path = model_dir / "timing.json"
+             timing_data = {}
+             if timing_path.exists():
+                 try:
+                     with open(timing_path) as tf:
+                         timing_data = json.load(tf)
+                 except json.JSONDecodeError:
+                     pass
+
+             # Build result
+             result = {
+                 "run_id": f"eval-{eval_id}-{model}",
+                 "eval_id": eval_id,
+                 "eval_name": eval_name,
+                 "model": model,
+                 "result": {
+                     "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0),
+                     "passed": grading.get("summary", {}).get("passed", 0),
+                     "failed": grading.get("summary", {}).get("failed", 0),
+                     "total": grading.get("summary", {}).get("total", 0),
+                     "time_seconds": timing_data.get("total_duration_seconds", 0.0),
+                     "tokens": timing_data.get("total_tokens", 0),
+                     "tool_calls": grading.get("execution_metrics", {}).get("total_tool_calls", 0),
+                     "errors": grading.get("execution_metrics", {}).get("errors_encountered", 0)
+                 },
+                 "expectations": grading.get("expectations", []),
+                 "issues": grading.get("issues", [])
+             }
+
+             results[model].append(result)
+
+     return results
+
+
+ def load_run_results(workspace: Path, mapping: dict) -> dict[str, list[dict]]:
+     """
+     Load all run results from a workspace.
+
+     Automatically detects layout: anonymous run IDs with mapping.json or legacy.
+     """
+     # If mapping is provided or exists, use anonymous layout
+     if mapping:
+         return load_run_results_with_mapping(workspace, mapping)
+
+     # Check for mapping.json
+     mapping_path = workspace / "mapping.json"
+     if mapping_path.exists():
+         mapping = load_mapping(workspace)
+         if mapping:
+             return load_run_results_with_mapping(workspace, mapping)
+
+     # Fall back to legacy layout
+     return load_run_results_legacy(workspace)
+
+
+ def aggregate_model_summary(results: dict[str, list[dict]]) -> dict[str, dict]:
+     """Aggregate results into model summaries."""
+     model_summary = {}
+
+     for model, runs in results.items():
+         if not runs:
+             model_summary[model] = {
+                 "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
+                 "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
+                 "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
+             }
+             continue
+
+         # Safely extract values, handling missing keys
+         pass_rates = []
+         times = []
+         tokens_list = []
+
+         for r in runs:
+             result = r.get("result", {})
+             if result.get("pass_rate") is not None:
+                 pass_rates.append(result["pass_rate"])
+             if result.get("time_seconds") is not None:
+                 times.append(result["time_seconds"])
+             if result.get("tokens") is not None:
+                 tokens_list.append(result["tokens"])
+
+         model_summary[model] = {
+             "pass_rate": calculate_stats(pass_rates) if pass_rates else {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
+             "time_seconds": calculate_stats(times) if times else {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
+             "tokens": calculate_stats(tokens_list) if tokens_list else {"mean": 0, "stddev": 0, "min": 0, "max": 0}
+         }
+
+     return model_summary
+
+
+ def calculate_comparison(model_summary: dict[str, dict]) -> dict:
+     """Calculate comparison metrics between models."""
+     models = list(model_summary.keys())
+
+     if len(models) < 2:
+         return {
+             "pass_rate_delta": "N/A",
+             "time_delta": "N/A",
+             "token_delta": "N/A",
+             "cost_efficiency": {}
+         }
+
+     # Compare first model to second
+     primary = model_summary.get(models[0], {})
+     baseline = model_summary.get(models[1], {})
+
+     delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0)
+     delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0)
+     delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0)
+
+     # Calculate cost efficiency: pass_rate * 1000 / (tokens / 1000)
+     cost_efficiency = {}
+     for model, summary in model_summary.items():
+         pass_rate = summary.get("pass_rate", {}).get("mean", 0)
+         tokens = summary.get("tokens", {}).get("mean", 0)
+         if tokens > 0:
+             efficiency = (pass_rate * 1000) / (tokens / 1000)
+             cost_efficiency[model] = round(efficiency, 1)
+         else:
+             cost_efficiency[model] = 0
+
+     return {
+         "pass_rate_delta": f"{delta_pass_rate:+.2f}",
+         "time_delta": f"{delta_time:+.1f}s",
+         "token_delta": f"{delta_tokens:+.0f}",
+         "cost_efficiency": cost_efficiency
+     }
+
+
+ def generate_recommendations(model_summary: dict[str, dict], comparison: dict) -> list[dict]:
+     """Generate usage recommendations based on actual results."""
+     recommendations = []
+     models = list(model_summary.keys())
+
+     if len(models) < 2:
+         return recommendations
+
+     primary = models[0]
+     baseline = models[1]
+
+     primary_pr = model_summary[primary]["pass_rate"]["mean"]
+     baseline_pr = model_summary[baseline]["pass_rate"]["mean"]
+
+     primary_time = model_summary[primary]["time_seconds"]["mean"]
+     baseline_time = model_summary[baseline]["time_seconds"]["mean"]
+
+     pr_diff = primary_pr - baseline_pr
+
+     # Generate recommendations based on actual data patterns
+
+     # High accuracy recommendation
+     if pr_diff > 0.1:
+         recommendations.append({
+             "scenario": "High accuracy requirements",
+             "recommended_model": primary,
+             "reason": f"{pr_diff*100:.0f}% higher pass rate ({primary_pr*100:.0f}% vs {baseline_pr*100:.0f}%)"
+         })
+
+     # Cost efficiency recommendation
+     cost_eff = comparison.get("cost_efficiency", {})
+     baseline_eff = cost_eff.get(baseline, 0)
+     primary_eff = cost_eff.get(primary, 0)
+
+     if baseline_eff > primary_eff and baseline_pr >= 0.7:
+         recommendations.append({
+             "scenario": "Cost-conscious use",
+             "recommended_model": baseline,
+             "reason": f"Better cost efficiency ({baseline_eff:.1f} vs {primary_eff:.1f}) with acceptable quality ({baseline_pr*100:.0f}%)"
+         })
+
+     # Speed recommendation
+     if baseline_time < primary_time * 0.8 and baseline_time > 0:
+         speedup = ((primary_time - baseline_time) / primary_time) * 100
+         recommendations.append({
+             "scenario": "Speed-critical tasks",
+             "recommended_model": baseline,
+             "reason": f"{speedup:.0f}% faster ({baseline_time:.1f}s vs {primary_time:.1f}s)"
+         })
+
+     # Default fallback based on actual results
+     if not recommendations:
+         if pr_diff > 0:
+             recommendations.append({
+                 "scenario": "General use",
+                 "recommended_model": primary,
+                 "reason": f"Higher pass rate ({primary_pr*100:.0f}% vs {baseline_pr*100:.0f}%)"
+             })
+         else:
+             recommendations.append({
+                 "scenario": "General use",
+                 "recommended_model": baseline,
+                 "reason": f"Similar quality ({baseline_pr*100:.0f}% vs {primary_pr*100:.0f}%) with lower cost"
+             })
+
+     return recommendations
+
+
+ def generate_notes(model_summary: dict[str, dict], results: dict[str, list[dict]]) -> list[str]:
+     """Generate analyst notes from the results."""
+     notes = []
+     models = list(model_summary.keys())
+
+     if len(models) < 2:
+         return notes
+
+     primary = models[0]
+     baseline = models[1]
+
+     primary_pr = model_summary[primary]["pass_rate"]["mean"]
+     baseline_pr = model_summary[baseline]["pass_rate"]["mean"]
+
+     # Pass rate comparison
+     if primary_pr > baseline_pr:
+         notes.append(f"{primary} achieves {primary_pr*100:.0f}% pass rate vs {baseline}'s {baseline_pr*100:.0f}%")
+     elif baseline_pr > primary_pr:
+         notes.append(f"{baseline} achieves {baseline_pr*100:.0f}% pass rate vs {primary}'s {primary_pr*100:.0f}%")
+     else:
+         notes.append(f"Both models achieve similar pass rate ({primary_pr*100:.0f}%)")
+
+     # Speed comparison
+     primary_time = model_summary[primary]["time_seconds"]["mean"]
+     baseline_time = model_summary[baseline]["time_seconds"]["mean"]
+     if baseline_time < primary_time and baseline_time > 0:
+         speedup = ((primary_time - baseline_time) / primary_time) * 100
+         notes.append(f"{baseline} is {speedup:.0f}% faster on average")
+     elif primary_time < baseline_time and primary_time > 0:
+         speedup = ((baseline_time - primary_time) / baseline_time) * 100
+         notes.append(f"{primary} is {speedup:.0f}% faster on average")
+
+     # Look for eval-specific patterns
+     primary_runs = results.get(primary, [])
+     baseline_runs = results.get(baseline, [])
+
+     for pr in primary_runs:
+         for br in baseline_runs:
+             if pr["eval_id"] == br["eval_id"]:
+                 pr_rate = pr["result"]["pass_rate"]
+                 br_rate = br["result"]["pass_rate"]
+                 if pr_rate < 0.5 and br_rate < 0.5:
+                     notes.append(f"Both models struggle with '{pr['eval_name']}'")
+                 elif pr_rate - br_rate > 0.3:
+                     notes.append(f"{primary} significantly outperforms on '{pr['eval_name']}'")
+                 elif br_rate - pr_rate > 0.3:
+                     notes.append(f"{baseline} significantly outperforms on '{pr['eval_name']}'")
+
+     return notes[:5]  # Limit to 5 notes
+
+
+ def generate_benchmark(workspace: Path, mapping: dict) -> dict:
+     """Generate complete benchmark.json from workspace results."""
+     config = load_evals_config(workspace)
+     results = load_run_results(workspace, mapping)
+     model_summary = aggregate_model_summary(results)
+     comparison = calculate_comparison(model_summary)
+     recommendations = generate_recommendations(model_summary, comparison)
+     notes = generate_notes(model_summary, results)
+
+     # Flatten runs for benchmark output
+     all_runs = []
+     for model, runs in results.items():
+         all_runs.extend(runs)
+
+     # Sort by eval_id, then model
+     all_runs.sort(key=lambda r: (r["eval_id"], r["model"]))
+
+     # Get eval IDs
+     eval_ids = sorted(set(r["eval_id"] for r in all_runs))
+
+     benchmark = {
+         "metadata": {
+             "target_skill": config.get("target_skill", "unknown"),
+             "target_skill_path": config.get("target_skill_path", ""),
+             "models_compared": list(results.keys()),
+             "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+             "evals_run": eval_ids,
+             "note": "Results are based on blind evaluation - graders did not know which model produced each output"
+         },
+         "runs": all_runs,
+         "model_summary": model_summary,
+         "comparison": comparison,
+         "recommendations": recommendations,
+         "notes": notes
+     }
+
+     return benchmark
+
+
+ def generate_markdown(benchmark: dict) -> str:
+     """Generate human-readable benchmark.md from benchmark data."""
+     metadata = benchmark["metadata"]
+     model_summary = benchmark["model_summary"]
+     comparison = benchmark["comparison"]
+     recommendations = benchmark["recommendations"]
+     notes = benchmark["notes"]
+
+     models = list(model_summary.keys())
+     model_a = models[0] if models else "model_a"
+     model_b = models[1] if len(models) > 1 else "model_b"
+
+     lines = [
+         f"# Model Comparison: {metadata['target_skill']}",
+         "",
+         f"**Skill**: {metadata['target_skill']}",
+         f"**Date**: {metadata['timestamp']}",
+         f"**Models**: {', '.join(metadata['models_compared'])}",
+         f"**Evals**: {', '.join(map(str, metadata['evals_run']))}",
+         "",
+         "> **Note**: Results are based on blind evaluation. Graders did not know which model produced each output.",
+         "",
+         "## Summary",
+         "",
+         f"| Metric | {model_a} | {model_b} | Delta |",
+         "|--------|------------|---------------|-------|",
+     ]
+
+     a_summary = model_summary.get(model_a, {})
+     b_summary = model_summary.get(model_b, {})
+
+     # Pass rate
+     a_pr = a_summary.get("pass_rate", {})
+     b_pr = b_summary.get("pass_rate", {})
+     lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {comparison.get('pass_rate_delta', '—')} |")
+
+     # Time
+     a_time = a_summary.get("time_seconds", {})
+     b_time = b_summary.get("time_seconds", {})
+     lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {comparison.get('time_delta', '—')} |")
+
+     # Tokens
+     a_tokens = a_summary.get("tokens", {})
+     b_tokens = b_summary.get("tokens", {})
+     lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {comparison.get('token_delta', '—')} |")
+
+     # Cost efficiency
+     cost_eff = comparison.get("cost_efficiency", {})
+     if cost_eff:
+         lines.append(f"| Cost Efficiency | {cost_eff.get(model_a, '—')} | {cost_eff.get(model_b, '—')} | Higher is better |")
+
+     # Recommendations
+     if recommendations:
+         lines.extend([
+             "",
+             "## Recommendations",
+             "",
+             "> These recommendations are derived from actual evaluation results, not pre-conceived assumptions.",
+             ""
+         ])
+         for rec in recommendations:
+             lines.append(f"- **{rec['scenario']}**: Use **{rec['recommended_model']}** — {rec['reason']}")
+
+     # Notes
+     if notes:
+         lines.extend([
+             "",
+             "## Analysis Notes",
+             ""
+         ])
+         for note in notes:
+             lines.append(f"- {note}")
+
+     # Per-eval breakdown
+     runs = benchmark.get("runs", [])
+     if runs:
+         lines.extend([
+             "",
+             "## Per-Eval Breakdown",
+             ""
+         ])
+
+         eval_ids = sorted(set(r["eval_id"] for r in runs))
+         for eval_id in eval_ids:
+             eval_runs = [r for r in runs if r["eval_id"] == eval_id]
+             eval_name = eval_runs[0]["eval_name"] if eval_runs else f"Eval {eval_id}"
+
+             lines.append(f"### {eval_name}")
+             lines.append("")
+             lines.append("| Model | Pass Rate | Time | Tokens |")
+             lines.append("|-------|-----------|------|--------|")
+
+             for run in eval_runs:
+                 r = run["result"]
+                 lines.append(f"| {run['model']} | {r['pass_rate']*100:.0f}% | {r['time_seconds']:.1f}s | {r['tokens']} |")
+
+             lines.append("")
+
+     return "\n".join(lines)
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Aggregate model comparison results into benchmark summary"
+     )
+     parser.add_argument(
+         "workspace",
+         type=Path,
+         help="Path to the workspace directory"
+     )
+     parser.add_argument(
+         "--mapping", "-m",
+         type=Path,
+         default=None,
+         help="Path to mapping.json (default: <workspace>/mapping.json)"
+     )
+     parser.add_argument(
+         "--output", "-o",
+         type=Path,
+         help="Output path for benchmark.json (default: <workspace>/benchmark.json)"
+     )
+
+     args = parser.parse_args()
+
+     if not args.workspace.exists():
+         print(f"Directory not found: {args.workspace}")
+         sys.exit(1)
+
+     # Load mapping if provided
+     mapping = {}
+     if args.mapping:
+         mapping = load_mapping(args.workspace, args.mapping)
+
+     # Generate benchmark
+     benchmark = generate_benchmark(args.workspace, mapping)
+
+     # Determine output paths
+     output_json = args.output or (args.workspace / "benchmark.json")
+     output_md = output_json.with_suffix(".md")
+
+     # Write benchmark.json
+     with open(output_json, "w") as f:
+         json.dump(benchmark, f, indent=2)
+     print(f"Generated: {output_json}")
+
+     # Write benchmark.md
+     markdown = generate_markdown(benchmark)
+     with open(output_md, "w") as f:
+         f.write(markdown)
+     print(f"Generated: {output_md}")
+
+     # Print summary
+     print(f"\nSummary:")
+     for model, summary in benchmark["model_summary"].items():
+         pr = summary["pass_rate"]["mean"]
+         print(f"  {model}: {pr*100:.1f}% pass rate")
+
+     if benchmark["recommendations"]:
+         print(f"\nTop recommendation:")
+         rec = benchmark["recommendations"][0]
+         print(f"  For {rec['scenario']}: use {rec['recommended_model']}")
+
+
+ if __name__ == "__main__":
+     main()
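
For reference, the input files this script reads can be reconstructed from the loaders above. The field names below are exactly the keys the code looks up; the run IDs, model names, and values are purely illustrative. A minimal mapping.json for the anonymous layout might look like:

    {
      "run-001": {"model": "opus", "eval_id": 1, "eval_name": "PDF extraction"},
      "run-002": {"model": "sonnet", "eval_id": 1, "eval_name": "PDF extraction"}
    }

Each run directory then holds a grading.json and, optionally, a timing.json with the keys the aggregator consumes. Missing keys fall back to 0 or an empty list; the anonymous-layout loader accepts either duration_ms or total_duration_seconds, while the legacy loader reads only total_duration_seconds.

    grading.json (illustrative values):
    {
      "summary": {"pass_rate": 0.85, "passed": 17, "failed": 3, "total": 20},
      "execution_metrics": {"total_tool_calls": 42, "errors_encountered": 1},
      "expectations": [],
      "issues": []
    }

    timing.json (illustrative values):
    {
      "duration_ms": 93500,
      "total_tokens": 18250
    }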