astron-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +119 -0
  3. package/bin/astron-eval.mjs +111 -0
  4. package/package.json +24 -0
  5. package/skills/astron-eval/SKILL.md +60 -0
  6. package/skills/model-evaluation/SKILL.md +180 -0
  7. package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
  8. package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/262/276/347/241/256/347/273/264/345/272/246.json +19 -0
  9. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  10. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  11. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/346/227/205/346/270/270/345/207/272/350/241/214.json +20 -0
  12. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246.json +20 -0
  13. package/skills/model-evaluation/assets/dimensions//345/210/233/346/204/217/346/200/247-/345/220/270/345/274/225/346/200/247/347/273/264/345/272/246.json +21 -0
  14. package/skills/model-evaluation/assets/dimensions//345/210/233/346/226/260/346/200/247/347/273/264/345/272/246.json +20 -0
  15. package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  16. package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246.json +20 -0
  17. package/skills/model-evaluation/assets/dimensions//345/275/242/345/274/217/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
  18. package/skills/model-evaluation/assets/dimensions//345/277/240/350/257/232/345/272/246/347/273/264/345/272/246.json +20 -0
  19. package/skills/model-evaluation/assets/dimensions//346/214/207/344/273/244/351/201/265/345/276/252/347/273/264/345/272/246.json +20 -0
  20. package/skills/model-evaluation/assets/dimensions//346/226/207/346/234/254/345/267/256/345/274/202/345/272/246-TER/347/273/264/345/272/246.json +20 -0
  21. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  22. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  23. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
  24. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246.json +21 -0
  25. package/skills/model-evaluation/assets/dimensions//346/240/270/345/277/203/345/205/203/347/264/240/347/273/264/345/272/246.json +20 -0
  26. package/skills/model-evaluation/assets/dimensions//346/240/274/345/274/217/351/201/265/345/276/252/347/273/264/345/272/246.json +19 -0
  27. package/skills/model-evaluation/assets/dimensions//347/211/271/350/211/262/344/272/256/347/202/271/347/273/264/345/272/246.json +20 -0
  28. package/skills/model-evaluation/assets/dimensions//347/224/250/344/276/213/347/272/247/350/257/204/346/265/213/347/273/264/345/272/246/346/250/241/346/235/277.json +25 -0
  29. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-BERTScore/347/273/264/345/272/246.json +20 -0
  30. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-Cosine/347/273/264/345/272/246.json +20 -0
  31. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-ROUGE/347/273/264/345/272/246.json +20 -0
  32. package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  33. package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +21 -0
  34. package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-BLUE/347/273/264/345/272/246.json +20 -0
  35. package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-COMET/347/273/264/345/272/246.json +20 -0
  36. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/345/220/210/347/220/206/346/200/247/347/273/264/345/272/246.json +20 -0
  37. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  38. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  39. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
  40. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246.json +21 -0
  41. package/skills/model-evaluation/assets/eval-judge.json +11 -0
  42. package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
  43. package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
  44. package/skills/model-evaluation/assets/experts/content-match.json +37 -0
  45. package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
  46. package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
  47. package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
  48. package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
  49. package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
  50. package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
  51. package/skills/model-evaluation/eval-build.md +281 -0
  52. package/skills/model-evaluation/eval-execute.md +196 -0
  53. package/skills/model-evaluation/eval-init.md +237 -0
  54. package/skills/model-evaluation/processes/dimension-process.md +207 -0
  55. package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
  56. package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
  57. package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
  58. package/skills/model-evaluation/processes/keypoint-process.md +148 -0
  59. package/skills/model-evaluation/processes/python-env-process.md +113 -0
  60. package/skills/model-evaluation/references//344/270/255/351/227/264/344/272/247/347/211/251/350/257/264/346/230/216.md +340 -0
  61. package/skills/model-evaluation/references//345/206/205/347/275/256/346/250/241/346/235/277/350/257/264/346/230/216.md +149 -0
  62. package/skills/model-evaluation/references//350/204/232/346/234/254/345/256/232/344/271/211.md +274 -0
  63. package/skills/model-evaluation/references//350/256/244/350/257/201/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +271 -0
  64. package/skills/model-evaluation/references//350/257/204/346/265/213/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +455 -0
  65. package/skills/model-evaluation/references//350/257/204/346/265/213/347/273/264/345/272/246/350/257/264/346/230/216.md +171 -0
  66. package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
  67. package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
  68. package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
  69. package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
  70. package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
  71. package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
  72. package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
  73. package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
  74. package/skills/model-evaluation/scripts/eval_auth.py +588 -0
  75. package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
  76. package/skills/model-evaluation/scripts/eval_set.py +410 -0
  77. package/skills/model-evaluation/scripts/eval_task.py +324 -0
  78. package/skills/model-evaluation/scripts/files/__init__.py +38 -0
  79. package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
  80. package/skills/model-evaluation/scripts/files/streaming.py +245 -0
  81. package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
  82. package/skills/model-evaluation/scripts/utils/constants.py +101 -0
  83. package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
  84. package/skills/model-evaluation/scripts/utils/errors.py +244 -0
  85. package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
  86. package/skills/skill-driven-eval/SKILL.md +456 -0
  87. package/skills/skill-driven-eval/agents/grader.md +144 -0
  88. package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
  89. package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
  90. package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
  91. package/skills/skill-driven-eval/references/schemas.md +282 -0
  92. package/skills/skill-driven-eval/scripts/__init__.py +1 -0
  93. package/skills/skill-driven-eval/scripts/__main__.py +70 -0
  94. package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
  95. package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
  96. package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
@@ -0,0 +1,485 @@
1
+ #!/usr/bin/env python3
2
+ """Generate a visual comparison report for model evaluation results.
3
+
4
+ Reads the workspace directory, discovers runs, and generates a self-contained
5
+ HTML page showing the model comparison results.
6
+
7
+ Usage:
8
+ python generate_report.py <workspace-path> [--output report.html]
9
+ python generate_report.py <workspace-path> [--serve] [--port PORT]
10
+
11
+ No dependencies beyond the Python stdlib are required.
12
+ """
13
+
14
+ import argparse
15
+ import base64
16
+ import json
17
+ import mimetypes
18
+ import os
19
+ import signal
20
+ import subprocess
21
+ import sys
22
+ import time
23
+ import webbrowser
24
+ from functools import partial
25
+ from http.server import HTTPServer, BaseHTTPRequestHandler
26
+ from pathlib import Path
27
+
28
# Files to exclude from output listings
METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"}

# Extensions we render as inline text
TEXT_EXTENSIONS = {
    ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx",
    ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs",
    ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml",
}

# Extensions we render as inline images
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"}

# MIME type overrides for common types
MIME_OVERRIDES = {
    ".svg": "image/svg+xml",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
}


def get_mime_type(path: Path) -> str:
    """Return the MIME type for *path*.

    Explicit entries in MIME_OVERRIDES win; otherwise fall back to
    ``mimetypes.guess_type`` and finally to ``application/octet-stream``.
    """
    extension = path.suffix.lower()
    override = MIME_OVERRIDES.get(extension)
    if override is not None:
        return override
    guessed, _ = mimetypes.guess_type(str(path))
    return guessed or "application/octet-stream"
55
+
56
+
57
def load_evals_config(workspace: Path) -> dict:
    """Load evals.json from *workspace* to get prompts.

    Returns an empty dict when the file is absent, unreadable, or
    contains invalid JSON.
    """
    config_path = workspace / "evals.json"
    if not config_path.exists():
        return {}
    try:
        return json.loads(config_path.read_text())
    except (json.JSONDecodeError, OSError):
        return {}
66
+
67
+
68
def load_mapping(workspace: Path) -> dict:
    """Load mapping.json mapping run IDs to eval info.

    Returns an empty dict when the file is absent, unreadable, or
    contains invalid JSON.
    """
    mapping_path = workspace / "mapping.json"
    if not mapping_path.exists():
        return {}
    try:
        return json.loads(mapping_path.read_text())
    except (json.JSONDecodeError, OSError):
        return {}
77
+
78
+
79
def find_runs(workspace: Path) -> list[dict]:
    """Recursively find run directories (those containing ``outputs/``).

    Returns a list of run dicts (see ``build_run``), sorted by eval id,
    then model, then run id.
    """
    # Load evals and mapping so each run can be annotated with its prompt.
    evals_config = load_evals_config(workspace)
    mapping = load_mapping(workspace)

    # Build eval_id -> prompt mapping.
    eval_prompts = {}
    for eval_item in evals_config.get("evals", []):
        eval_prompts[eval_item.get("id")] = eval_item.get("prompt", "")

    runs: list[dict] = []
    _find_runs_recursive(workspace, workspace, runs, mapping, eval_prompts)
    # BUG FIX: build_run always sets the "eval_id" key (possibly to None),
    # so the previous `r.get("eval_id", float("inf"))` default never applied
    # and sorting raised TypeError when any eval_id was None.  Sort runs
    # without an eval id last, and never compare None against real ids.
    runs.sort(
        key=lambda r: (
            r.get("eval_id") is None,
            r.get("eval_id") if r.get("eval_id") is not None else 0,
            r.get("model", ""),
            r["id"],
        )
    )
    return runs
94
+
95
+
96
def _find_runs_recursive(root: Path, current: Path, runs: list[dict], mapping: dict = None, eval_prompts: dict = None) -> None:
    """Depth-first walk under *current*, collecting run dicts into *runs*.

    A directory holding an ``outputs/`` subdirectory is a run leaf; its
    children are not descended into.  Well-known non-run directories
    (VCS, caches, inputs) are skipped.
    """
    if not current.is_dir():
        return

    if (current / "outputs").is_dir():
        run = build_run(root, current, mapping, eval_prompts)
        if run:
            runs.append(run)
        return

    ignored = {"node_modules", ".git", "__pycache__", "skill", "inputs"}
    for entry in sorted(current.iterdir()):
        if entry.is_dir() and entry.name not in ignored:
            _find_runs_recursive(root, entry, runs, mapping, eval_prompts)
111
+
112
+
113
def build_run(root: Path, run_dir: Path, mapping: dict = None, eval_prompts: dict = None) -> dict | None:
    """Build a run dict with prompt, outputs, and grading data.

    Resolution order for run metadata:
      1. mapping.json entry keyed by the directory name (blind eval mode),
      2. eval_metadata.json in the parent directory,
      3. timing.json in the run directory (model name only).
    """
    prompt = ""
    eval_id = None
    eval_name = None
    model = run_dir.name  # default: model name is the directory name
    run_id = run_dir.name

    # First try mapping.json for run info (blind evaluation mode).
    if mapping and run_id in mapping:
        run_info = mapping[run_id]
        model = run_info.get("model", model)
        eval_id = run_info.get("eval_id")
        eval_name = run_info.get("eval_name")
        # Get prompt from eval_prompts using eval_id.
        if eval_prompts and eval_id in eval_prompts:
            prompt = eval_prompts[eval_id]

    # Fallback: try eval_metadata.json from the parent directory.
    if not prompt:
        metadata_path = run_dir.parent / "eval_metadata.json"
        if metadata_path.exists():
            try:
                metadata = json.loads(metadata_path.read_text())
                prompt = metadata.get("prompt", "")
                eval_id = metadata.get("eval_id")
            except (json.JSONDecodeError, OSError):
                pass

    # Load timing.json exactly once (the previous version parsed it twice:
    # once for the model-name fallback and again for the timing payload).
    timing = None
    timing_path = run_dir / "timing.json"
    if timing_path.exists():
        try:
            timing = json.loads(timing_path.read_text())
        except (json.JSONDecodeError, OSError):
            pass

    # Fallback: timing.json may carry the model name.
    if model == run_dir.name and timing and timing.get("model"):
        model = timing["model"]

    if not prompt:
        prompt = "(No prompt found)"

    # Flatten the workspace-relative path into a single run id.
    run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-")

    # Collect output files, excluding bookkeeping files.
    outputs_dir = run_dir / "outputs"
    output_files: list[dict] = []
    if outputs_dir.is_dir():
        for f in sorted(outputs_dir.iterdir()):
            if f.is_file() and f.name not in METADATA_FILES:
                output_files.append(embed_file(f))

    # Load grading if present.
    grading = None
    grading_path = run_dir / "grading.json"
    if grading_path.exists():
        try:
            grading = json.loads(grading_path.read_text())
        except (json.JSONDecodeError, OSError):
            pass

    return {
        "id": run_id,
        "prompt": prompt,
        "eval_id": eval_id,
        "eval_name": eval_name,
        "model": model,
        "outputs": output_files,
        "grading": grading,
        "timing": timing,
    }
196
+
197
+
198
def embed_file(path: Path) -> dict:
    """Read a file and return an embedded representation.

    Text files embed their content directly; images and unknown binaries
    both embed a base64 data URI (differing only in the "type" field).
    """
    ext = path.suffix.lower()
    mime = get_mime_type(path)

    if ext in TEXT_EXTENSIONS:
        try:
            body = path.read_text(errors="replace")
        except OSError:
            body = "(Error reading file)"
        return {"name": path.name, "type": "text", "content": body}

    # Image and binary paths are identical except for the "type" label.
    try:
        encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    except OSError:
        return {"name": path.name, "type": "error", "content": "(Error reading file)"}
    return {
        "name": path.name,
        "type": "image" if ext in IMAGE_EXTENSIONS else "binary",
        "mime": mime,
        "data_uri": f"data:{mime};base64,{encoded}",
    }
238
+
239
+
240
+ def load_benchmark(workspace: Path) -> dict | None:
241
+ """Load benchmark.json from workspace."""
242
+ benchmark_path = workspace / "benchmark.json"
243
+ if benchmark_path.exists():
244
+ try:
245
+ return json.loads(benchmark_path.read_text())
246
+ except (json.JSONDecodeError, OSError):
247
+ pass
248
+ return None
249
+
250
+
251
def generate_html(runs: list[dict], benchmark: dict | None, skill_name: str) -> str:
    """Generate the complete standalone HTML page with embedded data."""
    # Prefer the full viewer template shipped next to this script.
    template_path = Path(__file__).parent / "viewer.html"
    if template_path.exists():
        template = template_path.read_text()
    else:
        template = get_default_template()

    payload = {
        "skill_name": skill_name,
        "runs": runs,
        "benchmark": benchmark,
    }
    # indent=2 avoids extremely long lines that break VSCode tokenization;
    # ensure_ascii=False keeps Unicode characters readable in the output.
    data_json = json.dumps(payload, indent=2, ensure_ascii=False)

    # CRITICAL: escape </script> inside the JSON so the browser does not
    # terminate the script block when the data contains HTML examples.
    # JavaScript still parses "<\/script>" as "</script>".
    data_json = data_json.replace("</script>", "<\\/script>")

    return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};")
274
+
275
+
276
def get_default_template() -> str:
    """Return a minimal fallback HTML template when viewer.html is missing.

    The template contains the ``/*__EMBEDDED_DATA__*/`` placeholder that
    ``generate_html`` replaces with the embedded report data.
    NOTE(review): the summary table assumes ``time_seconds``/``tokens``
    entries carry a numeric ``mean`` — confirm against benchmark.json.
    """
    return '''<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Model Comparison Report</title>
  <style>
    * { box-sizing: border-box; margin: 0; padding: 0; }
    body { font-family: system-ui, -apple-system, sans-serif; background: #f5f5f5; padding: 2rem; }
    h1 { margin-bottom: 1rem; }
    .card { background: white; border-radius: 8px; padding: 1.5rem; margin-bottom: 1rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
    .model-badge { display: inline-block; padding: 0.25rem 0.75rem; border-radius: 9999px; font-size: 0.75rem; font-weight: 600; margin-left: 0.5rem; }
    .model-opus { background: #e3f2fd; color: #1565c0; }
    .model-sonnet { background: #fff3e0; color: #e65100; }
    .model-haiku { background: #e8f5e9; color: #2e7d32; }
    pre { background: #f5f5f5; padding: 1rem; border-radius: 4px; overflow-x: auto; font-size: 0.875rem; }
    table { width: 100%; border-collapse: collapse; }
    th, td { padding: 0.75rem; text-align: left; border-bottom: 1px solid #eee; }
    th { background: #f5f5f5; font-weight: 600; }
    .pass { color: #2e7d32; }
    .fail { color: #c62828; }
  </style>
</head>
<body>
  <h1>Model Comparison: <span id="skill-name"></span></h1>
  <div id="content"></div>
  <script>
    /*__EMBEDDED_DATA__*/

    document.getElementById("skill-name").textContent = EMBEDDED_DATA.skill_name;

    const content = document.getElementById("content");

    // Render benchmark summary if available
    if (EMBEDDED_DATA.benchmark) {
      const b = EMBEDDED_DATA.benchmark;
      const summary = b.model_summary || {};
      const models = Object.keys(summary);

      let html = '<div class="card"><h2>Summary</h2><table><thead><tr><th>Metric</th>';
      for (const m of models) {
        html += `<th>${m}</th>`;
      }
      html += '</tr></thead><tbody>';

      html += '<tr><td>Pass Rate</td>';
      for (const m of models) {
        const pr = summary[m].pass_rate || {};
        html += `<td>${(pr.mean * 100).toFixed(0)}% ± ${(pr.stddev * 100).toFixed(0)}%</td>`;
      }
      html += '</tr>';

      html += '<tr><td>Time (s)</td>';
      for (const m of models) {
        const t = summary[m].time_seconds || {};
        html += `<td>${t.mean.toFixed(1)}s</td>`;
      }
      html += '</tr>';

      html += '<tr><td>Tokens</td>';
      for (const m of models) {
        const tk = summary[m].tokens || {};
        html += `<td>${tk.mean.toFixed(0)}</td>`;
      }
      html += '</tr></tbody></table></div>';

      content.innerHTML += html;
    }

    // Render runs
    for (const run of EMBEDDED_DATA.runs) {
      let html = `<div class="card"><h3>Eval ${run.eval_id}: ${run.model}</h3>`;
      html += `<p><strong>Prompt:</strong> ${run.prompt}</p>`;

      if (run.grading) {
        const g = run.grading;
        html += `<p><strong>Pass Rate:</strong> <span class="${g.summary.pass_rate >= 0.7 ? 'pass' : 'fail'}">${(g.summary.pass_rate * 100).toFixed(0)}%</span></p>`;

        if (g.expectations && g.expectations.length > 0) {
          html += '<h4>Assertions</h4><ul>';
          for (const e of g.expectations) {
            html += `<li class="${e.passed ? 'pass' : 'fail'}">${e.passed ? '✓' : '✗'} ${e.text}</li>`;
          }
          html += '</ul>';
        }
      }

      html += '</div>';
      content.innerHTML += html;
    }
  </script>
</body>
</html>'''
371
+
372
+
373
class ReportHandler(BaseHTTPRequestHandler):
    """Serves the comparison HTML report, regenerated on every request."""

    def __init__(
        self,
        workspace: Path,
        skill_name: str,
        benchmark: dict | None,
        *args,
        **kwargs,
    ):
        # Stash report context BEFORE calling the base __init__, because
        # BaseHTTPRequestHandler handles the request inside __init__.
        self.workspace = workspace
        self.skill_name = skill_name
        self.benchmark = benchmark
        super().__init__(*args, **kwargs)

    def do_GET(self) -> None:
        """Serve the report at / (404 for any other path)."""
        if self.path not in ("/", "/index.html"):
            self.send_error(404)
            return
        runs = find_runs(self.workspace)
        page = generate_html(runs, self.benchmark, self.skill_name)
        body = page.encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format: str, *args: object) -> None:
        # Suppress per-request console logging.
        pass
404
+
405
+
406
def _kill_port(port: int) -> None:
    """Best-effort: SIGTERM any process listening on *port*.

    Silently does nothing when ``lsof`` is unavailable or times out.
    """
    try:
        result = subprocess.run(
            ["lsof", "-ti", f":{port}"],
            capture_output=True, text=True, timeout=5,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return
    listed = result.stdout.strip()
    for pid_token in listed.split("\n"):
        pid_token = pid_token.strip()
        if not pid_token:
            continue
        try:
            os.kill(int(pid_token), signal.SIGTERM)
        except (ProcessLookupError, ValueError):
            pass
    if listed:
        # Give the killed process a moment to release the port.
        time.sleep(0.5)
425
+
426
+
427
def main() -> None:
    """CLI entry point: build the comparison report, then write or serve it.

    Modes:
      --output PATH  write the HTML to PATH and exit
      --serve        serve the report over HTTP (regenerated per request)
      (default)      write to <workspace>/report.html
    """
    parser = argparse.ArgumentParser(description="Generate model comparison report")
    parser.add_argument("workspace", type=Path, help="Path to workspace directory")
    parser.add_argument("--port", "-p", type=int, default=3118, help="Server port (default: 3118)")
    parser.add_argument("--output", "-o", type=Path, default=None, help="Output HTML file path")
    parser.add_argument("--serve", "-s", action="store_true", help="Serve report via HTTP")
    parser.add_argument("--name", "-n", type=str, default=None, help="Skill name for header")
    args = parser.parse_args()

    workspace = args.workspace.resolve()
    if not workspace.is_dir():
        print(f"Error: {workspace} is not a directory", file=sys.stderr)
        sys.exit(1)

    runs = find_runs(workspace)
    benchmark = load_benchmark(workspace)
    skill_name = args.name or workspace.name.replace("-eval-workspace", "")

    if args.output:
        html = generate_html(runs, benchmark, skill_name)
        args.output.parent.mkdir(parents=True, exist_ok=True)
        # BUG FIX: write UTF-8 explicitly.  write_text() otherwise uses the
        # locale encoding, which fails on non-ASCII report content (the data
        # is embedded with ensure_ascii=False) on e.g. Windows/cp1252.
        args.output.write_text(html, encoding="utf-8")
        print(f"\n Report written to: {args.output}\n")
        sys.exit(0)

    if args.serve:
        port = args.port
        _kill_port(port)
        handler = partial(ReportHandler, workspace, skill_name, benchmark)
        try:
            server = HTTPServer(("127.0.0.1", port), handler)
        except OSError:
            # Port still busy — let the OS pick a free one.
            server = HTTPServer(("127.0.0.1", 0), handler)
            port = server.server_address[1]

        url = f"http://localhost:{port}"
        print("\n Model Comparison Report")
        print(" ─────────────────────────────────")
        print(f" URL: {url}")
        print(f" Workspace: {workspace}")
        print("\n Press Ctrl+C to stop.\n")

        webbrowser.open(url)

        try:
            server.serve_forever()
        except KeyboardInterrupt:
            print("\nStopped.")
            server.server_close()
    else:
        # Default: write to workspace/report.html (UTF-8, see above).
        output = workspace / "report.html"
        html = generate_html(runs, benchmark, skill_name)
        output.write_text(html, encoding="utf-8")
        print(f"\n Report written to: {output}\n")


if __name__ == "__main__":
    main()