astron-eval 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +119 -0
- package/bin/astron-eval.mjs +111 -0
- package/package.json +24 -0
- package/skills/astron-eval/SKILL.md +60 -0
- package/skills/model-evaluation/SKILL.md +180 -0
- package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/262/276/347/241/256/347/273/264/345/272/246.json +19 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/346/227/205/346/270/270/345/207/272/350/241/214.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/210/233/346/204/217/346/200/247-/345/220/270/345/274/225/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//345/210/233/346/226/260/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/275/242/345/274/217/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/277/240/350/257/232/345/272/246/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/214/207/344/273/244/351/201/265/345/276/252/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/226/207/346/234/254/345/267/256/345/274/202/345/272/246-TER/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//346/240/270/345/277/203/345/205/203/347/264/240/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/240/274/345/274/217/351/201/265/345/276/252/347/273/264/345/272/246.json +19 -0
- package/skills/model-evaluation/assets/dimensions//347/211/271/350/211/262/344/272/256/347/202/271/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/224/250/344/276/213/347/272/247/350/257/204/346/265/213/347/273/264/345/272/246/346/250/241/346/235/277.json +25 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-BERTScore/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-Cosine/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-ROUGE/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-BLUE/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-COMET/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/345/220/210/347/220/206/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/eval-judge.json +11 -0
- package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
- package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
- package/skills/model-evaluation/assets/experts/content-match.json +37 -0
- package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
- package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
- package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
- package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
- package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
- package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
- package/skills/model-evaluation/eval-build.md +281 -0
- package/skills/model-evaluation/eval-execute.md +196 -0
- package/skills/model-evaluation/eval-init.md +237 -0
- package/skills/model-evaluation/processes/dimension-process.md +207 -0
- package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
- package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
- package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
- package/skills/model-evaluation/processes/keypoint-process.md +148 -0
- package/skills/model-evaluation/processes/python-env-process.md +113 -0
- package/skills/model-evaluation/references//344/270/255/351/227/264/344/272/247/347/211/251/350/257/264/346/230/216.md +340 -0
- package/skills/model-evaluation/references//345/206/205/347/275/256/346/250/241/346/235/277/350/257/264/346/230/216.md +149 -0
- package/skills/model-evaluation/references//350/204/232/346/234/254/345/256/232/344/271/211.md +274 -0
- package/skills/model-evaluation/references//350/256/244/350/257/201/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +271 -0
- package/skills/model-evaluation/references//350/257/204/346/265/213/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +455 -0
- package/skills/model-evaluation/references//350/257/204/346/265/213/347/273/264/345/272/246/350/257/264/346/230/216.md +171 -0
- package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
- package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
- package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
- package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
- package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
- package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
- package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
- package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
- package/skills/model-evaluation/scripts/eval_auth.py +588 -0
- package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
- package/skills/model-evaluation/scripts/eval_set.py +410 -0
- package/skills/model-evaluation/scripts/eval_task.py +324 -0
- package/skills/model-evaluation/scripts/files/__init__.py +38 -0
- package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
- package/skills/model-evaluation/scripts/files/streaming.py +245 -0
- package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
- package/skills/model-evaluation/scripts/utils/constants.py +101 -0
- package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
- package/skills/model-evaluation/scripts/utils/errors.py +244 -0
- package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
- package/skills/skill-driven-eval/SKILL.md +456 -0
- package/skills/skill-driven-eval/agents/grader.md +144 -0
- package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
- package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
- package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
- package/skills/skill-driven-eval/references/schemas.md +282 -0
- package/skills/skill-driven-eval/scripts/__init__.py +1 -0
- package/skills/skill-driven-eval/scripts/__main__.py +70 -0
- package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
- package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
- package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Generate a visual comparison report for model evaluation results.
|
|
3
|
+
|
|
4
|
+
Reads the workspace directory, discovers runs, and generates a self-contained
|
|
5
|
+
HTML page showing the model comparison results.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python generate_report.py <workspace-path> [--output report.html]
|
|
9
|
+
python generate_report.py <workspace-path> [--serve] [--port PORT]
|
|
10
|
+
|
|
11
|
+
No dependencies beyond the Python stdlib are required.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import base64
|
|
16
|
+
import json
|
|
17
|
+
import mimetypes
|
|
18
|
+
import os
|
|
19
|
+
import signal
|
|
20
|
+
import subprocess
|
|
21
|
+
import sys
|
|
22
|
+
import time
|
|
23
|
+
import webbrowser
|
|
24
|
+
from functools import partial
|
|
25
|
+
from http.server import HTTPServer, BaseHTTPRequestHandler
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
# Files to exclude from output listings (run-level metadata, not model output)
METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"}

# Extensions we render as inline text (read with read_text and embedded verbatim)
TEXT_EXTENSIONS = {
    ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx",
    ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs",
    ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml",
}

# Extensions we render as inline images (embedded as base64 data URIs)
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"}
|
|
40
|
+
|
|
41
|
+
# MIME type overrides for common types
MIME_OVERRIDES = {
    ".svg": "image/svg+xml",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
}


def get_mime_type(path: Path) -> str:
    """Return the MIME type for *path*, preferring explicit overrides.

    Falls back to mimetypes.guess_type, and finally to the generic
    application/octet-stream when the extension is unknown.
    """
    override = MIME_OVERRIDES.get(path.suffix.lower())
    if override is not None:
        return override
    guessed, _encoding = mimetypes.guess_type(str(path))
    return guessed if guessed else "application/octet-stream"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def load_evals_config(workspace: Path) -> dict:
    """Load evals.json from workspace to get prompts.

    Returns an empty dict when the file is missing, unreadable, or not
    valid JSON.
    """
    try:
        return json.loads((workspace / "evals.json").read_text())
    except (json.JSONDecodeError, OSError):
        # Missing file, permission problem, or malformed JSON: report "no config".
        return {}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def load_mapping(workspace: Path) -> dict:
    """Load mapping.json to map run IDs to eval info.

    Returns an empty dict when the file is missing, unreadable, or not
    valid JSON.
    """
    mapping_file = workspace / "mapping.json"
    if not mapping_file.exists():
        return {}
    try:
        return json.loads(mapping_file.read_text())
    except (json.JSONDecodeError, OSError):
        return {}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def find_runs(workspace: Path) -> list[dict]:
    """Recursively find directories that contain an outputs/ subdirectory.

    Returns one dict per run (see build_run), sorted by eval id, then
    model name, then run id.
    """
    # Prompts live in evals.json; mapping.json links run dirs to eval info.
    evals_config = load_evals_config(workspace)
    mapping = load_mapping(workspace)

    # Build eval_id -> prompt mapping
    eval_prompts = {}
    for eval_item in evals_config.get("evals", []):
        eval_prompts[eval_item.get("id")] = eval_item.get("prompt", "")

    runs: list[dict] = []
    _find_runs_recursive(workspace, workspace, runs, mapping, eval_prompts)

    def _sort_key(run: dict) -> tuple:
        # BUG FIX: build_run always stores an "eval_id" key (possibly None),
        # so the previous r.get("eval_id", float("inf")) default never applied
        # and sorting a mix of None and real ids raised TypeError. Rank ids by
        # type so numeric, string, and missing ids never compare directly:
        # numbers first (in numeric order), then strings, then None last.
        eval_id = run.get("eval_id")
        if isinstance(eval_id, (int, float)):
            id_key = (0, eval_id, "")
        elif eval_id is not None:
            id_key = (1, 0, str(eval_id))
        else:
            id_key = (2, 0, "")
        return (id_key, run.get("model", ""), run["id"])

    runs.sort(key=_sort_key)
    return runs
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _find_runs_recursive(root: Path, current: Path, runs: list[dict], mapping: dict = None, eval_prompts: dict = None) -> None:
    """Depth-first walk appending a run dict for every directory that holds outputs/."""
    if not current.is_dir():
        return

    if (current / "outputs").is_dir():
        # This directory is itself a run; record it and stop descending.
        run = build_run(root, current, mapping, eval_prompts)
        if run:
            runs.append(run)
        return

    # Directories that can never contain runs (or would make the walk explode).
    ignored = {"node_modules", ".git", "__pycache__", "skill", "inputs"}
    for entry in sorted(current.iterdir()):
        if entry.is_dir() and entry.name not in ignored:
            _find_runs_recursive(root, entry, runs, mapping, eval_prompts)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def build_run(root: Path, run_dir: Path, mapping: dict | None = None, eval_prompts: dict | None = None) -> dict | None:
    """Build a run dict with prompt, outputs, and grading data.

    Resolution order for the prompt/model metadata:
      1. mapping.json entry keyed by the run *directory name* (blind mode),
      2. eval_metadata.json in the parent directory,
      3. timing.json inside the run directory (model name only).

    Returns a dict with keys: id, prompt, eval_id, eval_name, model,
    outputs, grading, timing. Never returns None in the current code path,
    but the Optional return type is kept for callers that check it.
    """
    prompt = ""
    eval_id = None
    eval_name = None
    model = run_dir.name  # Model name is directory name
    # NOTE: run_id is the bare directory name here; it is only used for the
    # mapping.json lookup below and is re-derived from the path later.
    run_id = run_dir.name

    # First try mapping.json for run info (blind evaluation mode)
    if mapping and run_id in mapping:
        run_info = mapping[run_id]
        model = run_info.get("model", model)
        eval_id = run_info.get("eval_id")
        eval_name = run_info.get("eval_name")
        # Get prompt from eval_prompts using eval_id
        if eval_prompts and eval_id in eval_prompts:
            prompt = eval_prompts[eval_id]

    # Fallback: Try eval_metadata.json from parent
    if not prompt:
        metadata_path = run_dir.parent / "eval_metadata.json"
        if metadata_path.exists():
            try:
                metadata = json.loads(metadata_path.read_text())
                prompt = metadata.get("prompt", "")
                # May overwrite an eval_id from mapping.json with None if the
                # metadata file lacks the key — presumably intentional, since
                # this branch only runs when mapping gave no prompt.
                eval_id = metadata.get("eval_id")
            except (json.JSONDecodeError, OSError):
                pass

    # Fallback: Try timing.json for model info
    if model == run_dir.name:  # Still using directory name
        timing_path = run_dir / "timing.json"
        if timing_path.exists():
            try:
                timing = json.loads(timing_path.read_text())
                if timing.get("model"):
                    model = timing["model"]
            except (json.JSONDecodeError, OSError):
                pass

    if not prompt:
        prompt = "(No prompt found)"

    # Final run id: workspace-relative path with separators normalized to "-",
    # which keeps ids unique when several evals share a model directory name.
    run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-")

    # Define timing_path early (used in multiple places)
    # (Re-assigned unconditionally: the fallback branch above only defines it
    # when the model was still the directory name.)
    timing_path = run_dir / "timing.json"

    # Collect output files
    outputs_dir = run_dir / "outputs"
    output_files: list[dict] = []
    if outputs_dir.is_dir():
        for f in sorted(outputs_dir.iterdir()):
            # Skip run metadata; embed everything else (text/image/binary).
            if f.is_file() and f.name not in METADATA_FILES:
                output_files.append(embed_file(f))

    # Load grading if present
    grading = None
    grading_path = run_dir / "grading.json"
    if grading_path.exists():
        try:
            grading = json.loads(grading_path.read_text())
        except (json.JSONDecodeError, OSError):
            pass

    # Load timing if present
    timing = None
    if timing_path.exists():
        try:
            timing = json.loads(timing_path.read_text())
        except (json.JSONDecodeError, OSError):
            pass

    return {
        "id": run_id,
        "prompt": prompt,
        "eval_id": eval_id,
        "eval_name": eval_name,
        "model": model,
        "outputs": output_files,
        "grading": grading,
        "timing": timing,
    }
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def embed_file(path: Path) -> dict:
    """Read a file and return an embedded representation.

    Text extensions are embedded verbatim; images and unknown binaries are
    embedded as base64 data URIs. Read failures yield an "error" entry
    (for text files, an inline placeholder) instead of raising.
    """
    ext = path.suffix.lower()
    mime = get_mime_type(path)

    if ext in TEXT_EXTENSIONS:
        try:
            text = path.read_text(errors="replace")
        except OSError:
            text = "(Error reading file)"
        return {
            "name": path.name,
            "type": "text",
            "content": text,
        }

    # Images and unknown binaries share the same base64 embedding; only the
    # "type" tag differs so the viewer can choose inline <img> vs download link.
    try:
        encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    except OSError:
        return {"name": path.name, "type": "error", "content": "(Error reading file)"}

    return {
        "name": path.name,
        "type": "image" if ext in IMAGE_EXTENSIONS else "binary",
        "mime": mime,
        "data_uri": f"data:{mime};base64,{encoded}",
    }
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def load_benchmark(workspace: Path) -> dict | None:
|
|
241
|
+
"""Load benchmark.json from workspace."""
|
|
242
|
+
benchmark_path = workspace / "benchmark.json"
|
|
243
|
+
if benchmark_path.exists():
|
|
244
|
+
try:
|
|
245
|
+
return json.loads(benchmark_path.read_text())
|
|
246
|
+
except (json.JSONDecodeError, OSError):
|
|
247
|
+
pass
|
|
248
|
+
return None
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def generate_html(runs: list[dict], benchmark: dict | None, skill_name: str) -> str:
    """Generate the complete standalone HTML page with embedded data.

    Uses viewer.html next to this script as the template when available,
    otherwise a minimal built-in fallback template.
    """
    template_file = Path(__file__).parent / "viewer.html"
    if template_file.exists():
        template = template_file.read_text()
    else:
        template = get_default_template()

    payload = {
        "skill_name": skill_name,
        "runs": runs,
        "benchmark": benchmark,
    }
    # indent=2 avoids extremely long lines that break VSCode tokenization;
    # ensure_ascii=False keeps Unicode characters readable.
    data_json = json.dumps(payload, indent=2, ensure_ascii=False)

    # CRITICAL: escape closing script tags inside the data so embedded HTML
    # samples cannot prematurely terminate the <script> block. JavaScript's
    # string parser still reads "<\/script>" as "</script>".
    data_json = data_json.replace("</script>", "<\\/script>")

    return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};")
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def get_default_template() -> str:
    """Return a minimal HTML template if viewer.html doesn't exist.

    The template contains a "/*__EMBEDDED_DATA__*/" marker that
    generate_html replaces with a "const EMBEDDED_DATA = ...;" statement.
    """
    # NOTE(review): the fallback JS assumes benchmark stats objects always
    # carry numeric mean/stddev fields — `t.mean.toFixed(1)` will throw if
    # time_seconds/tokens is an empty object; confirm upstream guarantees.
    # NOTE(review): run.prompt and e.text are interpolated into innerHTML
    # unescaped — presumably trusted local data; verify before serving wider.
    return '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Model Comparison Report</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: system-ui, -apple-system, sans-serif; background: #f5f5f5; padding: 2rem; }
h1 { margin-bottom: 1rem; }
.card { background: white; border-radius: 8px; padding: 1.5rem; margin-bottom: 1rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
.model-badge { display: inline-block; padding: 0.25rem 0.75rem; border-radius: 9999px; font-size: 0.75rem; font-weight: 600; margin-left: 0.5rem; }
.model-opus { background: #e3f2fd; color: #1565c0; }
.model-sonnet { background: #fff3e0; color: #e65100; }
.model-haiku { background: #e8f5e9; color: #2e7d32; }
pre { background: #f5f5f5; padding: 1rem; border-radius: 4px; overflow-x: auto; font-size: 0.875rem; }
table { width: 100%; border-collapse: collapse; }
th, td { padding: 0.75rem; text-align: left; border-bottom: 1px solid #eee; }
th { background: #f5f5f5; font-weight: 600; }
.pass { color: #2e7d32; }
.fail { color: #c62828; }
</style>
</head>
<body>
<h1>Model Comparison: <span id="skill-name"></span></h1>
<div id="content"></div>
<script>
/*__EMBEDDED_DATA__*/

document.getElementById("skill-name").textContent = EMBEDDED_DATA.skill_name;

const content = document.getElementById("content");

// Render benchmark summary if available
if (EMBEDDED_DATA.benchmark) {
const b = EMBEDDED_DATA.benchmark;
const summary = b.model_summary || {};
const models = Object.keys(summary);

let html = '<div class="card"><h2>Summary</h2><table><thead><tr><th>Metric</th>';
for (const m of models) {
html += `<th>${m}</th>`;
}
html += '</tr></thead><tbody>';

html += '<tr><td>Pass Rate</td>';
for (const m of models) {
const pr = summary[m].pass_rate || {};
html += `<td>${(pr.mean * 100).toFixed(0)}% ± ${(pr.stddev * 100).toFixed(0)}%</td>`;
}
html += '</tr>';

html += '<tr><td>Time (s)</td>';
for (const m of models) {
const t = summary[m].time_seconds || {};
html += `<td>${t.mean.toFixed(1)}s</td>`;
}
html += '</tr>';

html += '<tr><td>Tokens</td>';
for (const m of models) {
const tk = summary[m].tokens || {};
html += `<td>${tk.mean.toFixed(0)}</td>`;
}
html += '</tr></tbody></table></div>';

content.innerHTML += html;
}

// Render runs
for (const run of EMBEDDED_DATA.runs) {
let html = `<div class="card"><h3>Eval ${run.eval_id}: ${run.model}</h3>`;
html += `<p><strong>Prompt:</strong> ${run.prompt}</p>`;

if (run.grading) {
const g = run.grading;
html += `<p><strong>Pass Rate:</strong> <span class="${g.summary.pass_rate >= 0.7 ? 'pass' : 'fail'}">${(g.summary.pass_rate * 100).toFixed(0)}%</span></p>`;

if (g.expectations && g.expectations.length > 0) {
html += '<h4>Assertions</h4><ul>';
for (const e of g.expectations) {
html += `<li class="${e.passed ? 'pass' : 'fail'}">${e.passed ? '✓' : '✗'} ${e.text}</li>`;
}
html += '</ul>';
}
}

html += '</div>';
content.innerHTML += html;
}
</script>
</body>
</html>'''
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
class ReportHandler(BaseHTTPRequestHandler):
    """Serves the comparison HTML report.

    Regenerates the page from the workspace on every request, so a browser
    refresh picks up new runs without restarting the server.
    """

    def __init__(
        self,
        workspace: Path,
        skill_name: str,
        benchmark: dict | None,
        *args,
        **kwargs,
    ):
        self.workspace = workspace
        self.skill_name = skill_name
        self.benchmark = benchmark
        # BaseHTTPRequestHandler handles the request inside __init__,
        # so our attributes must be set before chaining up.
        super().__init__(*args, **kwargs)

    def do_GET(self) -> None:
        if self.path not in ("/", "/index.html"):
            self.send_error(404)
            return
        page = generate_html(find_runs(self.workspace), self.benchmark, self.skill_name)
        body = page.encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format: str, *args: object) -> None:
        # Silence per-request stderr logging.
        pass
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _kill_port(port: int) -> None:
|
|
407
|
+
"""Kill any process listening on the given port."""
|
|
408
|
+
try:
|
|
409
|
+
result = subprocess.run(
|
|
410
|
+
["lsof", "-ti", f":{port}"],
|
|
411
|
+
capture_output=True, text=True, timeout=5,
|
|
412
|
+
)
|
|
413
|
+
for pid_str in result.stdout.strip().split("\n"):
|
|
414
|
+
if pid_str.strip():
|
|
415
|
+
try:
|
|
416
|
+
os.kill(int(pid_str.strip()), signal.SIGTERM)
|
|
417
|
+
except (ProcessLookupError, ValueError):
|
|
418
|
+
pass
|
|
419
|
+
if result.stdout.strip():
|
|
420
|
+
time.sleep(0.5)
|
|
421
|
+
except subprocess.TimeoutExpired:
|
|
422
|
+
pass
|
|
423
|
+
except FileNotFoundError:
|
|
424
|
+
pass
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def main() -> None:
    """CLI entry point: write the report to a file or serve it over HTTP."""
    parser = argparse.ArgumentParser(description="Generate model comparison report")
    parser.add_argument("workspace", type=Path, help="Path to workspace directory")
    parser.add_argument("--port", "-p", type=int, default=3118, help="Server port (default: 3118)")
    parser.add_argument("--output", "-o", type=Path, default=None, help="Output HTML file path")
    parser.add_argument("--serve", "-s", action="store_true", help="Serve report via HTTP")
    parser.add_argument("--name", "-n", type=str, default=None, help="Skill name for header")
    args = parser.parse_args()

    workspace = args.workspace.resolve()
    if not workspace.is_dir():
        print(f"Error: {workspace} is not a directory", file=sys.stderr)
        sys.exit(1)

    runs = find_runs(workspace)
    benchmark = load_benchmark(workspace)
    skill_name = args.name or workspace.name.replace("-eval-workspace", "")

    # --output takes precedence over --serve.
    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(generate_html(runs, benchmark, skill_name))
        print(f"\n Report written to: {args.output}\n")
        sys.exit(0)

    if not args.serve:
        # Default: write to workspace/report.html
        report_path = workspace / "report.html"
        report_path.write_text(generate_html(runs, benchmark, skill_name))
        print(f"\n Report written to: {report_path}\n")
        return

    port = args.port
    _kill_port(port)
    handler = partial(ReportHandler, workspace, skill_name, benchmark)
    try:
        server = HTTPServer(("127.0.0.1", port), handler)
    except OSError:
        # Requested port still unavailable; let the OS pick a free one.
        server = HTTPServer(("127.0.0.1", 0), handler)
        port = server.server_address[1]

    url = f"http://localhost:{port}"
    print("\n Model Comparison Report")
    print(" ─────────────────────────────────")
    print(f" URL: {url}")
    print(f" Workspace: {workspace}")
    print("\n Press Ctrl+C to stop.\n")

    webbrowser.open(url)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nStopped.")
        server.server_close()


if __name__ == "__main__":
    main()
|