astron-eval 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +119 -0
- package/bin/astron-eval.mjs +111 -0
- package/package.json +24 -0
- package/skills/astron-eval/SKILL.md +60 -0
- package/skills/model-evaluation/SKILL.md +180 -0
- package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/262/276/347/241/256/347/273/264/345/272/246.json +19 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/346/227/205/346/270/270/345/207/272/350/241/214.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/210/233/346/204/217/346/200/247-/345/220/270/345/274/225/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//345/210/233/346/226/260/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/275/242/345/274/217/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/277/240/350/257/232/345/272/246/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/214/207/344/273/244/351/201/265/345/276/252/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/226/207/346/234/254/345/267/256/345/274/202/345/272/246-TER/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//346/240/270/345/277/203/345/205/203/347/264/240/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/240/274/345/274/217/351/201/265/345/276/252/347/273/264/345/272/246.json +19 -0
- package/skills/model-evaluation/assets/dimensions//347/211/271/350/211/262/344/272/256/347/202/271/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/224/250/344/276/213/347/272/247/350/257/204/346/265/213/347/273/264/345/272/246/346/250/241/346/235/277.json +25 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-BERTScore/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-Cosine/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-ROUGE/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-BLUE/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-COMET/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/345/220/210/347/220/206/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/eval-judge.json +11 -0
- package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
- package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
- package/skills/model-evaluation/assets/experts/content-match.json +37 -0
- package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
- package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
- package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
- package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
- package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
- package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
- package/skills/model-evaluation/eval-build.md +281 -0
- package/skills/model-evaluation/eval-execute.md +196 -0
- package/skills/model-evaluation/eval-init.md +237 -0
- package/skills/model-evaluation/processes/dimension-process.md +207 -0
- package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
- package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
- package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
- package/skills/model-evaluation/processes/keypoint-process.md +148 -0
- package/skills/model-evaluation/processes/python-env-process.md +113 -0
- package/skills/model-evaluation/references//344/270/255/351/227/264/344/272/247/347/211/251/350/257/264/346/230/216.md +340 -0
- package/skills/model-evaluation/references//345/206/205/347/275/256/346/250/241/346/235/277/350/257/264/346/230/216.md +149 -0
- package/skills/model-evaluation/references//350/204/232/346/234/254/345/256/232/344/271/211.md +274 -0
- package/skills/model-evaluation/references//350/256/244/350/257/201/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +271 -0
- package/skills/model-evaluation/references//350/257/204/346/265/213/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +455 -0
- package/skills/model-evaluation/references//350/257/204/346/265/213/347/273/264/345/272/246/350/257/264/346/230/216.md +171 -0
- package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
- package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
- package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
- package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
- package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
- package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
- package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
- package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
- package/skills/model-evaluation/scripts/eval_auth.py +588 -0
- package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
- package/skills/model-evaluation/scripts/eval_set.py +410 -0
- package/skills/model-evaluation/scripts/eval_task.py +324 -0
- package/skills/model-evaluation/scripts/files/__init__.py +38 -0
- package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
- package/skills/model-evaluation/scripts/files/streaming.py +245 -0
- package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
- package/skills/model-evaluation/scripts/utils/constants.py +101 -0
- package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
- package/skills/model-evaluation/scripts/utils/errors.py +244 -0
- package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
- package/skills/skill-driven-eval/SKILL.md +456 -0
- package/skills/skill-driven-eval/agents/grader.md +144 -0
- package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
- package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
- package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
- package/skills/skill-driven-eval/references/schemas.md +282 -0
- package/skills/skill-driven-eval/scripts/__init__.py +1 -0
- package/skills/skill-driven-eval/scripts/__main__.py +70 -0
- package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
- package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
- package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
|
@@ -0,0 +1,767 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>Model Comparison Report</title>
|
|
7
|
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
|
8
|
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
9
|
+
<link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
|
|
10
|
+
<style>
|
|
11
|
+
:root {
|
|
12
|
+
--bg: #faf9f5;
|
|
13
|
+
--surface: #ffffff;
|
|
14
|
+
--border: #e8e6dc;
|
|
15
|
+
--text: #141413;
|
|
16
|
+
--text-muted: #b0aea5;
|
|
17
|
+
--accent: #d97757;
|
|
18
|
+
--accent-hover: #c4613f;
|
|
19
|
+
--green: #788c5d;
|
|
20
|
+
--green-bg: #eef2e8;
|
|
21
|
+
--red: #c44;
|
|
22
|
+
--red-bg: #fceaea;
|
|
23
|
+
--header-bg: #141413;
|
|
24
|
+
--header-text: #faf9f5;
|
|
25
|
+
--radius: 6px;
|
|
26
|
+
--opus-color: #1976d2;
|
|
27
|
+
--opus-bg: rgba(25, 118, 210, 0.1);
|
|
28
|
+
--sonnet-color: #f57f17;
|
|
29
|
+
--sonnet-bg: rgba(245, 127, 23, 0.1);
|
|
30
|
+
--haiku-color: #2e7d32;
|
|
31
|
+
--haiku-bg: rgba(46, 125, 50, 0.1);
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
* { box-sizing: border-box; margin: 0; padding: 0; }
|
|
35
|
+
|
|
36
|
+
body {
|
|
37
|
+
font-family: 'Lora', Georgia, serif;
|
|
38
|
+
background: var(--bg);
|
|
39
|
+
color: var(--text);
|
|
40
|
+
min-height: 100vh;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
/* ---- Header ---- */
|
|
44
|
+
.header {
|
|
45
|
+
background: var(--header-bg);
|
|
46
|
+
color: var(--header-text);
|
|
47
|
+
padding: 1rem 2rem;
|
|
48
|
+
display: flex;
|
|
49
|
+
justify-content: space-between;
|
|
50
|
+
align-items: center;
|
|
51
|
+
}
|
|
52
|
+
.header h1 {
|
|
53
|
+
font-family: 'Poppins', sans-serif;
|
|
54
|
+
font-size: 1.25rem;
|
|
55
|
+
font-weight: 600;
|
|
56
|
+
}
|
|
57
|
+
.header .metadata {
|
|
58
|
+
font-size: 0.8rem;
|
|
59
|
+
opacity: 0.7;
|
|
60
|
+
margin-top: 0.25rem;
|
|
61
|
+
}
|
|
62
|
+
.header .note {
|
|
63
|
+
font-size: 0.75rem;
|
|
64
|
+
opacity: 0.6;
|
|
65
|
+
font-style: italic;
|
|
66
|
+
margin-top: 0.5rem;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
/* ---- Main content ---- */
|
|
70
|
+
.main {
|
|
71
|
+
padding: 1.5rem 2rem;
|
|
72
|
+
max-width: 1200px;
|
|
73
|
+
margin: 0 auto;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/* ---- Sections ---- */
|
|
77
|
+
.section {
|
|
78
|
+
background: var(--surface);
|
|
79
|
+
border: 1px solid var(--border);
|
|
80
|
+
border-radius: var(--radius);
|
|
81
|
+
margin-bottom: 1.5rem;
|
|
82
|
+
}
|
|
83
|
+
.section-header {
|
|
84
|
+
font-family: 'Poppins', sans-serif;
|
|
85
|
+
padding: 0.75rem 1rem;
|
|
86
|
+
font-size: 0.75rem;
|
|
87
|
+
font-weight: 500;
|
|
88
|
+
text-transform: uppercase;
|
|
89
|
+
letter-spacing: 0.05em;
|
|
90
|
+
color: var(--text-muted);
|
|
91
|
+
border-bottom: 1px solid var(--border);
|
|
92
|
+
background: var(--bg);
|
|
93
|
+
}
|
|
94
|
+
.section-body {
|
|
95
|
+
padding: 1rem;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
/* ---- Model badge ---- */
|
|
99
|
+
.model-badge {
|
|
100
|
+
display: inline-block;
|
|
101
|
+
padding: 0.2rem 0.625rem;
|
|
102
|
+
border-radius: 9999px;
|
|
103
|
+
font-family: 'Poppins', sans-serif;
|
|
104
|
+
font-size: 0.6875rem;
|
|
105
|
+
font-weight: 600;
|
|
106
|
+
text-transform: uppercase;
|
|
107
|
+
letter-spacing: 0.03em;
|
|
108
|
+
margin-left: 0.5rem;
|
|
109
|
+
vertical-align: middle;
|
|
110
|
+
}
|
|
111
|
+
.model-badge.model-opus { background: var(--opus-bg); color: var(--opus-color); }
|
|
112
|
+
.model-badge.model-sonnet { background: var(--sonnet-bg); color: var(--sonnet-color); }
|
|
113
|
+
.model-badge.model-haiku { background: var(--haiku-bg); color: var(--haiku-color); }
|
|
114
|
+
|
|
115
|
+
/* ---- Summary Table ---- */
|
|
116
|
+
.summary-table {
|
|
117
|
+
width: 100%;
|
|
118
|
+
border-collapse: collapse;
|
|
119
|
+
font-size: 0.875rem;
|
|
120
|
+
}
|
|
121
|
+
.summary-table th, .summary-table td {
|
|
122
|
+
padding: 0.75rem 1rem;
|
|
123
|
+
text-align: left;
|
|
124
|
+
border-bottom: 1px solid var(--border);
|
|
125
|
+
}
|
|
126
|
+
.summary-table th {
|
|
127
|
+
font-family: 'Poppins', sans-serif;
|
|
128
|
+
font-weight: 500;
|
|
129
|
+
background: var(--bg);
|
|
130
|
+
}
|
|
131
|
+
.summary-table tr:last-child td {
|
|
132
|
+
border-bottom: none;
|
|
133
|
+
}
|
|
134
|
+
.summary-table .metric-label {
|
|
135
|
+
font-weight: 600;
|
|
136
|
+
}
|
|
137
|
+
.summary-table .positive { color: var(--green); font-weight: 600; }
|
|
138
|
+
.summary-table .negative { color: var(--red); font-weight: 600; }
|
|
139
|
+
|
|
140
|
+
/* ---- Per-eval grid ---- */
|
|
141
|
+
.eval-grid {
|
|
142
|
+
display: grid;
|
|
143
|
+
grid-template-columns: repeat(auto-fit, minmax(350px, 1fr));
|
|
144
|
+
gap: 1rem;
|
|
145
|
+
}
|
|
146
|
+
.eval-card {
|
|
147
|
+
background: var(--surface);
|
|
148
|
+
border: 1px solid var(--border);
|
|
149
|
+
border-radius: var(--radius);
|
|
150
|
+
overflow: hidden;
|
|
151
|
+
}
|
|
152
|
+
.eval-card-header {
|
|
153
|
+
background: var(--bg);
|
|
154
|
+
padding: 0.75rem 1rem;
|
|
155
|
+
border-bottom: 1px solid var(--border);
|
|
156
|
+
font-family: 'Poppins', sans-serif;
|
|
157
|
+
font-weight: 500;
|
|
158
|
+
font-size: 0.875rem;
|
|
159
|
+
}
|
|
160
|
+
.eval-card-body {
|
|
161
|
+
padding: 1rem;
|
|
162
|
+
}
|
|
163
|
+
.model-result {
|
|
164
|
+
display: flex;
|
|
165
|
+
justify-content: space-between;
|
|
166
|
+
align-items: center;
|
|
167
|
+
padding: 0.5rem 0;
|
|
168
|
+
border-bottom: 1px solid var(--border);
|
|
169
|
+
}
|
|
170
|
+
.model-result:last-child {
|
|
171
|
+
border-bottom: none;
|
|
172
|
+
}
|
|
173
|
+
.model-result .model-name {
|
|
174
|
+
display: flex;
|
|
175
|
+
align-items: center;
|
|
176
|
+
}
|
|
177
|
+
.model-result .stats {
|
|
178
|
+
display: flex;
|
|
179
|
+
gap: 1rem;
|
|
180
|
+
font-size: 0.8rem;
|
|
181
|
+
color: var(--text-muted);
|
|
182
|
+
}
|
|
183
|
+
.model-result .pass-rate {
|
|
184
|
+
font-weight: 600;
|
|
185
|
+
}
|
|
186
|
+
.model-result .pass-rate.high { color: var(--green); }
|
|
187
|
+
.model-result .pass-rate.medium { color: var(--sonnet-color); }
|
|
188
|
+
.model-result .pass-rate.low { color: var(--red); }
|
|
189
|
+
|
|
190
|
+
/* ---- Assertions ---- */
|
|
191
|
+
.assertion-list {
|
|
192
|
+
list-style: none;
|
|
193
|
+
font-size: 0.8125rem;
|
|
194
|
+
}
|
|
195
|
+
.assertion-item {
|
|
196
|
+
padding: 0.5rem 0;
|
|
197
|
+
border-bottom: 1px solid var(--border);
|
|
198
|
+
display: flex;
|
|
199
|
+
align-items: flex-start;
|
|
200
|
+
gap: 0.5rem;
|
|
201
|
+
}
|
|
202
|
+
.assertion-item:last-child { border-bottom: none; }
|
|
203
|
+
.assertion-status {
|
|
204
|
+
font-weight: 600;
|
|
205
|
+
flex-shrink: 0;
|
|
206
|
+
}
|
|
207
|
+
.assertion-status.pass { color: var(--green); }
|
|
208
|
+
.assertion-status.fail { color: var(--red); }
|
|
209
|
+
.assertion-evidence {
|
|
210
|
+
color: var(--text-muted);
|
|
211
|
+
font-size: 0.75rem;
|
|
212
|
+
margin-top: 0.25rem;
|
|
213
|
+
padding-left: 1.5rem;
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
/* ---- Recommendations ---- */
|
|
217
|
+
.recommendation {
|
|
218
|
+
background: var(--bg);
|
|
219
|
+
border: 1px solid var(--border);
|
|
220
|
+
border-radius: var(--radius);
|
|
221
|
+
padding: 1rem;
|
|
222
|
+
margin-bottom: 0.75rem;
|
|
223
|
+
}
|
|
224
|
+
.recommendation .scenario {
|
|
225
|
+
font-family: 'Poppins', sans-serif;
|
|
226
|
+
font-weight: 600;
|
|
227
|
+
font-size: 0.875rem;
|
|
228
|
+
margin-bottom: 0.25rem;
|
|
229
|
+
}
|
|
230
|
+
.recommendation .model {
|
|
231
|
+
display: inline-block;
|
|
232
|
+
padding: 0.125rem 0.5rem;
|
|
233
|
+
border-radius: 4px;
|
|
234
|
+
font-size: 0.75rem;
|
|
235
|
+
font-weight: 600;
|
|
236
|
+
margin-right: 0.5rem;
|
|
237
|
+
}
|
|
238
|
+
.recommendation .reason {
|
|
239
|
+
font-size: 0.8125rem;
|
|
240
|
+
color: var(--text-muted);
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
/* ---- Notes ---- */
|
|
244
|
+
.notes-list {
|
|
245
|
+
list-style: disc;
|
|
246
|
+
padding-left: 1.25rem;
|
|
247
|
+
font-size: 0.875rem;
|
|
248
|
+
}
|
|
249
|
+
.notes-list li {
|
|
250
|
+
margin-bottom: 0.5rem;
|
|
251
|
+
line-height: 1.5;
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
/* ---- View tabs ---- */
|
|
255
|
+
.view-tabs {
|
|
256
|
+
display: flex;
|
|
257
|
+
gap: 0;
|
|
258
|
+
padding: 0 2rem;
|
|
259
|
+
background: var(--bg);
|
|
260
|
+
border-bottom: 1px solid var(--border);
|
|
261
|
+
}
|
|
262
|
+
.view-tab {
|
|
263
|
+
font-family: 'Poppins', sans-serif;
|
|
264
|
+
padding: 0.625rem 1.25rem;
|
|
265
|
+
font-size: 0.8125rem;
|
|
266
|
+
font-weight: 500;
|
|
267
|
+
cursor: pointer;
|
|
268
|
+
border: none;
|
|
269
|
+
background: none;
|
|
270
|
+
color: var(--text-muted);
|
|
271
|
+
border-bottom: 2px solid transparent;
|
|
272
|
+
transition: all 0.15s;
|
|
273
|
+
}
|
|
274
|
+
.view-tab:hover { color: var(--text); }
|
|
275
|
+
.view-tab.active {
|
|
276
|
+
color: var(--accent);
|
|
277
|
+
border-bottom-color: var(--accent);
|
|
278
|
+
}
|
|
279
|
+
.view-panel { display: none; }
|
|
280
|
+
.view-panel.active { display: block; }
|
|
281
|
+
|
|
282
|
+
/* ---- Output files ---- */
|
|
283
|
+
.output-file {
|
|
284
|
+
border: 1px solid var(--border);
|
|
285
|
+
border-radius: var(--radius);
|
|
286
|
+
overflow: hidden;
|
|
287
|
+
margin-top: 1rem;
|
|
288
|
+
}
|
|
289
|
+
.output-file-header {
|
|
290
|
+
padding: 0.5rem 0.75rem;
|
|
291
|
+
font-size: 0.8rem;
|
|
292
|
+
font-weight: 600;
|
|
293
|
+
color: var(--text-muted);
|
|
294
|
+
background: var(--bg);
|
|
295
|
+
border-bottom: 1px solid var(--border);
|
|
296
|
+
font-family: 'SF Mono', monospace;
|
|
297
|
+
}
|
|
298
|
+
.output-file-content {
|
|
299
|
+
padding: 0.75rem;
|
|
300
|
+
overflow-x: auto;
|
|
301
|
+
}
|
|
302
|
+
.output-file-content pre {
|
|
303
|
+
font-size: 0.8125rem;
|
|
304
|
+
line-height: 1.5;
|
|
305
|
+
white-space: pre-wrap;
|
|
306
|
+
word-break: break-word;
|
|
307
|
+
font-family: 'SF Mono', monospace;
|
|
308
|
+
}
|
|
309
|
+
.output-file-content img {
|
|
310
|
+
max-width: 100%;
|
|
311
|
+
border-radius: 4px;
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
/* ---- Collapsible ---- */
|
|
315
|
+
.collapsible-toggle {
|
|
316
|
+
display: flex;
|
|
317
|
+
align-items: center;
|
|
318
|
+
cursor: pointer;
|
|
319
|
+
user-select: none;
|
|
320
|
+
}
|
|
321
|
+
.collapsible-toggle:hover {
|
|
322
|
+
color: var(--accent);
|
|
323
|
+
}
|
|
324
|
+
.collapsible-toggle .arrow {
|
|
325
|
+
margin-right: 0.5rem;
|
|
326
|
+
transition: transform 0.15s;
|
|
327
|
+
font-size: 0.75rem;
|
|
328
|
+
}
|
|
329
|
+
.collapsible-toggle .arrow.open {
|
|
330
|
+
transform: rotate(90deg);
|
|
331
|
+
}
|
|
332
|
+
.collapsible-content {
|
|
333
|
+
display: none;
|
|
334
|
+
}
|
|
335
|
+
.collapsible-content.open {
|
|
336
|
+
display: block;
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
/* ---- Empty state ---- */
|
|
340
|
+
.empty-state {
|
|
341
|
+
color: var(--text-muted);
|
|
342
|
+
font-style: italic;
|
|
343
|
+
text-align: center;
|
|
344
|
+
padding: 2rem;
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
/* ---- Run details ---- */
|
|
348
|
+
.run-details {
|
|
349
|
+
border: 1px solid var(--border);
|
|
350
|
+
border-radius: var(--radius);
|
|
351
|
+
margin-bottom: 1rem;
|
|
352
|
+
overflow: hidden;
|
|
353
|
+
}
|
|
354
|
+
.run-header {
|
|
355
|
+
background: var(--bg);
|
|
356
|
+
padding: 0.75rem 1rem;
|
|
357
|
+
border-bottom: 1px solid var(--border);
|
|
358
|
+
display: flex;
|
|
359
|
+
justify-content: space-between;
|
|
360
|
+
align-items: center;
|
|
361
|
+
}
|
|
362
|
+
.run-header .run-title {
|
|
363
|
+
font-family: 'Poppins', sans-serif;
|
|
364
|
+
font-weight: 500;
|
|
365
|
+
}
|
|
366
|
+
.run-body {
|
|
367
|
+
padding: 1rem;
|
|
368
|
+
}
|
|
369
|
+
</style>
|
|
370
|
+
</head>
|
|
371
|
+
<body>
|
|
372
|
+
<div id="app">
|
|
373
|
+
<div class="header">
|
|
374
|
+
<div>
|
|
375
|
+
<h1>Model Comparison: <span id="skill-name"></span></h1>
|
|
376
|
+
<div class="metadata" id="metadata"></div>
|
|
377
|
+
<div class="note" id="blind-note"></div>
|
|
378
|
+
</div>
|
|
379
|
+
</div>
|
|
380
|
+
|
|
381
|
+
<!-- View tabs -->
|
|
382
|
+
<div class="view-tabs">
|
|
383
|
+
<button class="view-tab active" onclick="switchView('summary')">Summary</button>
|
|
384
|
+
<button class="view-tab" onclick="switchView('details')">Details</button>
|
|
385
|
+
</div>
|
|
386
|
+
|
|
387
|
+
<!-- Summary panel -->
|
|
388
|
+
<div class="view-panel active" id="panel-summary">
|
|
389
|
+
<div class="main">
|
|
390
|
+
<!-- Benchmark summary -->
|
|
391
|
+
<div class="section" id="benchmark-section">
|
|
392
|
+
<div class="section-header">Model Performance Comparison</div>
|
|
393
|
+
<div class="section-body" id="benchmark-body">
|
|
394
|
+
<div class="empty-state">No benchmark data available</div>
|
|
395
|
+
</div>
|
|
396
|
+
</div>
|
|
397
|
+
|
|
398
|
+
<!-- Recommendations -->
|
|
399
|
+
<div class="section" id="recommendations-section" style="display:none;">
|
|
400
|
+
<div class="section-header">Recommendations</div>
|
|
401
|
+
<div class="section-body" id="recommendations-body"></div>
|
|
402
|
+
</div>
|
|
403
|
+
|
|
404
|
+
<!-- Notes -->
|
|
405
|
+
<div class="section" id="notes-section" style="display:none;">
|
|
406
|
+
<div class="section-header">Analysis Notes</div>
|
|
407
|
+
<div class="section-body">
|
|
408
|
+
<ul class="notes-list" id="notes-list"></ul>
|
|
409
|
+
</div>
|
|
410
|
+
</div>
|
|
411
|
+
</div>
|
|
412
|
+
</div>
|
|
413
|
+
|
|
414
|
+
<!-- Details panel -->
|
|
415
|
+
<div class="view-panel" id="panel-details">
|
|
416
|
+
<div class="main">
|
|
417
|
+
<!-- Per-eval breakdown -->
|
|
418
|
+
<div class="section">
|
|
419
|
+
<div class="section-header">Per-Eval Breakdown</div>
|
|
420
|
+
<div class="section-body">
|
|
421
|
+
<div class="eval-grid" id="eval-grid"></div>
|
|
422
|
+
</div>
|
|
423
|
+
</div>
|
|
424
|
+
|
|
425
|
+
<!-- All runs with outputs -->
|
|
426
|
+
<div class="section" id="all-runs-section">
|
|
427
|
+
<div class="section-header">All Runs</div>
|
|
428
|
+
<div class="section-body" id="all-runs-body"></div>
|
|
429
|
+
</div>
|
|
430
|
+
</div>
|
|
431
|
+
</div>
|
|
432
|
+
</div>
|
|
433
|
+
|
|
434
|
+
<script>
|
|
435
|
+
// ---- Embedded data (injected by generate_report.py) ----
|
|
436
|
+
/*__EMBEDDED_DATA__*/
|
|
437
|
+
|
|
438
|
+
// ---- Init ----
|
|
439
|
+
/**
 * Fill in the page header from EMBEDDED_DATA and render every report section.
 *
 * Header fields written:
 *   - #skill-name  : EMBEDDED_DATA.skill_name, or "Unknown" when absent.
 *   - #metadata    : "Models: … | Evals: … | Date: …" built from benchmark.metadata;
 *                    each part is omitted when its source value is missing or empty.
 *   - #blind-note  : benchmark.metadata.note, when present.
 *
 * Afterwards the five section renderers are invoked in a fixed order.
 */
function init() {
  document.getElementById("skill-name").textContent = EMBEDDED_DATA.skill_name || "Unknown";

  // Turn a metadata field into display text. Arrays are comma-joined; a scalar is
  // shown as-is; null/undefined/empty values yield "" so the caller can skip them.
  // (Previously a non-array value threw on .join and an empty array produced a
  // dangling "Models: " label.)
  const listText = (value) => {
    if (value === undefined || value === null) return "";
    return Array.isArray(value) ? value.join(", ") : String(value);
  };

  const metadata = EMBEDDED_DATA.benchmark && EMBEDDED_DATA.benchmark.metadata;
  if (metadata) {
    const parts = [];

    const modelsText = listText(metadata.models_compared);
    if (modelsText) parts.push("Models: " + modelsText);

    const evalsText = listText(metadata.evals_run);
    if (evalsText) parts.push("Evals: " + evalsText);

    if (metadata.timestamp) parts.push("Date: " + metadata.timestamp);

    document.getElementById("metadata").textContent = parts.join(" | ");

    if (metadata.note) {
      document.getElementById("blind-note").textContent = metadata.note;
    }
  }

  renderBenchmark();
  renderRecommendations();
  renderNotes();
  renderEvalGrid();
  renderAllRuns();
}
|
|
462
|
+
|
|
463
|
+
// ---- View switching ----
|
|
464
|
+
/**
 * Activate the tab and panel belonging to `view`.
 *
 * The panel is the element with id "panel-<view>"; the tab is the ".view-tab"
 * element whose inline onclick attribute is exactly "switchView('<view>')".
 *
 * If either cannot be found the call is a no-op, so an unknown view name never
 * leaves the page with every panel hidden.
 *
 * @param {string} view - View key, e.g. "summary" or "details".
 */
function switchView(view) {
  const panel = document.getElementById("panel-" + view);

  // Compare the attribute value as a plain string instead of interpolating
  // `view` into a CSS selector, which breaks on quotes/brackets in the name.
  const expectedHandler = "switchView('" + view + "')";
  const tabs = Array.from(document.querySelectorAll(".view-tab"));
  const tab = tabs.find((candidate) => candidate.getAttribute("onclick") === expectedHandler);

  // Resolve both targets before touching any classes: bailing out here keeps
  // the current selection intact.
  if (!panel || !tab) return;

  tabs.forEach((t) => t.classList.remove("active"));
  document.querySelectorAll(".view-panel").forEach((p) => p.classList.remove("active"));
  tab.classList.add("active");
  panel.classList.add("active");
}
|
|
470
|
+
|
|
471
|
+
// ---- Safe number formatting ----
|
|
472
|
+
/**
 * Format a numeric value with a fixed number of decimals, or "—" when there is
 * no usable number.
 *
 * Accepts numbers and non-blank numeric strings. Anything else (null,
 * undefined, NaN, "", booleans, objects, non-numeric strings) yields "—".
 * The previous guard used the coercing global isNaN, which let values such as
 * "3.5" or true through and then threw on `val.toFixed`.
 *
 * @param {*} val - Value to format.
 * @param {number} decimals - Digits after the decimal point.
 * @param {string} [suffix] - Text appended to the formatted number.
 * @returns {string} Formatted text, or "—" for missing/non-numeric input.
 */
function safeToFixed(val, decimals, suffix) {
  const tail = suffix || "";

  const isNumericCandidate =
    typeof val === "number" || (typeof val === "string" && val.trim() !== "");
  if (!isNumericCandidate) return "—";

  const num = Number(val);
  if (Number.isNaN(num)) return "—";

  return num.toFixed(decimals) + tail;
}
|
|
477
|
+
|
|
478
|
+
/**
 * Format a fraction (0..1) as a whole-number percentage, or "—" when there is
 * no usable number.
 *
 * Accepts numbers and non-blank numeric strings. Anything else yields "—".
 * The previous guard relied on the coercing global isNaN, so "", [] and false
 * were displayed as "0%" and true as "100%" instead of being treated as missing.
 *
 * @param {*} val - Fraction to format (0.5 -> "50%").
 * @returns {string} Percentage text, or "—" for missing/non-numeric input.
 */
function safePercent(val) {
  const isNumericCandidate =
    typeof val === "number" || (typeof val === "string" && val.trim() !== "");
  if (!isNumericCandidate) return "—";

  const fraction = Number(val);
  if (Number.isNaN(fraction)) return "—";

  return (fraction * 100).toFixed(0) + "%";
}
|
|
482
|
+
|
|
483
|
+
// ---- Render benchmark summary ----
|
|
484
|
+
/**
 * Render the per-model summary table into #benchmark-body.
 *
 * Reads EMBEDDED_DATA.benchmark.model_summary (one entry per model, each with
 * optional pass_rate / time_seconds / tokens objects carrying `mean` and, for
 * pass_rate, `stddev`) and, when present, benchmark.comparison.cost_efficiency.
 * Does nothing when there is no benchmark or no models, leaving the existing
 * placeholder content in place.
 *
 * Model names and cost-efficiency values come from the embedded data and are
 * passed through escapeHtml before being placed in markup.
 */
function renderBenchmark() {
  const b = EMBEDDED_DATA.benchmark;
  if (!b || !b.model_summary) return;

  const summary = b.model_summary;
  const models = Object.keys(summary);
  if (models.length === 0) return;

  // summary[model][metric][field], tolerating a missing model entry or metric.
  const statOf = (model, metric, field) => ((summary[model] || {})[metric] || {})[field];

  // Build one <tr>: a label cell, one cell per model, then the comparison cell.
  // Only models that actually have a mean contribute to the comparison values.
  const metricRow = (title, metric, formatCell, higherIsBetter, lowerIsBetter) => {
    const means = [];
    let cells = "";
    for (const m of models) {
      const mean = statOf(m, metric, "mean");
      if (mean !== undefined && mean !== null) means.push(mean);
      cells += `<td>${formatCell(m, mean)}</td>`;
    }
    const comparison = renderComparison(means, higherIsBetter, lowerIsBetter);
    return `<tr><td class="metric-label">${title}</td>${cells}<td>${comparison}</td></tr>`;
  };

  let html = '<table class="summary-table">';
  html += "<thead><tr><th>Metric</th>";
  for (const m of models) {
    const label = m.charAt(0).toUpperCase() + m.slice(1);
    html += `<th>${escapeHtml(label)}</th>`;
  }
  html += "<th>Comparison</th></tr></thead><tbody>";

  // Pass rate: higher is better; a missing stddev is shown as 0.
  html += metricRow(
    "Pass Rate",
    "pass_rate",
    (m, mean) => `${safePercent(mean)} ± ${safePercent(statOf(m, "pass_rate", "stddev") || 0)}`,
    true
  );

  // Time and tokens: lower is better.
  html += metricRow("Time (seconds)", "time_seconds", (m, mean) => safeToFixed(mean, 1, "s"), false, true);
  html += metricRow("Tokens", "tokens", (m, mean) => safeToFixed(mean, 0, ""), false, true);

  // Cost efficiency (optional)
  if (b.comparison && b.comparison.cost_efficiency) {
    const ce = b.comparison.cost_efficiency;
    html += '<tr><td class="metric-label">Cost Efficiency</td>';
    for (const m of models) {
      const value = ce[m];
      // Explicit missing-value check so a legitimate 0 is displayed rather than "—".
      const isMissing = value === undefined || value === null || value === "";
      html += `<td>${isMissing ? "—" : escapeHtml(String(value))}</td>`;
    }
    html += "<td>Higher is better</td></tr>";
  }

  html += "</tbody></table>";
  document.getElementById("benchmark-body").innerHTML = html;
}
|
|
544
|
+
|
|
545
|
+
/**
 * Describe how the first value compares with the second, as a percentage of
 * the second, wrapped in a styled <span>.
 *
 * Only values[0] and values[1] are used. The result is:
 *   - "—" when fewer than two values are given, the second value is 0, or the
 *     ratio is not a finite number;
 *   - an unstyled `<span>0%</span>` when the two values are equal (a tie is
 *     neither an improvement nor a regression);
 *   - otherwise `<span class="positive|negative">±N%</span>`.
 *
 * @param {number[]} values - Metric values; the first is compared to the second.
 * @param {boolean} higherIsBetter - Treat an increase as an improvement.
 * @param {boolean} [lowerIsBetter] - Treat a decrease as an improvement
 *     (consulted only when higherIsBetter is falsy).
 * @returns {string} HTML snippet or "—".
 */
function renderComparison(values, higherIsBetter, lowerIsBetter) {
  if (!values || values.length < 2) return "—";

  const current = values[0];
  const baseline = values[1];
  if (baseline === 0) return "—";

  const diff = current - baseline;
  const ratio = (diff / baseline) * 100;
  if (!Number.isFinite(ratio)) return "—";

  if (diff === 0) return "<span>0%</span>";

  // toFixed renders small negative values as "-0"; show a plain "0" instead.
  let pct = ratio.toFixed(0);
  if (pct === "-0") pct = "0";

  const improved = higherIsBetter ? diff > 0 : (lowerIsBetter ? diff < 0 : diff > 0);
  const cls = improved ? "positive" : "negative";
  const sign = diff > 0 ? "+" : "";
  return `<span class="${cls}">${sign}${pct}%</span>`;
}
|
|
555
|
+
|
|
556
|
+
// ---- Render recommendations ----
|
|
557
|
+
/**
 * Render EMBEDDED_DATA.benchmark.recommendations into #recommendations-body and
 * reveal #recommendations-section.
 *
 * Does nothing (section stays hidden) when there are no recommendations.
 * Each recommendation contributes a scenario line, a model tag and a reason.
 * All three come from the embedded data, so they are passed through escapeHtml
 * before being placed in markup; the model name is additionally reduced to a
 * safe token before being used inside the class attribute.
 */
function renderRecommendations() {
  const b = EMBEDDED_DATA.benchmark;
  if (!b || !b.recommendations || b.recommendations.length === 0) return;

  const section = document.getElementById("recommendations-section");
  const body = document.getElementById("recommendations-body");

  // Missing fields render as empty text rather than the literal "undefined".
  const textOf = (value) => (value === undefined || value === null ? "" : String(value));

  let html = '<p style="font-size: 0.8rem; color: var(--text-muted); margin-bottom: 1rem;">These recommendations are derived from actual evaluation results, not pre-conceived assumptions.</p>';

  for (const rec of b.recommendations) {
    if (!rec) continue;

    // A recommendation without a model previously threw on .toUpperCase().
    const model = textOf(rec.recommended_model) || "unknown";
    // Attribute context: keep only characters that are safe in a class token.
    const modelClass = "model-" + model.replace(/[^A-Za-z0-9_-]/g, "-");

    html += `<div class="recommendation">`;
    html += `<div class="scenario">${escapeHtml(textOf(rec.scenario))}</div>`;
    html += `<span class="model ${modelClass}">${escapeHtml(model.toUpperCase())}</span>`;
    html += `<span class="reason">${escapeHtml(textOf(rec.reason))}</span>`;
    html += `</div>`;
  }

  body.innerHTML = html;
  section.style.display = "block";
}
|
|
578
|
+
|
|
579
|
+
// ---- Render notes ----
|
|
580
|
+
/**
 * Fill #notes-list with one <li> per entry of EMBEDDED_DATA.benchmark.notes and
 * reveal #notes-section. Each note is passed through escapeHtml before being
 * placed in markup. When there is no benchmark or no notes the function
 * returns early and the section keeps its current (hidden) state.
 */
function renderNotes() {
  const benchmark = EMBEDDED_DATA.benchmark;
  const notes = benchmark && benchmark.notes;
  if (!notes || notes.length === 0) return;

  const notesSection = document.getElementById("notes-section");
  const notesList = document.getElementById("notes-list");

  const items = [];
  for (const entry of notes) {
    items.push(`<li>${escapeHtml(entry)}</li>`);
  }

  notesList.innerHTML = items.join("");
  notesSection.style.display = "block";
}
|
|
595
|
+
|
|
596
|
+
// ---- Render per-eval grid ----
|
|
597
|
+
/**
 * Render the per-eval comparison grid into the "eval-grid" element.
 *
 * Runs come from `EMBEDDED_DATA.benchmark.runs` when a benchmark is present,
 * otherwise from `EMBEDDED_DATA.runs`. Runs are grouped by `eval_id`
 * (missing ids fall into group 0); each group becomes one `.eval-card`
 * listing every model's pass rate and, when available, time and tokens.
 * When there are no runs an empty-state placeholder is rendered instead.
 *
 * All data-derived strings are HTML-escaped before being concatenated into
 * the markup assigned to `innerHTML`.
 */
function renderEvalGrid() {
  const b = EMBEDDED_DATA.benchmark;
  const runs = b ? b.runs : EMBEDDED_DATA.runs;
  if (!runs || runs.length === 0) {
    document.getElementById("eval-grid").innerHTML = '<div class="empty-state">No eval runs found</div>';
    return;
  }

  // Escape for element text; String() first so numbers are rendered rather
  // than dropped by the escaper's falsy check.
  const text = (value) => escapeHtml(value == null ? "" : String(value));
  // Escape for a quoted attribute value (quotes neutralised as well).
  const attr = (value) => text(value).replace(/"/g, "&quot;").replace(/'/g, "&#39;");

  // Group by eval_id. A null-prototype object is used so ids such as
  // "constructor" or "toString" cannot collide with inherited properties.
  const evalGroups = Object.create(null);
  for (const run of runs) {
    const key = run.eval_id != null ? run.eval_id : 0;
    if (!evalGroups[key]) {
      evalGroups[key] = { eval_id: key, eval_name: run.eval_name || `Eval ${key}`, runs: [] };
    }
    evalGroups[key].runs.push(run);
  }

  // Numeric ids sort ascending and come first; non-numeric ids keep their
  // relative order after them (a bare `a - b` yields NaN for such keys).
  const isNumericKey = (key) => key.trim() !== "" && Number.isFinite(Number(key));
  const compareKeys = (left, right) => {
    const leftNumeric = isNumericKey(left);
    const rightNumeric = isNumericKey(right);
    if (leftNumeric && rightNumeric) return Number(left) - Number(right);
    if (leftNumeric) return -1;
    if (rightNumeric) return 1;
    return 0;
  };

  let html = "";
  for (const key of Object.keys(evalGroups).sort(compareKeys)) {
    const group = evalGroups[key];
    html += `<div class="eval-card">`;
    html += `<div class="eval-card-header">${text(group.eval_name)}</div>`;
    html += `<div class="eval-card-body">`;

    // Sort runs by model; String() guards against non-string model values.
    group.runs.sort((a, c) => String(a.model || "").localeCompare(String(c.model || "")));

    for (const run of group.runs) {
      const model = String(run.model || "unknown");
      const result = run.result || {};
      const grading = run.grading || {};
      const summary = grading.summary || {};

      // Prefer the aggregated result's pass rate, then the grading summary.
      const pr = result.pass_rate != null ? result.pass_rate : (summary.pass_rate || 0);
      const prClass = pr >= 0.8 ? "high" : pr >= 0.5 ? "medium" : "low";

      html += `<div class="model-result">`;
      html += `<div class="model-name">`;
      html += `<span class="model-badge model-${attr(model)}">${text(model)}</span>`;
      html += `</div>`;
      html += `<div class="stats">`;
      html += `<span class="pass-rate ${prClass}">${safePercent(pr)}</span>`;
      // Only format real numbers; toFixed does not exist on other types.
      if (typeof result.time_seconds === "number" && Number.isFinite(result.time_seconds)) {
        html += `<span>${result.time_seconds.toFixed(1)}s</span>`;
      }
      if (result.tokens != null) {
        html += `<span>${text(result.tokens)} tok</span>`;
      }
      html += `</div>`;
      html += `</div>`;
    }

    html += `</div></div>`;
  }

  document.getElementById("eval-grid").innerHTML = html;
}
|
|
655
|
+
|
|
656
|
+
// ---- Render all runs ----
|
|
657
|
+
/**
 * Render the detailed list of every run into the "all-runs-body" element.
 *
 * For each entry of `EMBEDDED_DATA.runs` this emits a `.run-details` card
 * with the run id, model badge, prompt, pass rate (when the grading summary
 * has one), timing line, a collapsible list of graded assertions and a
 * collapsible list of output files. When there are no runs an empty-state
 * placeholder is rendered instead.
 *
 * All data-derived strings are HTML-escaped before being concatenated into
 * the markup assigned to `innerHTML`; values placed inside quoted attributes
 * additionally have their quotes escaped, and `data_uri` values are only
 * used when they really are `data:` URIs.
 */
function renderAllRuns() {
  const runs = EMBEDDED_DATA.runs;
  if (!runs || runs.length === 0) {
    document.getElementById("all-runs-body").innerHTML = '<div class="empty-state">No runs found</div>';
    return;
  }

  // Escape for element text; String() first so numbers (including 0) are
  // rendered rather than dropped by the escaper's falsy check.
  const text = (value) => escapeHtml(value == null ? "" : String(value));
  // Escape for a quoted attribute value (quotes neutralised as well).
  const attr = (value) => text(value).replace(/"/g, "&quot;").replace(/'/g, "&#39;");
  // Only embedded data: URIs may reach src/href; this keeps schemes such as
  // javascript: out of the generated links.
  const isDataUri = (uri) => typeof uri === "string" && /^data:/i.test(uri);

  let html = "";
  for (const run of runs) {
    const model = String(run.model || "unknown");
    const grading = run.grading || {};
    const summary = grading.summary || {};

    html += `<div class="run-details">`;
    html += `<div class="run-header">`;
    html += `<span class="run-title">Run: ${text(run.id || run.run_id || "unknown")}</span>`;
    html += `<span class="model-badge model-${attr(model)}">${text(model)}</span>`;
    html += `</div>`;
    html += `<div class="run-body">`;

    // Prompt
    html += `<p style="margin-bottom: 0.5rem;"><strong>Prompt:</strong> ${text(run.prompt || "N/A")}</p>`;

    // Pass rate
    if (summary.pass_rate != null) {
      const prClass = summary.pass_rate >= 0.8 ? "high" : summary.pass_rate >= 0.5 ? "medium" : "low";
      const total = summary.total || summary.total_expectations || 0;
      html += `<p style="font-size: 0.9rem;">Pass Rate: <span class="pass-rate ${prClass}" style="font-weight: 600;">${safePercent(summary.pass_rate)}</span> (${text(summary.passed || 0)}/${text(total)})</p>`;
    }

    // Timing: collect the available parts and join them, so a lone token
    // count is not followed by a dangling separator.
    if (run.timing) {
      const timingParts = [];
      if (run.timing.total_tokens) {
        timingParts.push(`Tokens: ${text(run.timing.total_tokens)}`);
      }
      const duration = run.timing.total_duration_seconds;
      if (duration && typeof duration === "number" && Number.isFinite(duration)) {
        timingParts.push(`Time: ${duration.toFixed(1)}s`);
      }
      html += `<p style="font-size: 0.8rem; color: var(--text-muted); margin-top: 0.5rem;">`;
      html += timingParts.join(" | ");
      html += `</p>`;
    }

    // Assertions (collapsible)
    if (grading.expectations && grading.expectations.length > 0) {
      const assertTotal = summary.total || summary.total_expectations || grading.expectations.length;
      html += `<div class="collapsible-toggle" onclick="toggleCollapsible(this)" style="margin-top: 1rem;">`;
      html += `<span class="arrow">▶</span> Assertions (${text(summary.passed || 0)}/${text(assertTotal)})`;
      html += `</div>`;
      html += `<div class="collapsible-content">`;
      html += `<ul class="assertion-list">`;
      for (const exp of grading.expectations) {
        const statusClass = exp.passed ? "pass" : "fail";
        const statusIcon = exp.passed ? "✓" : "✗";
        const expText = exp.text || exp.description || "";
        html += `<li class="assertion-item">`;
        html += `<span class="assertion-status ${statusClass}">${statusIcon}</span>`;
        html += `<span>${text(expText)}</span>`;
        if (exp.evidence) {
          html += `<div class="assertion-evidence">${text(exp.evidence)}</div>`;
        }
        html += `</li>`;
      }
      html += `</ul></div>`;
    }

    // Outputs (collapsible)
    if (run.outputs && run.outputs.length > 0) {
      html += `<div class="collapsible-toggle" onclick="toggleCollapsible(this)" style="margin-top: 0.5rem;">`;
      html += `<span class="arrow">▶</span> Outputs (${text(run.outputs.length)} files)`;
      html += `</div>`;
      html += `<div class="collapsible-content">`;
      for (const file of run.outputs) {
        html += `<div class="output-file">`;
        html += `<div class="output-file-header">${text(file.name)}</div>`;
        html += `<div class="output-file-content">`;
        if (file.type === "text") {
          html += `<pre>${text(file.content || "")}</pre>`;
        } else if (file.type === "image" && isDataUri(file.data_uri)) {
          html += `<img src="${attr(file.data_uri)}" alt="${attr(file.name)}">`;
        } else if (file.type === "binary" && isDataUri(file.data_uri)) {
          html += `<a href="${attr(file.data_uri)}" download="${attr(file.name)}">Download ${text(file.name)}</a>`;
        } else {
          // Unknown type, or no usable embedded payload.
          html += `<span style="color: var(--text-muted);">Binary file</span>`;
        }
        html += `</div></div>`;
      }
      html += `</div>`;
    }

    html += `</div></div>`;
  }

  document.getElementById("all-runs-body").innerHTML = html;
}
|
|
747
|
+
|
|
748
|
+
/**
 * Click handler for a collapsible header: flips the "open" class on the
 * header's ".arrow" child and on the element that immediately follows the
 * header. Either target may be absent, in which case it is skipped.
 *
 * @param {Element} el - The clicked `.collapsible-toggle` element.
 */
function toggleCollapsible(el) {
  const targets = [el.querySelector(".arrow"), el.nextElementSibling];
  for (const node of targets) {
    if (!node) {
      continue;
    }
    node.classList.toggle("open");
  }
}
|
|
754
|
+
|
|
755
|
+
// ---- Util ----
|
|
756
|
+
/**
 * Escape a value for safe insertion into HTML markup.
 *
 * The five characters that are significant in HTML text and quoted
 * attribute values (`&`, `<`, `>`, `"`, `'`) are replaced by entities, so
 * the result is safe both between tags and inside a quoted attribute.
 * The escaping is done with plain string replacement, so it does not
 * depend on a DOM being available.
 *
 * @param {*} text - Value to escape; non-strings are converted with String().
 * @returns {string} The escaped text, or "" for null, undefined and "".
 */
function escapeHtml(text) {
  // Only "no value" collapses to the empty string; 0 and false are real
  // values and are rendered as "0" / "false".
  if (text == null) return "";
  return String(text)
    // Ampersand first, otherwise the entities produced below would be
    // escaped a second time.
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
|
|
762
|
+
|
|
763
|
+
// ---- Start ----
|
|
764
|
+
// Entry point of the inline script: invoke init(), which is not defined in
// this part of the file (presumably declared earlier in the same <script>).
init();
|
|
765
|
+
</script>
|
|
766
|
+
</body>
|
|
767
|
+
</html>
|