astron-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +119 -0
  3. package/bin/astron-eval.mjs +111 -0
  4. package/package.json +24 -0
  5. package/skills/astron-eval/SKILL.md +60 -0
  6. package/skills/model-evaluation/SKILL.md +180 -0
  7. package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
  8. package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/262/276/347/241/256/347/273/264/345/272/246.json +19 -0
  9. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  10. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  11. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/346/227/205/346/270/270/345/207/272/350/241/214.json +20 -0
  12. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246.json +20 -0
  13. package/skills/model-evaluation/assets/dimensions//345/210/233/346/204/217/346/200/247-/345/220/270/345/274/225/346/200/247/347/273/264/345/272/246.json +21 -0
  14. package/skills/model-evaluation/assets/dimensions//345/210/233/346/226/260/346/200/247/347/273/264/345/272/246.json +20 -0
  15. package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  16. package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246.json +20 -0
  17. package/skills/model-evaluation/assets/dimensions//345/275/242/345/274/217/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
  18. package/skills/model-evaluation/assets/dimensions//345/277/240/350/257/232/345/272/246/347/273/264/345/272/246.json +20 -0
  19. package/skills/model-evaluation/assets/dimensions//346/214/207/344/273/244/351/201/265/345/276/252/347/273/264/345/272/246.json +20 -0
  20. package/skills/model-evaluation/assets/dimensions//346/226/207/346/234/254/345/267/256/345/274/202/345/272/246-TER/347/273/264/345/272/246.json +20 -0
  21. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  22. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  23. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
  24. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246.json +21 -0
  25. package/skills/model-evaluation/assets/dimensions//346/240/270/345/277/203/345/205/203/347/264/240/347/273/264/345/272/246.json +20 -0
  26. package/skills/model-evaluation/assets/dimensions//346/240/274/345/274/217/351/201/265/345/276/252/347/273/264/345/272/246.json +19 -0
  27. package/skills/model-evaluation/assets/dimensions//347/211/271/350/211/262/344/272/256/347/202/271/347/273/264/345/272/246.json +20 -0
  28. package/skills/model-evaluation/assets/dimensions//347/224/250/344/276/213/347/272/247/350/257/204/346/265/213/347/273/264/345/272/246/346/250/241/346/235/277.json +25 -0
  29. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-BERTScore/347/273/264/345/272/246.json +20 -0
  30. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-Cosine/347/273/264/345/272/246.json +20 -0
  31. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-ROUGE/347/273/264/345/272/246.json +20 -0
  32. package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  33. package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +21 -0
  34. package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-BLUE/347/273/264/345/272/246.json +20 -0
  35. package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-COMET/347/273/264/345/272/246.json +20 -0
  36. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/345/220/210/347/220/206/346/200/247/347/273/264/345/272/246.json +20 -0
  37. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  38. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  39. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
  40. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246.json +21 -0
  41. package/skills/model-evaluation/assets/eval-judge.json +11 -0
  42. package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
  43. package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
  44. package/skills/model-evaluation/assets/experts/content-match.json +37 -0
  45. package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
  46. package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
  47. package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
  48. package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
  49. package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
  50. package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
  51. package/skills/model-evaluation/eval-build.md +281 -0
  52. package/skills/model-evaluation/eval-execute.md +196 -0
  53. package/skills/model-evaluation/eval-init.md +237 -0
  54. package/skills/model-evaluation/processes/dimension-process.md +207 -0
  55. package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
  56. package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
  57. package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
  58. package/skills/model-evaluation/processes/keypoint-process.md +148 -0
  59. package/skills/model-evaluation/processes/python-env-process.md +113 -0
  60. package/skills/model-evaluation/references/中间产物说明.md +340 -0
  61. package/skills/model-evaluation/references/内置模板说明.md +149 -0
  62. package/skills/model-evaluation/references/脚本定义.md +274 -0
  63. package/skills/model-evaluation/references/认证服务接口说明.md +271 -0
  64. package/skills/model-evaluation/references/评测服务接口说明.md +455 -0
  65. package/skills/model-evaluation/references/评测维度说明.md +171 -0
  66. package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
  67. package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
  68. package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
  69. package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
  70. package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
  71. package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
  72. package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
  73. package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
  74. package/skills/model-evaluation/scripts/eval_auth.py +588 -0
  75. package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
  76. package/skills/model-evaluation/scripts/eval_set.py +410 -0
  77. package/skills/model-evaluation/scripts/eval_task.py +324 -0
  78. package/skills/model-evaluation/scripts/files/__init__.py +38 -0
  79. package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
  80. package/skills/model-evaluation/scripts/files/streaming.py +245 -0
  81. package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
  82. package/skills/model-evaluation/scripts/utils/constants.py +101 -0
  83. package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
  84. package/skills/model-evaluation/scripts/utils/errors.py +244 -0
  85. package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
  86. package/skills/skill-driven-eval/SKILL.md +456 -0
  87. package/skills/skill-driven-eval/agents/grader.md +144 -0
  88. package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
  89. package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
  90. package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
  91. package/skills/skill-driven-eval/references/schemas.md +282 -0
  92. package/skills/skill-driven-eval/scripts/__init__.py +1 -0
  93. package/skills/skill-driven-eval/scripts/__main__.py +70 -0
  94. package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
  95. package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
  96. package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
@@ -0,0 +1,767 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Model Comparison Report</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com">
8
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
+ <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;600&family=Lora:wght@400;500&display=swap" rel="stylesheet">
10
+ <style>
11
+ :root {
12
+ --bg: #faf9f5;
13
+ --surface: #ffffff;
14
+ --border: #e8e6dc;
15
+ --text: #141413;
16
+ --text-muted: #b0aea5;
17
+ --accent: #d97757;
18
+ --accent-hover: #c4613f;
19
+ --green: #788c5d;
20
+ --green-bg: #eef2e8;
21
+ --red: #c44;
22
+ --red-bg: #fceaea;
23
+ --header-bg: #141413;
24
+ --header-text: #faf9f5;
25
+ --radius: 6px;
26
+ --opus-color: #1976d2;
27
+ --opus-bg: rgba(25, 118, 210, 0.1);
28
+ --sonnet-color: #f57f17;
29
+ --sonnet-bg: rgba(245, 127, 23, 0.1);
30
+ --haiku-color: #2e7d32;
31
+ --haiku-bg: rgba(46, 125, 50, 0.1);
32
+ }
33
+
34
+ * { box-sizing: border-box; margin: 0; padding: 0; }
35
+
36
+ body {
37
+ font-family: 'Lora', Georgia, serif;
38
+ background: var(--bg);
39
+ color: var(--text);
40
+ min-height: 100vh;
41
+ }
42
+
43
+ /* ---- Header ---- */
44
+ .header {
45
+ background: var(--header-bg);
46
+ color: var(--header-text);
47
+ padding: 1rem 2rem;
48
+ display: flex;
49
+ justify-content: space-between;
50
+ align-items: center;
51
+ }
52
+ .header h1 {
53
+ font-family: 'Poppins', sans-serif;
54
+ font-size: 1.25rem;
55
+ font-weight: 600;
56
+ }
57
+ .header .metadata {
58
+ font-size: 0.8rem;
59
+ opacity: 0.7;
60
+ margin-top: 0.25rem;
61
+ }
62
+ .header .note {
63
+ font-size: 0.75rem;
64
+ opacity: 0.6;
65
+ font-style: italic;
66
+ margin-top: 0.5rem;
67
+ }
68
+
69
+ /* ---- Main content ---- */
70
+ .main {
71
+ padding: 1.5rem 2rem;
72
+ max-width: 1200px;
73
+ margin: 0 auto;
74
+ }
75
+
76
+ /* ---- Sections ---- */
77
+ .section {
78
+ background: var(--surface);
79
+ border: 1px solid var(--border);
80
+ border-radius: var(--radius);
81
+ margin-bottom: 1.5rem;
82
+ }
83
+ .section-header {
84
+ font-family: 'Poppins', sans-serif;
85
+ padding: 0.75rem 1rem;
86
+ font-size: 0.75rem;
87
+ font-weight: 500;
88
+ text-transform: uppercase;
89
+ letter-spacing: 0.05em;
90
+ color: var(--text-muted);
91
+ border-bottom: 1px solid var(--border);
92
+ background: var(--bg);
93
+ }
94
+ .section-body {
95
+ padding: 1rem;
96
+ }
97
+
98
+ /* ---- Model badge ---- */
99
+ .model-badge {
100
+ display: inline-block;
101
+ padding: 0.2rem 0.625rem;
102
+ border-radius: 9999px;
103
+ font-family: 'Poppins', sans-serif;
104
+ font-size: 0.6875rem;
105
+ font-weight: 600;
106
+ text-transform: uppercase;
107
+ letter-spacing: 0.03em;
108
+ margin-left: 0.5rem;
109
+ vertical-align: middle;
110
+ }
111
+ .model-badge.model-opus { background: var(--opus-bg); color: var(--opus-color); }
112
+ .model-badge.model-sonnet { background: var(--sonnet-bg); color: var(--sonnet-color); }
113
+ .model-badge.model-haiku { background: var(--haiku-bg); color: var(--haiku-color); }
114
+
115
+ /* ---- Summary Table ---- */
116
+ .summary-table {
117
+ width: 100%;
118
+ border-collapse: collapse;
119
+ font-size: 0.875rem;
120
+ }
121
+ .summary-table th, .summary-table td {
122
+ padding: 0.75rem 1rem;
123
+ text-align: left;
124
+ border-bottom: 1px solid var(--border);
125
+ }
126
+ .summary-table th {
127
+ font-family: 'Poppins', sans-serif;
128
+ font-weight: 500;
129
+ background: var(--bg);
130
+ }
131
+ .summary-table tr:last-child td {
132
+ border-bottom: none;
133
+ }
134
+ .summary-table .metric-label {
135
+ font-weight: 600;
136
+ }
137
+ .summary-table .positive { color: var(--green); font-weight: 600; }
138
+ .summary-table .negative { color: var(--red); font-weight: 600; }
139
+
140
+ /* ---- Per-eval grid ---- */
141
+ .eval-grid {
142
+ display: grid;
143
+ grid-template-columns: repeat(auto-fit, minmax(350px, 1fr));
144
+ gap: 1rem;
145
+ }
146
+ .eval-card {
147
+ background: var(--surface);
148
+ border: 1px solid var(--border);
149
+ border-radius: var(--radius);
150
+ overflow: hidden;
151
+ }
152
+ .eval-card-header {
153
+ background: var(--bg);
154
+ padding: 0.75rem 1rem;
155
+ border-bottom: 1px solid var(--border);
156
+ font-family: 'Poppins', sans-serif;
157
+ font-weight: 500;
158
+ font-size: 0.875rem;
159
+ }
160
+ .eval-card-body {
161
+ padding: 1rem;
162
+ }
163
+ .model-result {
164
+ display: flex;
165
+ justify-content: space-between;
166
+ align-items: center;
167
+ padding: 0.5rem 0;
168
+ border-bottom: 1px solid var(--border);
169
+ }
170
+ .model-result:last-child {
171
+ border-bottom: none;
172
+ }
173
+ .model-result .model-name {
174
+ display: flex;
175
+ align-items: center;
176
+ }
177
+ .model-result .stats {
178
+ display: flex;
179
+ gap: 1rem;
180
+ font-size: 0.8rem;
181
+ color: var(--text-muted);
182
+ }
183
+ .model-result .pass-rate {
184
+ font-weight: 600;
185
+ }
186
+ .model-result .pass-rate.high { color: var(--green); }
187
+ .model-result .pass-rate.medium { color: var(--sonnet-color); }
188
+ .model-result .pass-rate.low { color: var(--red); }
189
+
190
+ /* ---- Assertions ---- */
191
+ .assertion-list {
192
+ list-style: none;
193
+ font-size: 0.8125rem;
194
+ }
195
+ .assertion-item {
196
+ padding: 0.5rem 0;
197
+ border-bottom: 1px solid var(--border);
198
+ display: flex;
199
+ align-items: flex-start;
200
+ gap: 0.5rem;
201
+ }
202
+ .assertion-item:last-child { border-bottom: none; }
203
+ .assertion-status {
204
+ font-weight: 600;
205
+ flex-shrink: 0;
206
+ }
207
+ .assertion-status.pass { color: var(--green); }
208
+ .assertion-status.fail { color: var(--red); }
209
+ .assertion-evidence {
210
+ color: var(--text-muted);
211
+ font-size: 0.75rem;
212
+ margin-top: 0.25rem;
213
+ padding-left: 1.5rem;
214
+ }
215
+
216
+ /* ---- Recommendations ---- */
217
+ .recommendation {
218
+ background: var(--bg);
219
+ border: 1px solid var(--border);
220
+ border-radius: var(--radius);
221
+ padding: 1rem;
222
+ margin-bottom: 0.75rem;
223
+ }
224
+ .recommendation .scenario {
225
+ font-family: 'Poppins', sans-serif;
226
+ font-weight: 600;
227
+ font-size: 0.875rem;
228
+ margin-bottom: 0.25rem;
229
+ }
230
+ .recommendation .model {
231
+ display: inline-block;
232
+ padding: 0.125rem 0.5rem;
233
+ border-radius: 4px;
234
+ font-size: 0.75rem;
235
+ font-weight: 600;
236
+ margin-right: 0.5rem;
237
+ }
238
+ .recommendation .reason {
239
+ font-size: 0.8125rem;
240
+ color: var(--text-muted);
241
+ }
242
+
243
+ /* ---- Notes ---- */
244
+ .notes-list {
245
+ list-style: disc;
246
+ padding-left: 1.25rem;
247
+ font-size: 0.875rem;
248
+ }
249
+ .notes-list li {
250
+ margin-bottom: 0.5rem;
251
+ line-height: 1.5;
252
+ }
253
+
254
+ /* ---- View tabs ---- */
255
+ .view-tabs {
256
+ display: flex;
257
+ gap: 0;
258
+ padding: 0 2rem;
259
+ background: var(--bg);
260
+ border-bottom: 1px solid var(--border);
261
+ }
262
+ .view-tab {
263
+ font-family: 'Poppins', sans-serif;
264
+ padding: 0.625rem 1.25rem;
265
+ font-size: 0.8125rem;
266
+ font-weight: 500;
267
+ cursor: pointer;
268
+ border: none;
269
+ background: none;
270
+ color: var(--text-muted);
271
+ border-bottom: 2px solid transparent;
272
+ transition: all 0.15s;
273
+ }
274
+ .view-tab:hover { color: var(--text); }
275
+ .view-tab.active {
276
+ color: var(--accent);
277
+ border-bottom-color: var(--accent);
278
+ }
279
+ .view-panel { display: none; }
280
+ .view-panel.active { display: block; }
281
+
282
+ /* ---- Output files ---- */
283
+ .output-file {
284
+ border: 1px solid var(--border);
285
+ border-radius: var(--radius);
286
+ overflow: hidden;
287
+ margin-top: 1rem;
288
+ }
289
+ .output-file-header {
290
+ padding: 0.5rem 0.75rem;
291
+ font-size: 0.8rem;
292
+ font-weight: 600;
293
+ color: var(--text-muted);
294
+ background: var(--bg);
295
+ border-bottom: 1px solid var(--border);
296
+ font-family: 'SF Mono', monospace;
297
+ }
298
+ .output-file-content {
299
+ padding: 0.75rem;
300
+ overflow-x: auto;
301
+ }
302
+ .output-file-content pre {
303
+ font-size: 0.8125rem;
304
+ line-height: 1.5;
305
+ white-space: pre-wrap;
306
+ word-break: break-word;
307
+ font-family: 'SF Mono', monospace;
308
+ }
309
+ .output-file-content img {
310
+ max-width: 100%;
311
+ border-radius: 4px;
312
+ }
313
+
314
+ /* ---- Collapsible ---- */
315
+ .collapsible-toggle {
316
+ display: flex;
317
+ align-items: center;
318
+ cursor: pointer;
319
+ user-select: none;
320
+ }
321
+ .collapsible-toggle:hover {
322
+ color: var(--accent);
323
+ }
324
+ .collapsible-toggle .arrow {
325
+ margin-right: 0.5rem;
326
+ transition: transform 0.15s;
327
+ font-size: 0.75rem;
328
+ }
329
+ .collapsible-toggle .arrow.open {
330
+ transform: rotate(90deg);
331
+ }
332
+ .collapsible-content {
333
+ display: none;
334
+ }
335
+ .collapsible-content.open {
336
+ display: block;
337
+ }
338
+
339
+ /* ---- Empty state ---- */
340
+ .empty-state {
341
+ color: var(--text-muted);
342
+ font-style: italic;
343
+ text-align: center;
344
+ padding: 2rem;
345
+ }
346
+
347
+ /* ---- Run details ---- */
348
+ .run-details {
349
+ border: 1px solid var(--border);
350
+ border-radius: var(--radius);
351
+ margin-bottom: 1rem;
352
+ overflow: hidden;
353
+ }
354
+ .run-header {
355
+ background: var(--bg);
356
+ padding: 0.75rem 1rem;
357
+ border-bottom: 1px solid var(--border);
358
+ display: flex;
359
+ justify-content: space-between;
360
+ align-items: center;
361
+ }
362
+ .run-header .run-title {
363
+ font-family: 'Poppins', sans-serif;
364
+ font-weight: 500;
365
+ }
366
+ .run-body {
367
+ padding: 1rem;
368
+ }
369
+ </style>
370
+ </head>
371
+ <body>
372
+ <div id="app">
373
+ <div class="header">
374
+ <div>
375
+ <h1>Model Comparison: <span id="skill-name"></span></h1>
376
+ <div class="metadata" id="metadata"></div>
377
+ <div class="note" id="blind-note"></div>
378
+ </div>
379
+ </div>
380
+
381
+ <!-- View tabs -->
382
+ <div class="view-tabs">
383
+ <button class="view-tab active" onclick="switchView('summary')">Summary</button>
384
+ <button class="view-tab" onclick="switchView('details')">Details</button>
385
+ </div>
386
+
387
+ <!-- Summary panel -->
388
+ <div class="view-panel active" id="panel-summary">
389
+ <div class="main">
390
+ <!-- Benchmark summary -->
391
+ <div class="section" id="benchmark-section">
392
+ <div class="section-header">Model Performance Comparison</div>
393
+ <div class="section-body" id="benchmark-body">
394
+ <div class="empty-state">No benchmark data available</div>
395
+ </div>
396
+ </div>
397
+
398
+ <!-- Recommendations -->
399
+ <div class="section" id="recommendations-section" style="display:none;">
400
+ <div class="section-header">Recommendations</div>
401
+ <div class="section-body" id="recommendations-body"></div>
402
+ </div>
403
+
404
+ <!-- Notes -->
405
+ <div class="section" id="notes-section" style="display:none;">
406
+ <div class="section-header">Analysis Notes</div>
407
+ <div class="section-body">
408
+ <ul class="notes-list" id="notes-list"></ul>
409
+ </div>
410
+ </div>
411
+ </div>
412
+ </div>
413
+
414
+ <!-- Details panel -->
415
+ <div class="view-panel" id="panel-details">
416
+ <div class="main">
417
+ <!-- Per-eval breakdown -->
418
+ <div class="section">
419
+ <div class="section-header">Per-Eval Breakdown</div>
420
+ <div class="section-body">
421
+ <div class="eval-grid" id="eval-grid"></div>
422
+ </div>
423
+ </div>
424
+
425
+ <!-- All runs with outputs -->
426
+ <div class="section" id="all-runs-section">
427
+ <div class="section-header">All Runs</div>
428
+ <div class="section-body" id="all-runs-body"></div>
429
+ </div>
430
+ </div>
431
+ </div>
432
+ </div>
433
+
434
+ <script>
435
+ // ---- Embedded data (injected by generate_report.py) ----
436
+ /*__EMBEDDED_DATA__*/
437
+
438
+ // ---- Init ----
439
/**
 * Entry point: fills in the page header from EMBEDDED_DATA and then renders
 * every section of the report.
 *
 * Reads `EMBEDDED_DATA.skill_name` and, when present,
 * `EMBEDDED_DATA.benchmark.metadata` (`models_compared`, `evals_run`,
 * `timestamp`, `note`).
 */
function init() {
    const titleEl = document.getElementById("skill-name");
    titleEl.textContent = EMBEDDED_DATA.skill_name || "Unknown";

    const meta = EMBEDDED_DATA.benchmark && EMBEDDED_DATA.benchmark.metadata;
    if (meta) {
        // Each fragment is included only when its source field is truthy.
        const fragments = [];
        if (meta.models_compared) {
            fragments.push("Models: " + meta.models_compared.join(", "));
        }
        if (meta.evals_run) {
            fragments.push("Evals: " + meta.evals_run.join(", "));
        }
        if (meta.timestamp) {
            fragments.push("Date: " + meta.timestamp);
        }
        document.getElementById("metadata").textContent = fragments.join(" | ");

        if (meta.note) {
            document.getElementById("blind-note").textContent = meta.note;
        }
    }

    // Summary-panel sections first, then the details-panel sections.
    renderBenchmark();
    renderRecommendations();
    renderNotes();
    renderEvalGrid();
    renderAllRuns();
}
462
+
463
+ // ---- View switching ----
464
/**
 * Activate one top-level view ("summary" or "details") and deactivate the rest.
 *
 * @param {string} view - Suffix of the panel id (`panel-<view>`); it must also
 *     match the tab button's inline `onclick="switchView('<view>')"` attribute,
 *     because that attribute is how the button is located.
 */
function switchView(view) {
    // Clear the current selection on every tab and panel first.
    for (const selector of [".view-tab", ".view-panel"]) {
        document.querySelectorAll(selector).forEach((node) => {
            node.classList.remove("active");
        });
    }

    const tabButton = document.querySelector(`[onclick="switchView('${view}')"]`);
    tabButton.classList.add("active");

    const panel = document.getElementById("panel-" + view);
    panel.classList.add("active");
}
470
+
471
+ // ---- Safe number formatting ----
472
/**
 * Format a number with a fixed count of decimals, tolerating missing or
 * malformed input.
 *
 * @param {*} val - Value to format. Numbers are used as-is; non-empty numeric
 *     strings (e.g. "1.5") are converted first. Anything else — undefined,
 *     null, NaN, booleans, objects, non-numeric strings — yields the
 *     placeholder instead of throwing.
 * @param {number} decimals - Digits after the decimal point (passed to toFixed).
 * @param {string} [suffix] - Text appended after the number; defaults to "".
 * @returns {string} The formatted number plus suffix, or "—" when there is no
 *     usable number.
 */
function safeToFixed(val, decimals, suffix) {
    let num = val;
    // JSON produced by other tools may carry numbers as strings; the previous
    // isNaN() guard let those through and then crashed on `.toFixed`.
    if (typeof val === "string" && val.trim() !== "") {
        num = Number(val);
    }
    if (typeof num !== "number" || Number.isNaN(num)) return "—";
    return num.toFixed(decimals) + (suffix || "");
}
477
+
478
/**
 * Format a 0..1 ratio as a whole-number percentage.
 *
 * @param {*} val - Ratio to format (0.5 becomes "50%").
 * @returns {string} The percentage, or "—" when `val` is undefined, null, or
 *     not convertible to a number.
 */
function safePercent(val) {
    // `== null` covers both undefined and null; the global isNaN() is kept on
    // purpose because it also rejects non-numeric strings.
    const missing = val == null || isNaN(val);
    return missing ? "—" : `${(val * 100).toFixed(0)}%`;
}
482
+
483
+ // ---- Render benchmark summary ----
484
/**
 * Render the "Model Performance Comparison" table into `#benchmark-body`.
 *
 * Reads `EMBEDDED_DATA.benchmark.model_summary` (an object keyed by model
 * name whose values hold `pass_rate`, `time_seconds` and `tokens` stats with a
 * `mean`, plus `stddev` for the pass rate) and, optionally,
 * `EMBEDDED_DATA.benchmark.comparison.cost_efficiency` keyed by model name.
 * Leaves the existing placeholder untouched when there is no summary.
 *
 * The "Comparison" column is computed by renderComparison() from the means of
 * the models that reported the metric, in column order; models without a value
 * are skipped for that purpose.
 */
function renderBenchmark() {
    const b = EMBEDDED_DATA.benchmark;
    if (!b || !b.model_summary) return;

    const summary = b.model_summary;
    const models = Object.keys(summary);
    if (models.length === 0) return;

    // Stats object for one model/metric; never null so callers can read fields.
    const statsFor = (model, metric) => (summary[model] || {})[metric] || {};

    // Build one <tr>: label cell, one cell per model, then the comparison cell.
    const metricRow = (label, metric, formatCell, higherIsBetter) => {
        const means = [];
        let cells = "";
        for (const m of models) {
            const stats = statsFor(m, metric);
            if (stats.mean !== undefined && stats.mean !== null) means.push(stats.mean);
            cells += `<td>${formatCell(stats)}</td>`;
        }
        const comparison = renderComparison(means, higherIsBetter, !higherIsBetter);
        return `<tr><td class="metric-label">${label}</td>${cells}<td>${comparison}</td></tr>`;
    };

    let html = '<table class="summary-table">';
    html += "<thead><tr><th>Metric</th>";
    for (const m of models) {
        const label = m.charAt(0).toUpperCase() + m.slice(1);
        // Model names come from report data, so they are escaped before
        // being placed into markup.
        html += `<th>${escapeHtml(label)}</th>`;
    }
    html += "<th>Comparison</th></tr></thead><tbody>";

    html += metricRow(
        "Pass Rate",
        "pass_rate",
        (stats) => `${safePercent(stats.mean)} ± ${safePercent(stats.stddev || 0)}`,
        true
    );
    html += metricRow("Time (seconds)", "time_seconds", (stats) => safeToFixed(stats.mean, 1, "s"), false);
    html += metricRow("Tokens", "tokens", (stats) => safeToFixed(stats.mean, 0, ""), false);

    // Cost efficiency
    if (b.comparison && b.comparison.cost_efficiency) {
        const ce = b.comparison.cost_efficiency;
        html += '<tr><td class="metric-label">Cost Efficiency</td>';
        for (const m of models) {
            // Only a missing value becomes the placeholder; a real 0 is shown.
            const cell = ce[m] !== undefined && ce[m] !== null ? escapeHtml(String(ce[m])) : "—";
            html += `<td>${cell}</td>`;
        }
        html += "<td>Higher is better</td></tr>";
    }

    html += "</tbody></table>";
    document.getElementById("benchmark-body").innerHTML = html;
}
544
+
545
/**
 * Describe how the first value compares with the second as a signed
 * percentage, wrapped in a span coloured by whether the change is good.
 *
 * @param {number[]} values - Metric values; only `values[0]` and `values[1]`
 *     are used, and the percentage is relative to `values[1]`.
 * @param {boolean} higherIsBetter - When truthy, an increase is "positive".
 * @param {boolean} [lowerIsBetter] - When truthy (and `higherIsBetter` is
 *     falsy), a decrease is "positive".
 * @returns {string} `<span class="positive|negative">±N%</span>`, the plain
 *     text "0%" when the difference rounds to zero, or "—" when the comparison
 *     is impossible (fewer than two values, zero baseline, non-numeric data).
 */
function renderComparison(values, higherIsBetter, lowerIsBetter) {
    if (!values || values.length < 2) return "—";
    const baseline = values[1];
    if (baseline === 0) return "—";

    const diff = values[0] - baseline;
    const pctValue = (diff / baseline) * 100;
    if (!Number.isFinite(pctValue)) return "—";

    const pct = pctValue.toFixed(0);
    // A tie, or a difference too small to show, is neither good nor bad:
    // render it neutrally instead of as a red "0%" or a "-0%".
    if (Number(pct) === 0) return "0%";

    const decreaseIsGood = !higherIsBetter && Boolean(lowerIsBetter);
    const isPositive = decreaseIsGood ? diff < 0 : diff > 0;
    const cls = isPositive ? "positive" : "negative";
    const sign = diff > 0 ? "+" : "";
    return `<span class="${cls}">${sign}${pct}%</span>`;
}
555
+
556
+ // ---- Render recommendations ----
557
/**
 * Render `EMBEDDED_DATA.benchmark.recommendations` into
 * `#recommendations-body` and reveal `#recommendations-section`.
 *
 * Each recommendation is read for `scenario`, `recommended_model` and
 * `reason`. The section stays hidden when there are no recommendations.
 *
 * The cards are built as DOM nodes with `textContent` rather than as an HTML
 * string, so report text containing markup is shown literally and cannot
 * inject elements or attributes. A missing `recommended_model` is displayed
 * as "UNKNOWN" instead of throwing.
 */
function renderRecommendations() {
    const b = EMBEDDED_DATA.benchmark;
    if (!b || !b.recommendations || b.recommendations.length === 0) return;

    const section = document.getElementById("recommendations-section");
    const body = document.getElementById("recommendations-body");

    // Create an element with a class and literal (never parsed) text.
    const makeEl = (tag, className, text) => {
        const el = document.createElement(tag);
        if (className) el.className = className;
        el.textContent = text === undefined || text === null ? "" : String(text);
        return el;
    };

    // Drop anything rendered previously before appending fresh content.
    body.textContent = "";

    const intro = makeEl(
        "p",
        "",
        "These recommendations are derived from actual evaluation results, not pre-conceived assumptions."
    );
    intro.style.cssText = "font-size: 0.8rem; color: var(--text-muted); margin-bottom: 1rem;";
    body.appendChild(intro);

    for (const rec of b.recommendations) {
        if (!rec) continue;
        const model =
            rec.recommended_model === undefined || rec.recommended_model === null
                ? "unknown"
                : String(rec.recommended_model);

        const card = makeEl("div", "recommendation", "");
        card.appendChild(makeEl("div", "scenario", rec.scenario));
        card.appendChild(makeEl("span", "model model-" + model, model.toUpperCase()));
        card.appendChild(makeEl("span", "reason", rec.reason));
        body.appendChild(card);
    }

    section.style.display = "block";
}
578
+
579
+ // ---- Render notes ----
580
/**
 * Render `EMBEDDED_DATA.benchmark.notes` as list items in `#notes-list` and
 * reveal `#notes-section`. Does nothing when there are no notes.
 */
function renderNotes() {
    const benchmark = EMBEDDED_DATA.benchmark;
    const notes = benchmark && benchmark.notes;
    if (!notes || notes.length === 0) return;

    const section = document.getElementById("notes-section");
    const list = document.getElementById("notes-list");

    // Every note is escaped before being wrapped in its <li>.
    const items = Array.from(notes, (note) => `<li>${escapeHtml(note)}</li>`);

    list.innerHTML = items.join("");
    section.style.display = "block";
}
595
+
596
+ // ---- Render per-eval grid ----
597
+ function renderEvalGrid() {
598
+ const b = EMBEDDED_DATA.benchmark;
599
+ const runs = b ? b.runs : EMBEDDED_DATA.runs;
600
+ if (!runs || runs.length === 0) {
601
+ document.getElementById("eval-grid").innerHTML = '<div class="empty-state">No eval runs found</div>';
602
+ return;
603
+ }
604
+
605
+ // Group by eval_id
606
+ const evalGroups = {};
607
+ for (const run of runs) {
608
+ const key = run.eval_id != null ? run.eval_id : 0;
609
+ if (!evalGroups[key]) {
610
+ evalGroups[key] = { eval_id: key, eval_name: run.eval_name || `Eval ${key}`, runs: [] };
611
+ }
612
+ evalGroups[key].runs.push(run);
613
+ }
614
+
615
+ let html = "";
616
+ for (const key of Object.keys(evalGroups).sort((a, b) => a - b)) {
617
+ const group = evalGroups[key];
618
+ html += `<div class="eval-card">`;
619
+ html += `<div class="eval-card-header">${group.eval_name}</div>`;
620
+ html += `<div class="eval-card-body">`;
621
+
622
+ // Sort runs by model
623
+ group.runs.sort((a, b) => (a.model || "").localeCompare(b.model || ""));
624
+
625
+ for (const run of group.runs) {
626
+ const model = run.model || "unknown";
627
+ const result = run.result || {};
628
+ const grading = run.grading || {};
629
+ const summary = grading.summary || {};
630
+
631
+ const pr = result.pass_rate != null ? result.pass_rate : (summary.pass_rate || 0);
632
+ const prClass = pr >= 0.8 ? "high" : pr >= 0.5 ? "medium" : "low";
633
+
634
+ html += `<div class="model-result">`;
635
+ html += `<div class="model-name">`;
636
+ html += `<span class="model-badge model-${model}">${model}</span>`;
637
+ html += `</div>`;
638
+ html += `<div class="stats">`;
639
+ html += `<span class="pass-rate ${prClass}">${safePercent(pr)}</span>`;
640
+ if (result.time_seconds != null) {
641
+ html += `<span>${result.time_seconds.toFixed(1)}s</span>`;
642
+ }
643
+ if (result.tokens != null) {
644
+ html += `<span>${result.tokens} tok</span>`;
645
+ }
646
+ html += `</div>`;
647
+ html += `</div>`;
648
+ }
649
+
650
+ html += `</div></div>`;
651
+ }
652
+
653
+ document.getElementById("eval-grid").innerHTML = html;
654
+ }
655
+
656
+ // ---- Render all runs ----
657
/**
 * Render every entry of `EMBEDDED_DATA.runs` into `#all-runs-body`: run id and
 * model badge, prompt, pass rate, timing, a collapsible list of graded
 * assertions and a collapsible list of output files.
 *
 * Each run is read for `id`/`run_id`, `model`, `prompt`,
 * `grading.summary` (`pass_rate`, `passed`, `total`/`total_expectations`),
 * `grading.expectations` (`passed`, `text`/`description`, `evidence`),
 * `timing` (`total_tokens`, `total_duration_seconds`) and `outputs`
 * (`name`, `type`, `content`, `data_uri`).
 *
 * Every report-supplied value is escaped for the context it lands in (text,
 * quoted attribute, or class token), and embedded files are only linked when
 * their `data_uri` really is a `data:` URI.
 */
function renderAllRuns() {
    const runs = EMBEDDED_DATA.runs;
    if (!runs || runs.length === 0) {
        document.getElementById("all-runs-body").innerHTML = '<div class="empty-state">No runs found</div>';
        return;
    }

    // Escape for element content; missing values render as nothing.
    const asText = (value) => (value === undefined || value === null ? "" : escapeHtml(String(value)));
    // Escape for a double-quoted attribute value. Quotes are handled here
    // explicitly so this does not depend on escapeHtml() escaping them.
    const asAttr = (value) => asText(value).replace(/"/g, "&quot;").replace(/'/g, "&#39;");
    // Keep only characters that cannot terminate or extend a class attribute.
    const classToken = (value) => String(value).replace(/[^A-Za-z0-9_-]/g, "-");
    // Embedded files are expected as data: URIs; anything else (for example a
    // javascript: URL) is never placed into src/href.
    const isDataUri = (value) => typeof value === "string" && /^data:/i.test(value);

    let html = "";
    for (const run of runs) {
        const model = run.model || "unknown";
        const grading = run.grading || {};
        const summary = grading.summary || {};

        html += `<div class="run-details">`;
        html += `<div class="run-header">`;
        html += `<span class="run-title">Run: ${asText(run.id || run.run_id || "unknown")}</span>`;
        html += `<span class="model-badge model-${classToken(model)}">${asText(model)}</span>`;
        html += `</div>`;
        html += `<div class="run-body">`;

        // Prompt
        html += `<p style="margin-bottom: 0.5rem;"><strong>Prompt:</strong> ${escapeHtml(run.prompt || "N/A")}</p>`;

        // Pass rate
        if (summary.pass_rate != null) {
            const prClass = summary.pass_rate >= 0.8 ? "high" : summary.pass_rate >= 0.5 ? "medium" : "low";
            const total = summary.total || summary.total_expectations || 0;
            html += `<p style="font-size: 0.9rem;">Pass Rate: <span class="pass-rate ${prClass}" style="font-weight: 600;">${safePercent(summary.pass_rate)}</span> (${asText(summary.passed || 0)}/${asText(total)})</p>`;
        }

        // Timing
        if (run.timing) {
            const parts = [];
            if (run.timing.total_tokens) {
                parts.push(`Tokens: ${asText(run.timing.total_tokens)}`);
            }
            const duration = run.timing.total_duration_seconds;
            // Non-numeric durations are skipped rather than crashing `.toFixed`.
            if (typeof duration === "number" && Number.isFinite(duration) && duration !== 0) {
                parts.push(`Time: ${duration.toFixed(1)}s`);
            }
            html += `<p style="font-size: 0.8rem; color: var(--text-muted); margin-top: 0.5rem;">`;
            // Joining avoids a dangling " | " when only one part is present.
            html += parts.join(" | ");
            html += `</p>`;
        }

        // Assertions (collapsible)
        if (grading.expectations && grading.expectations.length > 0) {
            const assertTotal = summary.total || summary.total_expectations || grading.expectations.length;
            html += `<div class="collapsible-toggle" onclick="toggleCollapsible(this)" style="margin-top: 1rem;">`;
            html += `<span class="arrow">&#9654;</span> Assertions (${asText(summary.passed || 0)}/${asText(assertTotal)})`;
            html += `</div>`;
            html += `<div class="collapsible-content">`;
            html += `<ul class="assertion-list">`;
            for (const exp of grading.expectations) {
                const statusClass = exp.passed ? "pass" : "fail";
                const statusIcon = exp.passed ? "✓" : "✗";
                const expText = exp.text || exp.description || "";
                html += `<li class="assertion-item">`;
                html += `<span class="assertion-status ${statusClass}">${statusIcon}</span>`;
                html += `<span>${asText(expText)}</span>`;
                if (exp.evidence) {
                    html += `<div class="assertion-evidence">${asText(exp.evidence)}</div>`;
                }
                html += `</li>`;
            }
            html += `</ul></div>`;
        }

        // Outputs (collapsible)
        if (run.outputs && run.outputs.length > 0) {
            html += `<div class="collapsible-toggle" onclick="toggleCollapsible(this)" style="margin-top: 0.5rem;">`;
            html += `<span class="arrow">&#9654;</span> Outputs (${run.outputs.length} files)`;
            html += `</div>`;
            html += `<div class="collapsible-content">`;
            for (const entry of run.outputs) {
                const file = entry || {};
                html += `<div class="output-file">`;
                html += `<div class="output-file-header">${asText(file.name)}</div>`;
                html += `<div class="output-file-content">`;
                if (file.type === "text") {
                    html += `<pre>${asText(file.content || "")}</pre>`;
                } else if (file.type === "image" && isDataUri(file.data_uri)) {
                    html += `<img src="${asAttr(file.data_uri)}" alt="${asAttr(file.name)}">`;
                } else if (file.type === "binary" && isDataUri(file.data_uri)) {
                    html += `<a href="${asAttr(file.data_uri)}" download="${asAttr(file.name)}">Download ${asText(file.name)}</a>`;
                } else {
                    html += `<span style="color: var(--text-muted);">Binary file</span>`;
                }
                html += `</div></div>`;
            }
            html += `</div>`;
        }

        html += `</div></div>`;
    }

    document.getElementById("all-runs-body").innerHTML = html;
}
747
+
748
/**
 * Expand or collapse the block that follows a collapsible header.
 *
 * @param {Element} el - The clicked `.collapsible-toggle` element. Its
 *     `.arrow` child and its next sibling element each get the `open` class
 *     toggled; either one may be absent.
 */
function toggleCollapsible(el) {
    const targets = [el.querySelector(".arrow"), el.nextElementSibling];
    for (const node of targets) {
        if (node) {
            node.classList.toggle("open");
        }
    }
}
754
+
755
+ // ---- Util ----
756
/**
 * Escape a value so it can be inserted into HTML as element content or
 * inside a quoted attribute value.
 *
 * Unlike a `textContent`/`innerHTML` round-trip, this also escapes `"` and
 * `'`; callers in this file place the result inside quoted attributes such as
 * `alt="…"` and `download="…"`, where an unescaped quote would end the
 * attribute.
 *
 * @param {*} text - Value to escape; non-strings are converted with String(),
 *     so numbers such as 0 are preserved.
 * @returns {string} The escaped text, or "" for undefined/null.
 */
function escapeHtml(text) {
    if (text === undefined || text === null) return "";
    return String(text)
        // `&` must go first so the entities produced below are not re-escaped.
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&#39;");
}
762
+
763
+ // ---- Start ----
764
+ init();
765
+ </script>
766
+ </body>
767
+ </html>