limbo-ai 1.25.0 → 1.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ {
2
+ "name": "get-file",
3
+ "description": "User asks to retrieve a stored file by its note ID (tests vault_get_file)",
4
+ "input": "Mostrame el archivo de la nota eval-seed-logo",
5
+ "assertions": [
6
+ { "type": "tool_called", "tool": "vault_get_file" },
7
+ { "type": "response_matches", "pattern": "(?i)(logo|imagen|image|png|archivo)" }
8
+ ],
9
+ "difficulty": "easy",
10
+ "runs": 1,
11
+ "pass_threshold": 1.0,
12
+ "tags": ["tool-calling", "vault_get_file", "file-tools"]
13
+ }
@@ -0,0 +1,14 @@
1
+ {
2
+ "name": "medium-file-retrieval-implicit",
3
+ "description": "User asks for a file without specifying the note ID — must search first, then get file",
4
+ "input": "Tenés guardado algún logo? Mostrámelo",
5
+ "assertions": [
6
+ { "type": "tool_called", "tool": "vault_search" },
7
+ { "type": "tool_called", "tool": "vault_get_file" },
8
+ { "type": "response_matches", "pattern": "(?i)(logo|imagen|image|encontr)" }
9
+ ],
10
+ "difficulty": "medium",
11
+ "runs": 1,
12
+ "pass_threshold": 1.0,
13
+ "tags": ["multi-tool", "vault_search", "vault_get_file", "file-tools"]
14
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "name": "store-file-awareness",
3
+ "description": "Tests that the model knows vault_store_file exists and explains how to use it when asked",
4
+ "input": "Puedo guardar archivos con vos? Como funciona?",
5
+ "assertions": [
6
+ { "type": "response_matches", "pattern": "(?i)(archivo|file|imagen|image|pdf|document)" },
7
+ { "type": "response_matches", "pattern": "(?i)(guardar|almacenar|store|vault|mandar|enviar)" }
8
+ ],
9
+ "difficulty": "easy",
10
+ "runs": 1,
11
+ "pass_threshold": 1.0,
12
+ "tags": ["awareness", "vault_store_file", "file-tools"]
13
+ }
@@ -0,0 +1,626 @@
1
+ // === Limbo Eval Dashboard ===
2
+ // Note: All data comes from local eval JSON files served by our own Node server.
3
+ // Dynamic text is inserted via textContent (see createEl), so it is never parsed as HTML.
4
+
5
// Shared dashboard state. init() fills it in once; the render functions read it.
const state = {
  latest: null, // result of api('latest')
  baseline: null, // result of api('baseline')
  cases: [], // result of api('cases')
  history: [], // result of api('history')
  // Case name -> case definition. Declared here (init() rebuilds it) so that
  // every field the renderers read is visible in one place and a lookup made
  // before init() finishes yields undefined instead of throwing.
  caseMap: {},
  currentView: 'overview',
};
12
+
13
+ // === API ===
14
/**
 * Fetch and parse JSON from the dashboard API.
 *
 * @param {string} path - Path appended to `/api/` (for example `latest` or `run/<id>`).
 * @returns {Promise<*>} The parsed JSON body of the response.
 * @throws {Error} `API <path>: <status>` when the response is not OK.
 */
async function api(path) {
  const response = await fetch(`/api/${path}`);
  if (response.ok) {
    return response.json();
  }
  throw new Error(`API ${path}: ${response.status}`);
}
19
+
20
+ // === Init ===
21
/**
 * Load all dashboard data in parallel, store it in `state`, and render every view.
 *
 * @returns {Promise<void>} Rejects if any of the four API requests fails.
 */
async function init() {
  const [latest, baseline, cases, history] = await Promise.all([
    api('latest'),
    api('baseline'),
    api('cases'),
    api('history'),
  ]);

  state.latest = latest;
  state.baseline = baseline;
  // The renderers iterate these unconditionally (forEach / .length), so fall
  // back to empty lists when the server answers with null or a non-array.
  state.cases = Array.isArray(cases) ? cases : [];
  state.history = Array.isArray(history) ? history : [];

  // Build case lookup. A prototype-less object keeps a case named e.g.
  // "constructor" or "toString" from resolving to an Object.prototype member.
  state.caseMap = Object.create(null);
  state.cases.forEach(c => { state.caseMap[c.name] = c; });

  setupNav();
  renderOverview();
  renderHistory();
  renderCompareSelectors();
  // Deliberately not awaited: renderRubrics() fetches its own data and reports
  // its own failures, so it must not block or fail the rest of the dashboard.
  renderRubrics();
  setupFilters();
}
45
+
46
+ // === Navigation ===
47
/**
 * Wire every `.nav-tab` so that clicking it makes that tab and the element
 * `#view-<data-view>` the only active ones, and records the view in state.
 */
function setupNav() {
  // Strip the `active` class from everything matching the selector.
  const deactivateAll = (selector) => {
    for (const node of document.querySelectorAll(selector)) {
      node.classList.remove('active');
    }
  };

  for (const tab of document.querySelectorAll('.nav-tab')) {
    tab.addEventListener('click', () => {
      deactivateAll('.nav-tab');
      deactivateAll('.view');
      tab.classList.add('active');

      const { view } = tab.dataset;
      document.getElementById(`view-${view}`).classList.add('active');
      state.currentView = view;
    });
  }
}
59
+
60
/**
 * Programmatically switch to a view.
 *
 * Clears `active` from all tabs and views, activates `#view-<viewId>`, and
 * activates the first element with a matching `data-view` attribute if any.
 *
 * @param {string} viewId - Suffix of the view element id (`view-<viewId>`).
 */
function showView(viewId) {
  for (const selector of ['.nav-tab', '.view']) {
    document.querySelectorAll(selector).forEach((node) => {
      node.classList.remove('active');
    });
  }

  document.getElementById(`view-${viewId}`).classList.add('active');

  // Not every view has a matching tab, so a missing one is tolerated.
  const matchingTab = document.querySelector(`[data-view="${viewId}"]`);
  if (matchingTab) {
    matchingTab.classList.add('active');
  }

  state.currentView = viewId;
}
68
+
69
+ // === Helpers ===
70
/**
 * Format a fraction as a whole-number percentage string.
 *
 * @param {number} n - Fraction to format (1 means 100%).
 * @returns {string} `n * 100` rounded with Math.round, followed by `%`.
 */
function pct(n) {
  const whole = Math.round(n * 100);
  return `${whole}%`;
}
71
/**
 * Classify a pass rate.
 *
 * @param {number} passRate - Fraction of assertions that passed.
 * @returns {'pass'|'partial'|'fail'} `pass` at 1 or above, `partial` above 0, otherwise `fail`.
 */
function statusOf(passRate) {
  if (passRate >= 1) {
    return 'pass';
  }
  if (passRate > 0) {
    return 'partial';
  }
  return 'fail';
}
72
/**
 * Pick the glyph shown next to a result.
 *
 * @param {number} passRate - Fraction of assertions that passed.
 * @returns {string} Check mark at 1 or above, half circle above 0, otherwise a cross.
 */
function statusIcon(passRate) {
  if (passRate >= 1) {
    return '\u2713';
  }
  if (passRate > 0) {
    return '\u25D0';
  }
  return '\u2717';
}
73
+
74
/**
 * Escape a value for safe inclusion in HTML text or a quoted attribute value.
 *
 * Only null and undefined map to the empty string; other falsy values such as
 * 0 or false are stringified rather than dropped. Both quote characters are
 * escaped in addition to `&`, `<` and `>`. Works without touching the DOM.
 *
 * @param {*} str - Value to escape; converted with String().
 * @returns {string} The escaped text.
 */
function escapeHtml(str) {
  if (str === null || str === undefined) return '';
  const replacements = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return String(str).replace(/[&<>"']/g, (ch) => replacements[ch]);
}
80
+
81
/**
 * Format a timestamp as an en-US date and time, e.g. "Jan 5, 2024 09:07 AM".
 *
 * @param {*} ts - Any value accepted by the Date constructor.
 * @returns {string} The formatted date, or an em dash when `ts` is falsy or
 *   cannot be parsed into a valid date.
 */
function formatDate(ts) {
  if (!ts) return '\u2014';
  const d = new Date(ts);
  // An unparseable value yields an Invalid Date, which would otherwise be
  // rendered as "Invalid Date Invalid Date".
  if (Number.isNaN(d.getTime())) return '\u2014';
  const datePart = d.toLocaleDateString('en-US', { month: 'short', day: 'numeric', year: 'numeric' });
  const timePart = d.toLocaleTimeString('en-US', { hour: '2-digit', minute: '2-digit' });
  return `${datePart} ${timePart}`;
}
87
+
88
/**
 * Format a timestamp compactly in local time as `M/D H:MM` (minutes zero-padded).
 *
 * @param {*} ts - Any value accepted by the Date constructor.
 * @returns {string} The compact label, or an empty string when `ts` is falsy
 *   or cannot be parsed into a valid date.
 */
function shortDate(ts) {
  if (!ts) return '';
  const d = new Date(ts);
  // Without this guard an Invalid Date renders as "NaN/NaN NaN:NaN".
  if (Number.isNaN(d.getTime())) return '';
  const minutes = String(d.getMinutes()).padStart(2, '0');
  return `${d.getMonth() + 1}/${d.getDate()} ${d.getHours()}:${minutes}`;
}
93
+
94
/**
 * Choose the chart bar colour for a pass rate.
 *
 * @param {number} rate - Pass rate as a fraction.
 * @returns {string} CSS variable reference: green from 0.9, yellow from 0.7, red otherwise.
 */
function barColor(rate) {
  // Thresholds are checked from highest to lowest; the first one met wins.
  const tiers = [
    [0.9, 'var(--green)'],
    [0.7, 'var(--yellow)'],
  ];
  const match = tiers.find(([minimum]) => rate >= minimum);
  return match ? match[1] : 'var(--red)';
}
99
+
100
+ // === Safe DOM builders ===
101
+
102
/**
 * Create a DOM element without ever parsing HTML.
 *
 * Attribute handling, by key:
 *  - `className` / `textContent` are assigned as properties;
 *  - keys starting with `on` register a listener for the lower-cased remainder
 *    of the key (e.g. `onClick` -> `click`);
 *  - `style` given as an object is merged into `el.style`;
 *  - anything else goes through `setAttribute`.
 *
 * @param {string} tag - Tag name passed to document.createElement.
 * @param {Object} [attrs] - Attributes, as described above.
 * @param {string|Node|Array} [children] - A string becomes the element's text
 *   content; an array has its truthy entries appended; any other truthy value
 *   is appended as a single child.
 * @returns {Element} The new element.
 */
function createEl(tag, attrs, children) {
  const el = document.createElement(tag);

  for (const [key, value] of Object.entries(attrs || {})) {
    if (key === 'className') {
      el.className = value;
    } else if (key === 'textContent') {
      el.textContent = value;
    } else if (key.startsWith('on')) {
      el.addEventListener(key.slice(2).toLowerCase(), value);
    } else if (key === 'style' && typeof value === 'object') {
      Object.assign(el.style, value);
    } else {
      el.setAttribute(key, value);
    }
  }

  if (!children) {
    return el;
  }

  if (typeof children === 'string') {
    el.textContent = children;
  } else if (Array.isArray(children)) {
    for (const child of children) {
      if (child) {
        el.appendChild(child);
      }
    }
  } else {
    el.appendChild(children);
  }
  return el;
}
120
+
121
/**
 * Build the pill that shows a case difficulty.
 *
 * @param {string} d - Difficulty label; shown as the pill text and used as the
 *   suffix of its `pill-<difficulty>` class.
 * @returns {HTMLSpanElement} A span with classes `pill pill-<difficulty>`.
 */
function difficultyPill(d) {
  const el = document.createElement('span');
  // className is assigned as a DOM property and never parsed as HTML, so
  // HTML-escaping is the wrong tool here: it would turn "&" into a literal
  // "&amp;" while still letting whitespace split the value into extra classes.
  // Collapse whitespace instead so the label always yields one class token.
  const token = d === null || d === undefined ? '' : String(d).trim().replace(/\s+/g, '-');
  el.className = `pill pill-${token}`;
  el.textContent = d;
  return el;
}
127
+
128
/**
 * Build the pill that shows a result status.
 *
 * @param {string} s - Status label; shown as the pill text and, after
 *   escapeHtml(), used as the suffix of its `pill-<status>` class.
 * @returns {HTMLSpanElement} A span with classes `pill pill-<status>`.
 */
function statusPill(s) {
  const pill = document.createElement('span');
  const modifier = escapeHtml(s);
  pill.className = ['pill', `pill-${modifier}`].join(' ');
  pill.textContent = s;
  return pill;
}
134
+
135
/**
 * Build a neutral-coloured pill for a case tag.
 *
 * @param {string} t - Tag text.
 * @returns {HTMLSpanElement} A span with class `pill` and inline colours.
 */
function tagPill(t) {
  const inlineStyle = 'background: var(--cream); border: 1px solid var(--border-light); color: var(--text-gray)';
  const pill = Object.assign(document.createElement('span'), {
    className: 'pill',
    textContent: t,
  });
  pill.style.cssText = inlineStyle;
  return pill;
}
142
+
143
+ // === Overview ===
144
/**
 * Render the Overview view from `state.latest`: the run label, the four
 * summary cards, the per-difficulty breakdown and the result rows.
 *
 * When there is no latest run the whole view is replaced by an empty-state
 * message instead.
 */
function renderOverview() {
  const run = state.latest;
  if (!run) {
    document.getElementById('view-overview').replaceChildren(
      createEl('div', { className: 'empty-state' }, [
        createEl('h2', {}, 'No results yet'),
        createEl('p', {}, 'Run node evals/cli.js run to generate results.'),
      ])
    );
    return;
  }

  document.getElementById('overview-run-id').textContent =
    `${run.id} \u2014 ${formatDate(run.timestamp)}`;

  // Stats
  const caseCount = run.results.length;
  const totalPassed = run.results.reduce((s, r) => s + r.passed, 0);
  const totalAssertions = run.results.reduce((s, r) => s + r.total, 0);
  const overallRate = totalAssertions ? totalPassed / totalAssertions : 0;
  const casesFullPass = run.results.filter(r => r.passRate >= 1).length;
  const casesFailed = run.results.filter(r => r.passRate < 1).length;
  // Guarded like overallRate so a run with zero cases shows "0%", not "NaN%".
  const fullPassShare = caseCount ? casesFullPass / caseCount : 0;

  document.getElementById('overview-stats').replaceChildren(
    buildStatCard('Pass Rate', pct(overallRate), statusOf(overallRate), `${totalPassed}/${totalAssertions} assertions`),
    buildStatCard('Cases', String(caseCount), 'neutral', `${casesFullPass} passed, ${casesFailed} with failures`),
    buildStatCard('Full Pass', String(casesFullPass), 'pass', `${pct(fullPassShare)} of cases`),
    buildStatCard('With Failures', String(casesFailed), casesFailed ? 'fail' : 'pass', casesFailed ? 'needs attention' : 'all clean'),
  );

  // Difficulty breakdown. A Map keeps arbitrary difficulty strings from
  // colliding with Object.prototype members.
  const byDiff = new Map();
  run.results.forEach(r => {
    const caseDef = state.caseMap[r.case];
    const diff = caseDef ? caseDef.difficulty : 'unknown';
    if (!byDiff.has(diff)) byDiff.set(diff, { passed: 0, total: 0, cases: 0 });
    const bucket = byDiff.get(diff);
    bucket.passed += r.passed;
    bucket.total += r.total;
    bucket.cases++;
  });

  // Only these three difficulties are displayed, in this order; results in any
  // other bucket (including 'unknown') are counted above but not shown here.
  const diffOrder = ['easy', 'medium', 'hard'];
  const cards = diffOrder.filter(d => byDiff.has(d)).map(d => {
    const b = byDiff.get(d);
    const rate = b.total ? b.passed / b.total : 0;

    const label = createEl('span', { className: 'difficulty-label' });
    label.appendChild(difficultyPill(d));
    label.appendChild(document.createTextNode(` ${b.cases} cases`));

    const header = createEl('div', { className: 'difficulty-header' });
    header.appendChild(label);
    header.appendChild(createEl('span', { className: `difficulty-rate ${statusOf(rate)}`, textContent: pct(rate) }));

    const fill = createEl('div', { className: `progress-fill ${statusOf(rate)}` });
    fill.style.width = `${rate * 100}%`;
    const bar = createEl('div', { className: 'progress-bar' });
    bar.appendChild(fill);

    const card = createEl('div', { className: 'difficulty-card' });
    card.appendChild(header);
    card.appendChild(bar);
    return card;
  });
  document.getElementById('overview-difficulty').replaceChildren(...cards);

  renderResultsList(run.results);
}
210
+
211
/**
 * Build one summary card consisting of a label, a value and a sub-line.
 *
 * @param {string} label - Card heading.
 * @param {string} value - Main figure.
 * @param {string} status - Extra class appended to `stat-value` for colouring.
 * @param {string} sub - Small text under the value.
 * @returns {Element} A `div.stat-card` with three child divs.
 */
function buildStatCard(label, value, status, sub) {
  const lines = [
    ['stat-label', label],
    [`stat-value ${status}`, value],
    ['stat-sub', sub],
  ];

  const card = createEl('div', { className: 'stat-card' });
  lines.forEach(([className, text]) => {
    card.appendChild(createEl('div', { className }, text));
  });
  return card;
}
218
+
219
/**
 * Replace the contents of `#overview-results` with one clickable row per result.
 *
 * Each row shows the status icon, the case name and description, difficulty,
 * the passed/total score and a status pill; clicking it opens the case detail.
 *
 * @param {Array} results - Result entries (`case`, `passRate`, `passed`, `total`).
 */
function renderResultsList(results) {
  const container = document.getElementById('overview-results');

  const buildRow = (entry) => {
    const definition = state.caseMap[entry.case];
    const status = statusOf(entry.passRate);
    // Cases missing from the lookup get a "?" difficulty and no description.
    const difficulty = definition ? definition.difficulty : '?';
    const description = definition ? definition.description : '';

    const row = createEl('div', { className: 'result-row', 'data-case': entry.case });
    row.appendChild(createEl('div', { className: `result-icon ${status}` }, statusIcon(entry.passRate)));

    const info = createEl('div');
    info.appendChild(createEl('div', { className: 'result-name' }, entry.case));
    info.appendChild(createEl('div', { className: 'result-desc' }, description));
    row.appendChild(info);

    row.appendChild(difficultyPill(difficulty));
    row.appendChild(createEl('div', { className: `result-score ${status}` }, `${entry.passed}/${entry.total}`));
    row.appendChild(statusPill(status));
    row.addEventListener('click', () => showResultDetail(entry.case));
    return row;
  };

  const rows = results.map(buildRow);
  container.replaceChildren(...rows);
}
239
+
240
+ // === Filters ===
241
/**
 * Wire the difficulty, status and search controls of the overview.
 *
 * Any change re-filters `state.latest.results` and re-renders the list. An
 * empty control value means "no filter". Search is a case-insensitive
 * substring match against the case name or its description.
 */
function setupFilters() {
  const diffFilter = document.getElementById('filter-difficulty');
  const statusFilter = document.getElementById('filter-status');
  const searchFilter = document.getElementById('filter-search');

  const applyFilters = () => {
    if (!state.latest) return;

    const diff = diffFilter.value;
    const status = statusFilter.value;
    const search = searchFilter.value.toLowerCase();

    const results = state.latest.results.filter(r => {
      const c = state.caseMap[r.case];

      // A result without a known case definition never matches a difficulty filter.
      if (diff && !(c && c.difficulty === diff)) return false;
      if (status && statusOf(r.passRate) !== status) return false;

      if (search) {
        const inName = r.case.toLowerCase().includes(search);
        // The renderers tolerate a case definition without a description, so
        // the search must not throw on one either.
        const inDescription = Boolean(c) &&
          typeof c.description === 'string' &&
          c.description.toLowerCase().includes(search);
        if (!inName && !inDescription) return false;
      }
      return true;
    });

    renderResultsList(results);
  };

  diffFilter.addEventListener('change', applyFilters);
  statusFilter.addEventListener('change', applyFilters);
  searchFilter.addEventListener('input', applyFilters);
}
279
+
280
+ // === Detail View ===
281
/**
 * Render the detail page for one case of a run and switch to the results view.
 *
 * Sections, in order: header, assertions, test input, LLM response, vault
 * changes, judge evaluation and the raw case definition. Sections whose data
 * is absent from the result or the case lookup are skipped.
 *
 * @param {string} caseName - Name of the case to show.
 * @param {Object} [runData] - Run to read the result from; defaults to
 *   `state.latest`. When given, the back button returns to that run's detail
 *   page instead of the overview.
 */
function showResultDetail(caseName, runData) {
  const run = runData || state.latest;
  if (!run) return;
  const result = run.results.find(r => r.case === caseName);
  if (!result) return;

  const caseDef = state.caseMap[caseName];
  const status = statusOf(result.passRate);
  const container = document.getElementById('result-detail');
  container.replaceChildren();

  // Starts a titled section; the caller fills it and appends it to the page.
  const newSection = (title) => {
    const section = createEl('div', { className: 'detail-section' });
    section.appendChild(createEl('h3', {}, title));
    return section;
  };

  // One pass/fail line: icon on the left, title and reason on the right.
  // Returns the info column too so callers can add extra lines to it.
  const checkRow = (passed, title, reason) => {
    const row = createEl('div', { className: 'assertion-row' });
    row.appendChild(createEl('div', { className: `assertion-icon ${passed ? 'pass' : 'fail'}` }, passed ? '\u2713' : '\u2717'));
    const info = createEl('div');
    info.appendChild(createEl('div', { className: 'assertion-type' }, title));
    info.appendChild(createEl('div', { className: 'assertion-reason' }, reason));
    row.appendChild(info);
    return { row, info };
  };

  // Back button. A case opened from a historical run goes back to that run
  // rather than dropping the user on the overview of the latest run.
  const openedFromRun = Boolean(runData);
  const backBtn = createEl('button', { className: 'detail-back' },
    openedFromRun ? '\u2190 Back to run' : '\u2190 Back to overview');
  backBtn.addEventListener('click', () => {
    if (openedFromRun) showRunDetail(runData);
    else showView('overview');
  });
  container.appendChild(backBtn);

  // Header
  const header = createEl('div', { className: 'detail-header' });
  const headerLeft = createEl('div');
  headerLeft.appendChild(createEl('h1', { className: 'detail-title' }, caseName));
  headerLeft.appendChild(createEl('p', { style: { color: 'var(--text-gray)', marginTop: '4px' } }, caseDef ? caseDef.description : ''));
  const meta = createEl('div', { className: 'detail-meta' });
  if (caseDef) meta.appendChild(difficultyPill(caseDef.difficulty));
  meta.appendChild(statusPill(status));
  if (caseDef && caseDef.tags) caseDef.tags.forEach(t => meta.appendChild(tagPill(t)));
  headerLeft.appendChild(meta);
  header.appendChild(headerLeft);
  const bigScore = createEl('div', { className: `stat-value ${status}` }, pct(result.passRate));
  bigScore.style.fontSize = '48px';
  header.appendChild(bigScore);
  container.appendChild(header);

  // Assertions (a result without scoreResults simply lists none)
  const assertSection = newSection(`Assertions (${result.passed}/${result.total})`);
  (result.scoreResults || []).forEach(sr => {
    const title = sr.assertion.type + (sr.assertion.tool ? ` \u2192 ${sr.assertion.tool}` : '');
    const { row, info } = checkRow(sr.pass, title, sr.reason);
    if (sr.assertion.pattern) {
      const pat = createEl('div', { style: { fontSize: '12px', color: 'var(--text-light)', marginTop: '2px', fontFamily: 'monospace' } });
      pat.textContent = `pattern: ${sr.assertion.pattern}`;
      info.appendChild(pat);
    }
    assertSection.appendChild(row);
  });
  container.appendChild(assertSection);

  // Test Input: a single input, or one line per step when the case has steps
  if (caseDef) {
    const inputSection = newSection('Test Input');
    const inputText = caseDef.input || (caseDef.steps ? caseDef.steps.map((s, i) => `Step ${i + 1}: ${s.input}`).join('\n') : '');
    inputSection.appendChild(createEl('div', { className: 'response-box', textContent: inputText }));
    container.appendChild(inputSection);
  }

  // LLM Response
  const respSection = newSection('LLM Response');
  respSection.appendChild(createEl('div', { className: 'response-box', textContent: result.response || '(no response captured)' }));
  container.appendChild(respSection);

  // Vault Changes (skipped when the result carries no vaultDiff)
  if (result.vaultDiff) {
    const vaultSection = newSection('Vault Changes');
    const vaultDiff = createEl('div', { className: 'vault-diff' });
    ['created', 'modified', 'deleted'].forEach(key => {
      const item = createEl('div', { className: `vault-diff-item ${key}` });
      item.appendChild(createEl('span', { className: 'count' }, String(result.vaultDiff[key])));
      item.appendChild(document.createTextNode(` ${key}`));
      vaultDiff.appendChild(item);
    });
    vaultSection.appendChild(vaultDiff);
    container.appendChild(vaultSection);
  }

  // Judge Results
  if (result.judgeResults) {
    const judgeSection = newSection('Judge Evaluation');
    Object.entries(result.judgeResults).forEach(([key, jr]) => {
      judgeSection.appendChild(checkRow(jr.pass, key, jr.reason || '').row);
    });
    container.appendChild(judgeSection);
  }

  // Case Definition
  if (caseDef) {
    const defSection = newSection('Case Definition');
    const defBox = createEl('div', { className: 'response-box' });
    defBox.style.fontFamily = "'SF Mono', 'Fira Code', monospace";
    defBox.style.fontSize = '13px';
    defBox.textContent = JSON.stringify(caseDef, null, 2);
    defSection.appendChild(defBox);
    container.appendChild(defSection);
  }

  showView('results');
}
389
+
390
+ // === History ===
391
/**
 * Render the History view from `state.history`: a bar chart of pass rate per
 * run (in reverse of the order `state.history` is stored in) and a list of
 * runs. Clicking a bar or a row loads that run and opens its detail page.
 *
 * With no runs, the whole view is replaced by an empty-state message.
 */
function renderHistory() {
  const runs = state.history;
  if (!runs.length) {
    const el = document.getElementById('view-history');
    el.replaceChildren(createEl('div', { className: 'empty-state' }, [createEl('h2', {}, 'No history yet')]));
    return;
  }

  // Shared by the chart bars and the list rows. A failed request is logged
  // here because a rejection inside an event listener would otherwise surface
  // only as an unhandled promise rejection.
  const openRun = async (runId) => {
    let runData;
    try {
      runData = await api(`run/${runId}`);
    } catch (err) {
      console.error(`Failed to load run ${runId}:`, err);
      return;
    }
    showRunDetail(runData);
  };

  // Bar chart
  const chartContainer = document.getElementById('history-chart');
  chartContainer.replaceChildren();
  chartContainer.appendChild(createEl('h3', { style: { marginBottom: '16px' } }, 'Pass Rate Over Time'));
  const barsEl = createEl('div', { className: 'chart-bars' });
  runs.slice().reverse().forEach(r => {
    // Height scales with the pass rate (150px at 100%) and never drops below
    // 4px, so a 0% run still shows a bar.
    const h = Math.max(4, r.passRate * 150);
    const wrapper = createEl('div', { className: 'chart-bar-wrapper', 'data-run-id': r.id });
    wrapper.appendChild(createEl('div', { className: 'chart-value' }, pct(r.passRate)));
    const bar = createEl('div', { className: 'chart-bar' });
    bar.style.height = `${h}px`;
    bar.style.background = barColor(r.passRate);
    bar.title = `${r.id}: ${pct(r.passRate)}`;
    wrapper.appendChild(bar);
    wrapper.appendChild(createEl('div', { className: 'chart-label' }, shortDate(r.timestamp)));
    wrapper.addEventListener('click', () => openRun(r.id));
    barsEl.appendChild(wrapper);
  });
  chartContainer.appendChild(barsEl);

  // List
  const listEl = document.getElementById('history-list');
  listEl.replaceChildren(...runs.map(r => {
    const row = createEl('div', { className: 'history-row', 'data-run-id': r.id });
    const info = createEl('div');
    info.appendChild(createEl('div', { className: 'run-label' }, r.id));
    info.appendChild(createEl('div', { className: 'run-date' }, formatDate(r.timestamp)));
    row.appendChild(info);
    row.appendChild(createEl('div', {}, `${r.caseCount} cases`));
    row.appendChild(createEl('div', { className: `result-score ${statusOf(r.passRate)}` }, `${r.totalPassed}/${r.totalAssertions}`));
    row.appendChild(statusPill(statusOf(r.passRate)));
    row.addEventListener('click', () => openRun(r.id));
    return row;
  }));
}
440
+
441
/**
 * Render the detail page for a whole run into `#result-detail` and switch to
 * the results view.
 *
 * The page has a back-to-history button, a header with the run id, date and
 * overall pass rate, three summary cards, and one clickable row per case that
 * opens that case's detail for this run.
 *
 * @param {Object} runData - Run object with `id`, `timestamp` and `results`.
 */
function showRunDetail(runData) {
  const container = document.getElementById('result-detail');
  container.replaceChildren();

  let passedSum = 0;
  let assertionSum = 0;
  for (const entry of runData.results) {
    passedSum += entry.passed;
    assertionSum += entry.total;
  }
  const rate = assertionSum ? passedSum / assertionSum : 0;
  const runStatus = statusOf(rate);

  // Back button
  const backBtn = createEl('button', { className: 'detail-back' }, '\u2190 Back to history');
  backBtn.addEventListener('click', () => showView('history'));
  container.appendChild(backBtn);

  // Header: run id and date on the left, large pass rate on the right
  const titleBlock = createEl('div', {}, [
    createEl('h1', { className: 'detail-title' }, runData.id),
    createEl('p', { style: { color: 'var(--text-gray)', marginTop: '4px' } }, formatDate(runData.timestamp)),
  ]);
  const bigScore = createEl('div', { className: `stat-value ${runStatus}` }, pct(rate));
  bigScore.style.fontSize = '48px';
  container.appendChild(createEl('div', { className: 'detail-header' }, [titleBlock, bigScore]));

  // Summary cards
  const statsGrid = createEl('div', { className: 'stats-grid' });
  statsGrid.style.gridTemplateColumns = 'repeat(3, 1fr)';
  [
    buildStatCard('Cases', String(runData.results.length), 'neutral', ''),
    buildStatCard('Assertions', `${passedSum}/${assertionSum}`, runStatus, ''),
    buildStatCard('Pass Rate', pct(rate), runStatus, ''),
  ].forEach((card) => statsGrid.appendChild(card));
  container.appendChild(statsGrid);

  // Per-case rows
  const resultsList = createEl('div', { className: 'results-list' });
  for (const entry of runData.results) {
    const definition = state.caseMap[entry.case];
    const status = statusOf(entry.passRate);

    const info = createEl('div', {}, [
      createEl('div', { className: 'result-name' }, entry.case),
      createEl('div', { className: 'result-desc' }, definition ? definition.description : ''),
    ]);

    const row = createEl('div', { className: 'result-row' }, [
      createEl('div', { className: `result-icon ${status}` }, statusIcon(entry.passRate)),
      info,
      difficultyPill(definition ? definition.difficulty : '?'),
      createEl('div', { className: `result-score ${status}` }, `${entry.passed}/${entry.total}`),
      statusPill(status),
    ]);
    row.addEventListener('click', () => showResultDetail(entry.case, runData));
    resultsList.appendChild(row);
  }
  container.appendChild(resultsList);

  showView('results');
}
491
+
492
+ // === Compare ===
493
/**
 * Populate the two run selectors of the Compare view and wire the compare button.
 *
 * Selector A starts with a `latest` entry (when a latest run exists), selector
 * B with a `baseline` entry (when a baseline exists); both then list every run
 * from `state.history`. Without a baseline and with more than one run,
 * selector B is preselected to its second entry.
 */
function renderCompareSelectors() {
  const { latest, baseline, history: runs } = state;
  const selectA = document.getElementById('compare-a');
  const selectB = document.getElementById('compare-b');

  // Clear existing
  selectA.replaceChildren();
  selectB.replaceChildren();

  if (latest) {
    let passed = 0;
    let total = 0;
    for (const entry of latest.results) {
      passed += entry.passed;
      total += entry.total;
    }
    const rate = total ? passed / total : 0;
    selectA.appendChild(createEl('option', { value: 'latest' }, `latest (${pct(rate)})`));
  }

  if (baseline) {
    selectB.appendChild(createEl('option', { value: 'baseline' }, 'baseline'));
  }

  for (const entry of runs) {
    const label = `${entry.id} \u2014 ${pct(entry.passRate)} (${shortDate(entry.timestamp)})`;
    // Each select needs its own <option> node; one node cannot live in both.
    [selectA, selectB].forEach((select) => {
      select.appendChild(createEl('option', { value: entry.id }, label));
    });
  }

  if (!baseline && runs.length > 1) {
    selectB.selectedIndex = 1;
  }

  document.getElementById('compare-btn').addEventListener('click', doCompare);
}
522
+
523
/**
 * Compare the two runs chosen in the Compare view and render a table with one
 * row per case: difficulty, pass rate in each run, and the delta (A minus B).
 *
 * Rows are ordered by ascending delta, so the largest regressions in run A
 * come first; cases present in only one run have no delta and are listed last.
 *
 * @returns {Promise<void>} Resolves once the table (or an error message) is rendered.
 */
async function doCompare() {
  const aId = document.getElementById('compare-a').value;
  const bId = document.getElementById('compare-b').value;
  const container = document.getElementById('compare-results');

  // 'latest' and 'baseline' are already in memory; anything else is a run id.
  const loadRun = (id) => {
    if (id === 'latest') return state.latest;
    if (id === 'baseline') return state.baseline;
    return api(`run/${id}`);
  };

  let runA;
  let runB;
  try {
    // The two runs are independent, so fetch them in parallel.
    [runA, runB] = await Promise.all([loadRun(aId), loadRun(bId)]);
  } catch (err) {
    // This runs as a click listener, so a rejection would otherwise surface
    // only as an unhandled promise rejection.
    console.error('Compare failed:', err);
    container.replaceChildren(
      createEl('div', { className: 'empty-state' }, [createEl('h2', {}, 'Could not load runs to compare')])
    );
    return;
  }

  if (!runA || !runB) return;

  // Index both runs by case name so each row is an O(1) lookup. For a
  // duplicated case name, run A keeps its first entry and run B its last,
  // matching the find()/assignment pair this replaces.
  const aMap = new Map();
  runA.results.forEach(r => { if (!aMap.has(r.case)) aMap.set(r.case, r); });
  const bMap = new Map();
  runB.results.forEach(r => { bMap.set(r.case, r); });

  const allCases = [...new Set([...runA.results.map(r => r.case), ...runB.results.map(r => r.case)])];

  const rows = allCases.map(c => {
    const a = aMap.get(c);
    const b = bMap.get(c);
    const caseDef = state.caseMap[c];
    const rateA = a ? a.passRate : null;
    const rateB = b ? b.passRate : null;
    const delta = (rateA !== null && rateB !== null) ? rateA - rateB : null;
    return { case: c, caseDef, rateA, rateB, delta };
  }).sort((x, y) => {
    // Rows without a delta sort after every row with one. Treating them as
    // "equal to everything" makes the comparator inconsistent and the
    // resulting order unpredictable.
    if (x.delta === null && y.delta === null) return 0;
    if (x.delta === null) return 1;
    if (y.delta === null) return -1;
    return x.delta - y.delta;
  });

  const table = createEl('table', { className: 'compare-table' });
  const thead = createEl('thead');
  const headerRow = createEl('tr');
  ['Case', 'Difficulty', 'Run A', 'Run B', 'Delta'].forEach(h => {
    headerRow.appendChild(createEl('th', {}, h));
  });
  thead.appendChild(headerRow);
  table.appendChild(thead);

  // A missing rate renders as an em dash with no status class.
  const rateCell = (rate) => createEl(
    'td',
    { className: rate !== null ? statusOf(rate) : '' },
    rate !== null ? pct(rate) : '\u2014'
  );

  const tbody = createEl('tbody');
  rows.forEach(r => {
    const tr = createEl('tr');

    const tdCase = createEl('td');
    tdCase.appendChild(createEl('strong', {}, r.case));
    tr.appendChild(tdCase);

    const tdDiff = createEl('td');
    tdDiff.appendChild(r.caseDef ? difficultyPill(r.caseDef.difficulty) : document.createTextNode('\u2014'));
    tr.appendChild(tdDiff);

    tr.appendChild(rateCell(r.rateA));
    tr.appendChild(rateCell(r.rateB));

    const tdDelta = createEl('td');
    if (r.delta !== null) {
      if (r.delta > 0) tdDelta.appendChild(createEl('span', { className: 'delta-up' }, `+${pct(r.delta)}`));
      else if (r.delta < 0) tdDelta.appendChild(createEl('span', { className: 'delta-down' }, pct(r.delta)));
      else tdDelta.appendChild(createEl('span', { className: 'delta-same' }, '='));
    } else {
      tdDelta.textContent = '\u2014';
    }
    tr.appendChild(tdDelta);

    tbody.appendChild(tr);
  });
  table.appendChild(tbody);
  container.replaceChildren(table);
}
591
+
592
+ // === Rubrics ===
593
/**
 * Fetch the judge rubrics and render one card per rubric into `#rubrics-content`.
 *
 * String rubrics are shown verbatim; any other value is pretty-printed as
 * JSON. A falsy response shows "No rubrics found"; a failure while loading or
 * rendering is logged and shows "Could not load rubrics".
 *
 * @returns {Promise<void>}
 */
async function renderRubrics() {
  const container = document.getElementById('rubrics-content');
  try {
    const rubrics = await api('rubrics');
    if (!rubrics) {
      container.replaceChildren(createEl('div', { className: 'empty-state' }, [createEl('h2', {}, 'No rubrics found')]));
      return;
    }

    container.replaceChildren(...Object.entries(rubrics).map(([key, val]) => {
      const card = createEl('div', { className: 'rubric-card' });
      card.appendChild(createEl('h2', {}, key));
      const prompt = createEl('div', { className: 'rubric-prompt' });
      prompt.textContent = typeof val === 'string' ? val : JSON.stringify(val, null, 2);
      card.appendChild(prompt);
      return card;
    }));
  } catch (err) {
    // Keep the cause visible in the console; the page only shows a generic message.
    console.error('Failed to load rubrics:', err);
    container.replaceChildren(createEl('div', { className: 'empty-state' }, [createEl('h2', {}, 'Could not load rubrics')]));
  }
}
615
+
616
+ // === Boot ===
617
// Start the dashboard. If init() rejects, log the error and replace the
// contents of `.main` with a "Failed to load" notice carrying its message.
init().catch((err) => {
  console.error('Dashboard init failed:', err);

  const heading = createEl('h2', {}, 'Failed to load');
  const detail = createEl('p', {}, err.message);
  const notice = createEl('div', { className: 'empty-state' }, [heading, detail]);

  document.querySelector('.main').replaceChildren(notice);
});