@sebastiantuyu/agest 0.2.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/preview.js CHANGED
@@ -2,7 +2,7 @@ import { readFile, writeFile } from "fs/promises";
2
2
  import { join, relative } from "path";
3
3
  import os from "os";
4
4
  import { exec } from "child_process";
5
- import { parseReport, findReports, loadDiffEntry, computeDiff, ensureDimensions, findVaryingDimensions, groupByDimension, findControlledPairs, diffConfigs, } from "./reports.js";
5
+ import { parseReport, findReports, loadDiffEntry, wilsonLowerBound, computeDiff, formatDuration, ensureDimensions, findVaryingDimensions, groupByDimension, findControlledPairs, diffConfigs, } from "./reports.js";
6
6
  // ---------------------------------------------------------------------------
7
7
  // Helpers
8
8
  // ---------------------------------------------------------------------------
@@ -59,6 +59,109 @@ function formatTimestamp(ts) {
59
59
  }
60
60
  }
61
61
  // ---------------------------------------------------------------------------
62
+ // Smart dimension labels
63
+ // ---------------------------------------------------------------------------
64
/**
 * Build human-readable labels for each unique value of each dimension.
 *
 * Labelling rules per dimension:
 * - model:  values longer than 16 characters are reduced to the segment after
 *           the last "/" (capped at 16 characters); shorter values are kept.
 * - tools:  "none" becomes "no tools"; one or two comma-separated tools are
 *           listed; three or more are summarised as "<n> tools".
 * - prompt: the first distinct prompt is labelled with the first non-blank
 *           line of its system prompt (or "baseline"); later prompts are
 *           labelled with up to two change snippets computed against the
 *           previous distinct prompt, falling back to "v<n>".
 * - other:  "v1", "v2", ... in first-seen order.
 *
 * @param {Array<object>} sorted - Reports in the order labels should be
 *   assigned (the visible caller passes them sorted by timestamp). Each report
 *   may carry a `dimensions` record; missing values are treated as "?".
 * @param {string[]} allDims - Dimension names to build labels for.
 * @param {Array<object>|null} [diffEntries] - Diff entries read by the same
 *   index as `sorted`. May be omitted; prompt labels then fall back to
 *   "baseline" / "v<n>".
 *   NOTE(review): the visible caller passes `group.diffEntries` while `sorted`
 *   is a re-sorted copy of the reports — confirm both share the same order.
 * @returns {Map<string, Map<string, string>>} dimension -> (value -> label).
 */
function buildSmartLabels(sorted, allDims, diffEntries) {
    // Tolerate a missing diff list instead of throwing on `diffEntries[i]`.
    const entries = diffEntries ?? [];
    const valueOf = (report, dim) => report.dimensions?.[dim] ?? "?";

    const modelLabel = (val) => val.length > 16
        ? val.split("/").pop()?.slice(0, 16) ?? val.slice(0, 16)
        : val;

    const toolsLabel = (val) => {
        if (val === "none") {
            return "no tools";
        }
        const toolList = val.split(",");
        return toolList.length <= 2 ? toolList.join(", ") : `${toolList.length} tools`;
    };

    const baselineLabel = (diff) => {
        if (typeof diff?.systemPrompt !== "string" || diff.systemPrompt === "") {
            return "baseline";
        }
        // Trim so an indented first line does not spend the label budget on
        // leading whitespace.
        const firstLine = diff.systemPrompt.split("\n").find((l) => l.trim())?.trim() ?? "";
        if (firstLine.length > 28) {
            return firstLine.slice(0, 27) + "…";
        }
        return firstLine || "baseline";
    };

    const changeLabel = (prevDiff, diff, fallback) => {
        if (!diff || !prevDiff) {
            return fallback;
        }
        const changes = computeDiff(prevDiff, diff);
        const promptChanges = changes
            .filter((l) => l.startsWith("prompt:"))
            .map((l) => l.replace(/^prompt:\s*/, "").slice(0, 30));
        const toolChanges = changes
            .filter((l) => l.startsWith("tools:"))
            .map((l) => l.replace(/^tools:\s*/, "").slice(0, 30));
        const snippets = [...promptChanges, ...toolChanges].slice(0, 2);
        return snippets.length > 0 ? snippets.join(", ") : fallback;
    };

    const labels = new Map();
    for (const dim of allDims) {
        const dimLabels = new Map();
        if (dim === "prompt") {
            // A Map keeps first-seen order and gives O(1) membership checks.
            // Only the diff entry paired with the FIRST occurrence of a hash
            // is considered, matching how hashes are introduced over time.
            const hashToDiff = new Map();
            sorted.forEach((report, i) => {
                const hash = valueOf(report, dim);
                if (!hashToDiff.has(hash)) {
                    hashToDiff.set(hash, entries[i] || undefined);
                }
            });
            const hashes = [...hashToDiff.keys()];
            hashes.forEach((hash, j) => {
                const diff = hashToDiff.get(hash);
                if (j === 0) {
                    dimLabels.set(hash, baselineLabel(diff));
                }
                else {
                    const prevDiff = hashToDiff.get(hashes[j - 1]);
                    dimLabels.set(hash, changeLabel(prevDiff, diff, `v${j + 1}`));
                }
            });
        }
        else {
            for (const report of sorted) {
                const val = valueOf(report, dim);
                if (dimLabels.has(val)) {
                    continue;
                }
                if (dim === "model") {
                    dimLabels.set(val, modelLabel(val));
                }
                else if (dim === "tools") {
                    dimLabels.set(val, toolsLabel(val));
                }
                else {
                    // Generic fallback: version numbering in first-seen order.
                    dimLabels.set(val, `v${dimLabels.size + 1}`);
                }
            }
        }
        labels.set(dim, dimLabels);
    }
    return labels;
}
164
+ // ---------------------------------------------------------------------------
62
165
  // Rendering
63
166
  // ---------------------------------------------------------------------------
64
167
  function renderTools(tools) {
@@ -133,6 +236,195 @@ function renderRunRow(entry, idx) {
133
236
  </div>`;
134
237
  }
135
238
  // ---------------------------------------------------------------------------
239
+ // Radar Chart (suite breakdown)
240
+ // ---------------------------------------------------------------------------
241
// Border/fill colour pairs cycled through (by model index) for radar datasets.
const RADAR_COLORS = [
    { border: "#f87171", fill: "rgba(248,113,113,0.15)" },
    { border: "#38bdf8", fill: "rgba(56,189,248,0.15)" },
    { border: "#4ade80", fill: "rgba(74,222,128,0.15)" },
    { border: "#facc15", fill: "rgba(250,204,21,0.15)" },
    { border: "#a78bfa", fill: "rgba(167,139,250,0.15)" },
    { border: "#fb923c", fill: "rgba(251,146,60,0.15)" },
    { border: "#f472b6", fill: "rgba(244,114,182,0.15)" },
    { border: "#2dd4bf", fill: "rgba(45,212,191,0.15)" },
];
/**
 * Render the "suite breakdown" card: a radar chart of per-suite success rates.
 *
 * One combined chart (all models) is always emitted; when the group contains
 * more than one model, a hidden per-model chart and a model selector are
 * emitted as well. Each dataset carries `_rawData` and `_wilsonData` next to
 * `data` (raw success rate vs. `wilsonLowerBound(...)`, both in percent).
 *
 * For each model the LAST report in `group.runs` order is charted.
 * NOTE(review): this presumes `group.runs` is chronological — confirm.
 *
 * @param {{label: string, runs: Array<{report: object}>}} group - Agent group;
 *   reports without a non-empty `suites` array are ignored.
 * @returns {string} HTML for the card, or "" when no report has suites or
 *   fewer than 3 distinct suite names exist (a radar needs at least 3 axes).
 */
function renderRadarChart(group) {
    const reportsWithSuites = group.runs
        .map((run) => run.report)
        .filter((report) => report.suites && report.suites.length > 0);
    if (reportsWithSuites.length === 0) {
        return "";
    }
    // Collect all unique suite names (first-seen order) — these are the axes.
    const allSuiteNames = [
        ...new Set(reportsWithSuites.flatMap((report) => report.suites.map((s) => s.name))),
    ];
    if (allSuiteNames.length < 3) {
        return ""; // Radar needs at least 3 axes
    }
    // Group by model — each model gets its own dataset.
    const byModel = new Map();
    for (const report of reportsWithSuites) {
        const model = report.model ?? "unknown";
        const bucket = byModel.get(model) ?? [];
        bucket.push(report);
        byModel.set(model, bucket);
    }
    const agentId = escHtml(group.label).replace(/\s+/g, "-").toLowerCase();
    const modelEntries = [...byModel.entries()];
    const allModels = modelEntries.map(([model]) => model);
    const showToggle = allModels.length > 1;

    // JSON placed inside an inline <script> must not contain "<": a suite or
    // model name containing "</script>" would otherwise terminate the script
    // element. "\u003c" is an equivalent escape inside JS string literals.
    const toInlineJson = (value) => JSON.stringify(value).replace(/</g, "\\u003c");
    const toPercent = (fraction) => +(fraction * 100).toFixed(1);

    // One dataset per model, shared by the combined and per-model charts.
    const allDatasets = modelEntries.map(([model, reports], i) => {
        const latest = reports[reports.length - 1];
        const color = RADAR_COLORS[i % RADAR_COLORS.length];
        const suiteFor = (suiteName) => latest.suites.find((s) => s.name === suiteName);
        // Suites missing from this model's latest report are plotted as 0.
        const rawData = allSuiteNames.map((suiteName) => {
            const suite = suiteFor(suiteName);
            return suite ? toPercent(suite.successRate) : 0;
        });
        const wilsonData = allSuiteNames.map((suiteName) => {
            const suite = suiteFor(suiteName);
            return suite ? toPercent(wilsonLowerBound(suite.successRate, suite.totalCases)) : 0;
        });
        const short = model.split("/").pop()?.slice(0, 24) ?? model.slice(0, 24);
        return {
            label: short,
            data: rawData,
            _rawData: rawData,
            _wilsonData: wilsonData,
            borderColor: color.border,
            backgroundColor: color.fill,
            pointBackgroundColor: color.border,
            pointBorderColor: "#18181b",
            pointRadius: 4,
            borderWidth: 2,
        };
    });
    // Chart options are emitted as JS source (not JSON) because they contain
    // callback functions.
    const radarOptions = `{
        responsive: true,
        maintainAspectRatio: false,
        plugins: {
          legend: {
            labels: {
              color: '#a1a1aa',
              font: { family: 'ui-monospace, monospace', size: 10 },
              boxWidth: 12,
              padding: 16
            }
          },
          tooltip: {
            callbacks: {
              label: function(ctx) { return ctx.dataset.label + ': ' + ctx.parsed.r + '%'; }
            }
          }
        },
        scales: {
          r: {
            min: 0,
            max: 100,
            ticks: {
              color: '#71717a',
              backdropColor: 'transparent',
              font: { family: 'ui-monospace, monospace', size: 9 },
              callback: function(v) { return v + '%'; }
            },
            pointLabels: {
              color: '#a1a1aa',
              font: { family: 'ui-monospace, monospace', size: 11 }
            },
            grid: { color: '#27272a' },
            angleLines: { color: '#27272a' }
          }
        }
      }`;
    const labelsJson = toInlineJson(allSuiteNames);
    // One canvas + bootstrap script; the chart instance is registered in
    // window.__agestCharts under its canvas id.
    const renderView = (canvasId, modelAttr, display, datasets) => `
    <div class="radar-model-view" data-agent="${agentId}" data-model="${modelAttr}" style="display:${display}">
      <div style="position:relative;height:400px">
        <canvas id="${canvasId}"></canvas>
      </div>
      <script>
        (function() {
          var chart = new Chart(document.getElementById('${canvasId}'), {
            type: 'radar',
            data: {
              labels: ${labelsJson},
              datasets: ${toInlineJson(datasets)}
            },
            options: ${radarOptions}
          });
          window.__agestCharts['${canvasId}'] = chart;
        })();
      </script>
    </div>`;
    // Combined view (visible by default).
    let canvasesHtml = renderView(`radar-all-${agentId}`, "__all__", "block", allDatasets);
    // Per-model views (hidden by default), only when there is something to toggle.
    if (showToggle) {
        modelEntries.forEach(([model], i) => {
            canvasesHtml += renderView(`radar-${agentId}-${i}`, escHtml(model), "none", [allDatasets[i]]);
        });
    }
    // Model selector dropdown (only when multiple models).
    const modelSelector = showToggle
        ? `<select class="radar-model-select bg-zinc-800 text-zinc-300 text-xs border border-zinc-700 rounded px-2 py-1"
          data-agent="${agentId}"
          onchange="filterRadarModel('${agentId}', this.value)">
          <option value="__all__">All Models</option>
          ${allModels.map((m) => `<option value="${escHtml(m)}">${escHtml(m.split("/").pop()?.slice(0, 30) ?? m.slice(0, 30))}</option>`).join("\n")}
        </select>`
        : "";
    return `
  <div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">
    <div class="flex items-center justify-between mb-4">
      <span class="text-xs text-zinc-600 uppercase tracking-wider">suite breakdown</span>
      ${modelSelector}
    </div>
    ${canvasesHtml}
  </div>`;
}
427
+ // ---------------------------------------------------------------------------
136
428
  // Grouped Bar Chart (benchmark-style)
137
429
  // ---------------------------------------------------------------------------
138
430
  const SERIES_COLORS = [
@@ -145,7 +437,7 @@ const SERIES_COLORS = [
145
437
  { bg: "#f472b6", text: "#f9a8d4" }, // pink
146
438
  { bg: "#2dd4bf", text: "#5eead4" }, // teal
147
439
  ];
148
- function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agentId, isActive) {
440
+ function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel) {
149
441
  const otherDims = allDims.filter((d) => d !== groupDim);
150
442
  // Column dimension: prefer "model", else first other dim
151
443
  const colDim = otherDims.includes("model") ? "model" : otherDims[0];
@@ -202,6 +494,7 @@ function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agen
202
494
  return `<td class="px-4 py-2"><span class="text-xs text-zinc-700">&mdash;</span></td>`;
203
495
  }
204
496
  const pct = r.successRate * 100;
497
+ const wilsonPct = wilsonLowerBound(r.successRate, r.totalCases) * 100;
205
498
  const color = barColor(r.successRate);
206
499
  const tc = rateClass(r.successRate);
207
500
  return `<td class="px-4 py-2">
@@ -209,7 +502,7 @@ function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agen
209
502
  <div class="flex-1 bg-zinc-800 rounded h-2 overflow-hidden" style="min-width:80px">
210
503
  <div class="h-2 rounded" style="width:${pct.toFixed(1)}%;background:${color}"></div>
211
504
  </div>
212
- <span class="text-sm font-medium ${tc} w-12 text-right">${pct.toFixed(0)}%</span>
505
+ <span class="text-sm font-medium ${tc} w-12 text-right" data-raw="${pct.toFixed(0)}%" data-wilson="${wilsonPct.toFixed(0)}%">${pct.toFixed(0)}%</span>
213
506
  </div>
214
507
  </td>`;
215
508
  })
@@ -239,10 +532,7 @@ function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agen
239
532
  })
240
533
  .filter(Boolean)
241
534
  .join("\n");
242
- return `<div class="chart-view" data-agent="${agentId}" data-dim="${escHtml(groupDim)}" style="display:${isActive ? "block" : "none"}">
243
- <div class="mb-4">
244
- <div class="text-xs text-zinc-600 uppercase tracking-wider mb-1">grouped by ${escHtml(groupDim)}</div>
245
- </div>
535
+ return `
246
536
  <div class="overflow-x-auto">
247
537
  <table class="w-full">
248
538
  <thead>
@@ -258,8 +548,7 @@ function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agen
258
548
  </div>
259
549
  <div class="mt-4 pt-3 border-t border-zinc-800/50 space-y-1">
260
550
  ${versionRef}
261
- </div>
262
- </div>`;
551
+ </div>`;
263
552
  }
264
553
  function renderGroupedBarChart(group) {
265
554
  const reports = group.runs.map((r) => r.report);
@@ -274,42 +563,19 @@ function renderGroupedBarChart(group) {
274
563
  if (varying.length < 1)
275
564
  return "";
276
565
  const agentId = escHtml(group.label).replace(/\s+/g, "-").toLowerCase();
277
- // Build version labels for each dimension: first unique value seen = v1, etc.
278
- const versionMaps = new Map();
279
566
  const sorted = [...reports].sort((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime());
280
- for (const dim of allDims) {
281
- const seen = new Map();
282
- let idx = 1;
283
- for (const r of sorted) {
284
- const val = r.dimensions?.[dim] ?? "?";
285
- if (!seen.has(val)) {
286
- seen.set(val, `v${idx}`);
287
- idx++;
288
- }
289
- }
290
- versionMaps.set(dim, seen);
291
- }
567
+ // Build smart labels using diff entries for prompt/tools readability
568
+ const versionMaps = buildSmartLabels(sorted, allDims, group.diffEntries);
292
569
  const dimLabel = (dim, val) => {
293
- const vMap = versionMaps.get(dim);
294
- const version = vMap?.get(val) ?? "?";
295
- // For model, show short model name. For others, show version tag.
296
- if (dim === "model") {
297
- const short = val.length > 16 ? val.split("/").pop()?.slice(0, 16) ?? val.slice(0, 16) : val;
298
- return short;
299
- }
300
- // For tools, show "none" directly instead of a version tag
301
- if (dim === "tools" && val === "none") {
302
- return "none";
303
- }
304
- return version;
570
+ return versionMaps.get(dim)?.get(val) ?? val;
305
571
  };
306
572
  // Build a chart for each possible grouping dimension
307
573
  const charts = varying.map((groupDim, dimIdx) => {
308
574
  const isActive = dimIdx === 0;
309
- // For non-model dimensions, render a matrix/heatmap view instead of bar chart
310
- if (groupDim !== "model") {
311
- return renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agentId, isActive);
312
- }
575
+ // Also render a matrix/table view for non-model dimensions (hidden by default)
576
+ const matrixHtml = groupDim !== "model"
577
+ ? renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel)
578
+ : "";
313
579
  const otherDims = allDims.filter((d) => d !== groupDim);
314
580
  // Group runs by the grouping dimension
315
581
  const groupVals = [...new Set(sorted.map((r) => r.dimensions?.[groupDim] ?? "?"))];
@@ -330,12 +596,17 @@ function renderGroupedBarChart(group) {
330
596
  const color = SERIES_COLORS[ci % SERIES_COLORS.length];
331
597
  const parts = ck.split("|");
332
598
  const cfgLabel = otherDims.map((d, i) => `${d}: ${dimLabel(d, parts[i] ?? "?")}`).join(", ");
333
- const data = groupVals.map((gv) => {
599
+ const rawData = groupVals.map((gv) => {
334
600
  const groupRuns = grouped.get(gv) ?? [];
335
601
  const match = groupRuns.find((r) => configKey(r) === ck);
336
602
  return match ? +(match.successRate * 100).toFixed(1) : null;
337
603
  });
338
- return { label: cfgLabel, data, backgroundColor: color.bg, borderColor: color.bg, borderWidth: 0, borderRadius: 4 };
604
+ const wilsonData = groupVals.map((gv) => {
605
+ const groupRuns = grouped.get(gv) ?? [];
606
+ const match = groupRuns.find((r) => configKey(r) === ck);
607
+ return match ? +(wilsonLowerBound(match.successRate, match.totalCases) * 100).toFixed(1) : null;
608
+ });
609
+ return { label: cfgLabel, data: rawData, _rawData: rawData, _wilsonData: wilsonData, backgroundColor: color.bg, borderColor: color.bg, borderWidth: 0, borderRadius: 4 };
339
610
  });
340
611
  const canvasId = `bar-${agentId}-${escHtml(groupDim)}`;
341
612
  // Version reference
@@ -354,55 +625,66 @@ function renderGroupedBarChart(group) {
354
625
  })
355
626
  .filter(Boolean)
356
627
  .join("\n");
628
+ const viewToggle = matrixHtml ? `<button
629
+ class="view-toggle text-[10px] text-zinc-600 hover:text-zinc-400 transition-colors"
630
+ data-agent="${agentId}" data-dim="${escHtml(groupDim)}"
631
+ onclick="switchView('${agentId}', '${escHtml(groupDim)}')"
632
+ >Table</button>` : "";
357
633
  return `<div class="chart-view" data-agent="${agentId}" data-dim="${escHtml(groupDim)}" style="display:${isActive ? "block" : "none"}">
358
- <div class="mb-4">
359
- <div class="text-xs text-zinc-600 uppercase tracking-wider mb-1">grouped by ${escHtml(groupDim)}</div>
360
- </div>
361
- <div style="position:relative;height:280px">
362
- <canvas id="${canvasId}"></canvas>
634
+ <div class="flex items-center justify-between mb-4">
635
+ <div class="text-xs text-zinc-600 uppercase tracking-wider">grouped by ${escHtml(groupDim)}</div>
636
+ ${viewToggle}
363
637
  </div>
364
- <script>
365
- new Chart(document.getElementById('${canvasId}'), {
366
- type: 'bar',
367
- data: { labels: ${JSON.stringify(labels)}, datasets: ${JSON.stringify(datasets)} },
368
- options: {
369
- responsive: true,
370
- maintainAspectRatio: false,
371
- plugins: {
372
- legend: { labels: { color: '#a1a1aa', font: { family: 'ui-monospace, monospace', size: 10 }, boxWidth: 12, padding: 16 } },
373
- tooltip: { callbacks: { label: function(ctx) { return ctx.dataset.label + ': ' + ctx.parsed.y + '%'; } } }
374
- },
375
- scales: {
376
- x: { ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 } }, grid: { color: '#27272a' } },
377
- y: { min: 0, max: 100, ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 }, callback: function(v) { return v + '%'; } }, grid: { color: '#27272a' } }
378
- }
379
- }
380
- });
381
- </script>
382
- <div class="mt-4 pt-3 border-t border-zinc-800/50 space-y-1">
383
- ${versionRef}
638
+ <div class="bar-view" data-agent="${agentId}" data-dim="${escHtml(groupDim)}">
639
+ <div style="position:relative;height:280px">
640
+ <canvas id="${canvasId}"></canvas>
641
+ </div>
642
+ <script>
643
+ (function() {
644
+ var chart = new Chart(document.getElementById('${canvasId}'), {
645
+ type: 'bar',
646
+ data: { labels: ${JSON.stringify(labels)}, datasets: ${JSON.stringify(datasets)} },
647
+ options: {
648
+ responsive: true,
649
+ maintainAspectRatio: false,
650
+ plugins: {
651
+ legend: { labels: { color: '#a1a1aa', font: { family: 'ui-monospace, monospace', size: 10 }, boxWidth: 12, padding: 16 } },
652
+ tooltip: { callbacks: { label: function(ctx) { return ctx.dataset.label + ': ' + ctx.parsed.y + '%'; } } }
653
+ },
654
+ scales: {
655
+ x: { ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 } }, grid: { color: '#27272a' } },
656
+ y: { min: 0, max: 100, ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 }, callback: function(v) { return v + '%'; } }, grid: { color: '#27272a' } }
657
+ }
658
+ }
659
+ });
660
+ window.__agestCharts['${canvasId}'] = chart;
661
+ })();
662
+ </script>
663
+ <div class="mt-4 pt-3 border-t border-zinc-800/50 space-y-1">
664
+ ${versionRef}
665
+ </div>
384
666
  </div>
667
+ ${matrixHtml ? `<div class="table-view" data-agent="${agentId}" data-dim="${escHtml(groupDim)}" style="display:none">${matrixHtml}</div>` : ""}
385
668
  </div>`;
386
669
  });
387
- // Dimension toggle tabs
388
- const tabs = varying
389
- .map((dim, i) => {
390
- const active = i === 0;
391
- return `<button
392
- class="dim-tab px-3 py-1.5 text-xs rounded-md transition-colors ${active ? "bg-zinc-700 text-zinc-200" : "bg-zinc-800/50 text-zinc-500 hover:text-zinc-300"}"
393
- data-agent="${agentId}"
394
- data-dim="${escHtml(dim)}"
395
- onclick="switchDim('${agentId}', '${escHtml(dim)}')"
396
- >${escHtml(dim)}</button>`;
397
- })
670
+ // Primary dimension selector
671
+ const dimOptions = varying
672
+ .map((dim) => `<option value="${escHtml(dim)}">${escHtml(dim)}</option>`)
398
673
  .join("\n");
674
+ const dimSelector = varying.length > 1
675
+ ? `<div class="flex items-center gap-2">
676
+ <span class="text-[10px] text-zinc-600 uppercase tracking-wider">Group by</span>
677
+ <select class="bg-zinc-800 text-zinc-300 text-xs border border-zinc-700 rounded px-2 py-1"
678
+ onchange="switchDim('${agentId}', this.value)">
679
+ ${dimOptions}
680
+ </select>
681
+ </div>`
682
+ : `<span class="text-[10px] text-zinc-600">grouped by ${escHtml(varying[0])}</span>`;
399
683
  return `
400
684
  <div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">
401
685
  <div class="flex items-center justify-between mb-5">
402
686
  <span class="text-xs text-zinc-600 uppercase tracking-wider">success rate</span>
403
- <div class="flex gap-1.5">
404
- ${tabs}
405
- </div>
687
+ ${dimSelector}
406
688
  </div>
407
689
  ${charts.join("\n")}
408
690
  </div>`;
@@ -530,28 +812,33 @@ function renderScatterPlot(group) {
530
812
  const model = r.dimensions?.["model"] ?? r.model ?? "?";
531
813
  const avgDurSec = r.totalCases > 0 ? +(r.duration / r.totalCases / 1000).toFixed(2) : 0;
532
814
  const accuracy = +(r.successRate * 100).toFixed(1);
815
+ const wilsonAccuracy = +(wilsonLowerBound(r.successRate, r.totalCases) * 100).toFixed(1);
533
816
  const configLabel = allDims
534
817
  .filter((d) => d !== "model")
535
818
  .map((d) => `${d}: ${r.dimensions?.[d] ?? "?"}`)
536
819
  .join(", ");
537
- const arr = byModel.get(model) ?? [];
538
- arr.push({ x: avgDurSec, y: accuracy, label: configLabel });
539
- byModel.set(model, arr);
820
+ const entry = byModel.get(model) ?? { raw: [], wilson: [] };
821
+ entry.raw.push({ x: avgDurSec, y: accuracy, label: configLabel });
822
+ entry.wilson.push({ x: avgDurSec, y: wilsonAccuracy, label: configLabel });
823
+ byModel.set(model, entry);
540
824
  }
541
825
  const uniqueModels = [...byModel.keys()];
542
826
  const datasets = uniqueModels.map((model, i) => {
543
827
  const color = SERIES_COLORS[i % SERIES_COLORS.length];
544
828
  const short = model.split("/").pop()?.slice(0, 24) ?? model.slice(0, 24);
829
+ const entry = byModel.get(model);
545
830
  return {
546
831
  label: short,
547
- data: byModel.get(model),
832
+ data: entry.raw,
833
+ _rawScatter: entry.raw,
834
+ _wilsonScatter: entry.wilson,
548
835
  backgroundColor: color.bg,
549
836
  borderColor: color.text,
550
837
  pointRadius: 7,
551
838
  pointHoverRadius: 9,
552
839
  };
553
840
  });
554
- const allX = [...byModel.values()].flat().map((p) => p.x);
841
+ const allX = [...byModel.values()].flatMap((e) => e.raw).map((p) => p.x);
555
842
  const midX = allX.length > 0 ? +((Math.min(...allX) + Math.max(...allX)) / 2).toFixed(2) : 0;
556
843
  const agentId = escHtml(group.label).replace(/\s+/g, "-").toLowerCase();
557
844
  const canvasId = `scatter-${agentId}`;
@@ -564,97 +851,276 @@ function renderScatterPlot(group) {
564
851
  <canvas id="${canvasId}"></canvas>
565
852
  </div>
566
853
  <script>
567
- new Chart(document.getElementById('${canvasId}'), {
568
- type: 'scatter',
569
- data: { datasets: ${JSON.stringify(datasets)} },
570
- options: {
571
- responsive: true,
572
- maintainAspectRatio: false,
573
- plugins: {
574
- legend: { labels: { color: '#a1a1aa', font: { family: 'ui-monospace, monospace', size: 10 }, boxWidth: 12, padding: 16 } },
575
- tooltip: {
576
- callbacks: {
577
- label: function(ctx) {
578
- var p = ctx.raw;
579
- var lines = [ctx.dataset.label + ': ' + p.y + '% accuracy, ' + p.x.toFixed(1) + 's/case'];
580
- if (p.label) lines.push(p.label);
581
- return lines;
854
+ (function() {
855
+ var chart = new Chart(document.getElementById('${canvasId}'), {
856
+ type: 'scatter',
857
+ data: { datasets: ${JSON.stringify(datasets)} },
858
+ options: {
859
+ responsive: true,
860
+ maintainAspectRatio: false,
861
+ plugins: {
862
+ legend: { labels: { color: '#a1a1aa', font: { family: 'ui-monospace, monospace', size: 10 }, boxWidth: 12, padding: 16 } },
863
+ tooltip: {
864
+ callbacks: {
865
+ label: function(ctx) {
866
+ var p = ctx.raw;
867
+ var lines = [ctx.dataset.label + ': ' + p.y + '% accuracy, ' + p.x.toFixed(1) + 's/case'];
868
+ if (p.label) lines.push(p.label);
869
+ return lines;
870
+ }
582
871
  }
583
872
  }
584
- }
585
- },
586
- scales: {
587
- x: {
588
- title: { display: true, text: 'avg duration per case (s)', color: '#71717a', font: { family: 'ui-monospace, monospace', size: 11 } },
589
- ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 } },
590
- grid: { color: '#27272a' }
591
873
  },
592
- y: {
593
- min: 0, max: 100,
594
- title: { display: true, text: 'accuracy (%)', color: '#71717a', font: { family: 'ui-monospace, monospace', size: 11 } },
595
- ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 }, callback: function(v) { return v + '%'; } },
596
- grid: { color: '#27272a' }
874
+ scales: {
875
+ x: {
876
+ title: { display: true, text: 'avg duration per case (s)', color: '#71717a', font: { family: 'ui-monospace, monospace', size: 11 } },
877
+ ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 } },
878
+ grid: { color: '#27272a' }
879
+ },
880
+ y: {
881
+ min: 0, max: 100,
882
+ title: { display: true, text: 'accuracy (%)', color: '#71717a', font: { family: 'ui-monospace, monospace', size: 11 } },
883
+ ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 }, callback: function(v) { return v + '%'; } },
884
+ grid: { color: '#27272a' }
885
+ }
597
886
  }
598
- }
599
- },
600
- plugins: [{
601
- id: 'quadrantLines',
602
- afterDraw: function(chart) {
603
- var ctx = chart.ctx;
604
- var area = chart.chartArea;
605
- var xScale = chart.scales.x;
606
- var yScale = chart.scales.y;
607
- var midXPx = xScale.getPixelForValue(${midX});
608
- var midYPx = yScale.getPixelForValue(50);
887
+ },
888
+ plugins: [{
889
+ id: 'quadrantLines',
890
+ afterDraw: function(chart) {
891
+ var ctx = chart.ctx;
892
+ var area = chart.chartArea;
893
+ var xScale = chart.scales.x;
894
+ var yScale = chart.scales.y;
895
+ var midXPx = xScale.getPixelForValue(${midX});
896
+ var midYPx = yScale.getPixelForValue(50);
609
897
 
610
- ctx.save();
611
- ctx.setLineDash([6, 4]);
612
- ctx.lineWidth = 1;
613
- ctx.strokeStyle = 'rgba(113, 113, 122, 0.4)';
898
+ ctx.save();
899
+ ctx.setLineDash([6, 4]);
900
+ ctx.lineWidth = 1;
901
+ ctx.strokeStyle = 'rgba(113, 113, 122, 0.4)';
614
902
 
615
- ctx.beginPath();
616
- ctx.moveTo(midXPx, area.top);
617
- ctx.lineTo(midXPx, area.bottom);
618
- ctx.stroke();
903
+ ctx.beginPath();
904
+ ctx.moveTo(midXPx, area.top);
905
+ ctx.lineTo(midXPx, area.bottom);
906
+ ctx.stroke();
619
907
 
620
- ctx.beginPath();
621
- ctx.moveTo(area.left, midYPx);
622
- ctx.lineTo(area.right, midYPx);
623
- ctx.stroke();
908
+ ctx.beginPath();
909
+ ctx.moveTo(area.left, midYPx);
910
+ ctx.lineTo(area.right, midYPx);
911
+ ctx.stroke();
624
912
 
625
- ctx.setLineDash([]);
626
- ctx.font = '10px ui-monospace, monospace';
627
- ctx.fillStyle = 'rgba(113, 113, 122, 0.5)';
913
+ ctx.setLineDash([]);
914
+ ctx.font = '10px ui-monospace, monospace';
915
+ ctx.fillStyle = 'rgba(113, 113, 122, 0.5)';
628
916
 
629
- ctx.textAlign = 'left';
630
- ctx.textBaseline = 'top';
631
- ctx.fillText('Ideal', area.left + 8, area.top + 8);
917
+ ctx.textAlign = 'left';
918
+ ctx.textBaseline = 'top';
919
+ ctx.fillText('Ideal', area.left + 8, area.top + 8);
632
920
 
633
- ctx.textAlign = 'right';
634
- ctx.fillText('Smart but slow', area.right - 8, area.top + 8);
921
+ ctx.textAlign = 'right';
922
+ ctx.fillText('Smart but slow', area.right - 8, area.top + 8);
635
923
 
636
- ctx.textBaseline = 'bottom';
637
- ctx.fillText('Dumb and slow', area.right - 8, area.bottom - 8);
924
+ ctx.textBaseline = 'bottom';
925
+ ctx.fillText('Dumb and slow', area.right - 8, area.bottom - 8);
638
926
 
639
- ctx.textAlign = 'left';
640
- ctx.fillText('Dumb and fast', area.left + 8, area.bottom - 8);
927
+ ctx.textAlign = 'left';
928
+ ctx.fillText('Dumb and fast', area.left + 8, area.bottom - 8);
641
929
 
642
- ctx.restore();
643
- }
644
- }]
645
- });
930
+ ctx.restore();
931
+ }
932
+ }]
933
+ });
934
+ window.__agestCharts['${canvasId}'] = chart;
935
+ })();
646
936
  </script>
647
937
  </div>`;
648
938
  }
939
/**
 * Render the summary card for a single run: headline success rate,
 * passed/total count, duration, a metadata grid (model, timestamp, optional
 * average token counts, optional tools) and, when there are any, a table of
 * failed cases.
 *
 * @param {object} report - Parsed run report. Reads `successRate`,
 *   `totalCases`, `failedCasesCount`, `duration`, `model`, `timestamp`, and
 *   optionally `failedCases`, `averageInputTokensPerCase`,
 *   `averageOutputTokensPerCase` and `tools`.
 * @returns {string} HTML for the card.
 */
function renderSingleRun(report) {
    const pct = (report.successRate * 100).toFixed(0);
    const passed = report.totalCases - report.failedCasesCount;
    const color = rateClass(report.successRate);
    const dur = formatDuration(report.duration);
    // A report without a failedCases array is treated as having no failures
    // rather than crashing the whole page render.
    const failedCases = report.failedCases ?? [];
    const failedRows = failedCases
        .map((fc) => `
        <tr class="border-t border-zinc-800/50">
          <td class="py-2 pr-4 text-zinc-300 text-xs">${escHtml(fc.prompt)}</td>
          <td class="py-2 text-zinc-500 text-xs">${escHtml(fc.reason ?? "")}</td>
        </tr>`)
        .join("");
    const failedSection = failedCases.length > 0
        ? `
      <div class="mt-6">
        <h4 class="text-xs text-zinc-500 uppercase tracking-widest mb-2">Failed Cases</h4>
        <table class="w-full text-left">
          <thead><tr class="text-zinc-600 text-xs">
            <th class="pb-1 pr-4">Prompt</th>
            <th class="pb-1">Reason</th>
          </tr></thead>
          <tbody>${failedRows}</tbody>
        </table>
      </div>`
        : "";
    // The timestamp comes from the report file like every other field here,
    // so it is escaped too. NOTE(review): assumes formatTimestamp returns
    // plain text, not markup — confirm.
    const timestamp = escHtml(formatTimestamp(report.timestamp));
    const inputTokens = report.averageInputTokensPerCase != null
        ? `<div>
          <span class="text-zinc-500">Avg Input Tokens</span>
          <p class="text-zinc-300">${Math.round(report.averageInputTokensPerCase)}</p>
        </div>`
        : "";
    const outputTokens = report.averageOutputTokensPerCase != null
        ? `<div>
          <span class="text-zinc-500">Avg Output Tokens</span>
          <p class="text-zinc-300">${Math.round(report.averageOutputTokensPerCase)}</p>
        </div>`
        : "";
    const tools = report.tools && report.tools.length > 0
        ? `<div>
          <span class="text-zinc-500">Tools</span>
          <p class="text-zinc-300">${escHtml(report.tools.join(", "))}</p>
        </div>`
        : "";
    return `
    <div class="bg-zinc-900/50 rounded-lg border border-zinc-800 p-6">
      <div class="flex items-baseline gap-6 mb-4">
        <span class="${color} text-3xl font-bold">${pct}%</span>
        <span class="text-zinc-500 text-sm">${passed}/${report.totalCases} passed</span>
        <span class="text-zinc-600 text-sm">${dur}</span>
      </div>

      <div class="grid grid-cols-2 gap-4 text-sm">
        <div>
          <span class="text-zinc-500">Model</span>
          <p class="text-zinc-300">${escHtml(report.model)}</p>
        </div>
        <div>
          <span class="text-zinc-500">Timestamp</span>
          <p class="text-zinc-300">${timestamp}</p>
        </div>
        ${inputTokens}
        ${outputTokens}
        ${tools}
      </div>
      ${failedSection}
    </div>`;
}
/**
 * Render the "failed cases" panel for an agent group.
 *
 * Failures are collected from every run's top-level `failedCases` and from
 * each suite's `failedCases`. A suite-level failure whose prompt already
 * appeared earlier in the SAME run is merged into that entry (adding the suite
 * name and, if missing, the response) instead of being listed twice. Entries
 * are then grouped by suite, in first-seen order, and each is rendered as a
 * collapsible `<details>` element.
 *
 * De-duplication is scoped to a single run on purpose: two runs that share
 * identical dimensions are distinct executions, and their failures must each
 * be counted and must each receive their own suite name.
 *
 * @param {object} group - Agent group; reads `group.runs[].report`.
 * @returns {string} HTML fragment, or "" when no run has a failure.
 */
function renderDebugPanel(group) {
    const failures = [];
    for (const run of group.runs) {
        const report = run.report;
        // Compact "dim:value" tags giving each failure its run context.
        const dimTags = Object.entries(report.dimensions ?? {})
            .map(([name, value]) => {
                // Stringify first so non-string values truncate predictably.
                const text = String(value);
                const short = text.length > 20 ? text.slice(0, 19) + "…" : text;
                return `${name}:${short}`;
            })
            .join(" ");
        // First entry per prompt within this run; used to merge suite-level
        // duplicates without rescanning the whole failure list.
        const firstByPrompt = new Map();
        // Top-level failed cases are always listed.
        for (const fc of report.failedCases ?? []) {
            const entry = { prompt: fc.prompt, reason: fc.reason, response: fc.response, dims: dimTags };
            failures.push(entry);
            if (!firstByPrompt.has(fc.prompt)) {
                firstByPrompt.set(fc.prompt, entry);
            }
        }
        // Suite-level failed cases may repeat the top-level ones.
        for (const suite of report.suites ?? []) {
            for (const fc of suite.failedCases ?? []) {
                const existing = firstByPrompt.get(fc.prompt);
                if (!existing) {
                    const entry = {
                        prompt: fc.prompt,
                        reason: fc.reason,
                        response: fc.response,
                        suite: suite.name,
                        dims: dimTags,
                    };
                    failures.push(entry);
                    firstByPrompt.set(fc.prompt, entry);
                    continue;
                }
                // Enrich the existing entry with the suite name.
                if (!existing.suite) {
                    existing.suite = suite.name;
                }
                // Enrich with the response if the top-level record lacked one.
                if (!existing.response && fc.response) {
                    existing.response = fc.response;
                }
            }
        }
    }
    if (failures.length === 0) {
        return "";
    }
    // Group by suite; a Map keeps first-seen order.
    const bySuite = new Map();
    for (const failure of failures) {
        const key = failure.suite ?? "__none__";
        if (!bySuite.has(key)) {
            bySuite.set(key, []);
        }
        bySuite.get(key).push(failure);
    }
    const renderFailure = (f) => {
        const promptShort = f.prompt.length > 70 ? f.prompt.slice(0, 67) + "…" : f.prompt;
        const reasonShort = f.reason
            ? `<span class="text-red-400/60 text-[10px] ml-2">${escHtml(f.reason.length > 50 ? f.reason.slice(0, 47) + "…" : f.reason)}</span>`
            : "";
        // Escape first, then turn newlines into <br> so line breaks survive.
        const responseHtml = f.response
            ? escHtml(f.response).replace(/\n/g, "<br>")
            : `<span class="text-zinc-700">no response captured</span>`;
        return `
        <details class="border-t border-zinc-800/50">
          <summary class="py-2.5 cursor-pointer select-none hover:bg-zinc-800/30 rounded px-2 -mx-2 flex items-center gap-2">
            <span class="text-red-400 text-xs shrink-0">FAIL</span>
            <span class="text-xs text-zinc-300 truncate flex-1">${escHtml(promptShort)}</span>
            ${reasonShort}
            <span class="text-[10px] text-zinc-700">${escHtml(f.dims)}</span>
          </summary>
          <div class="pb-3 px-2 -mx-2 space-y-2">
            <div>
              <div class="text-[10px] text-zinc-600 uppercase mb-1">Input</div>
              <div class="text-xs text-zinc-300 bg-zinc-800/50 rounded px-3 py-2">${escHtml(f.prompt)}</div>
            </div>
            <div>
              <div class="text-[10px] text-zinc-600 uppercase mb-1">Output</div>
              <div class="text-xs text-zinc-400 bg-zinc-800/50 rounded px-3 py-2 max-h-48 overflow-y-auto">${responseHtml}</div>
            </div>
            ${f.reason ? `<div><div class="text-[10px] text-zinc-600 uppercase mb-1">Reason</div><div class="text-xs text-red-400/80">${escHtml(f.reason)}</div></div>` : ""}
          </div>
        </details>`;
    };
    const rows = [...bySuite.entries()].map(([key, items]) => {
        const label = key === "__none__" ? "no suite" : key;
        return `
      <div class="mb-3 last:mb-0">
        <div class="flex items-center gap-2 mb-1">
          <span class="text-[10px] text-zinc-500 uppercase tracking-wider font-medium">${escHtml(label)}</span>
          <span class="text-[10px] text-zinc-700">${items.length}</span>
        </div>
        <div class="pl-2 border-l border-zinc-800">
          ${items.map(renderFailure).join("")}
        </div>
      </div>`;
    }).join("");
    return `
  <div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">
    <div class="flex items-center justify-between mb-4">
      <span class="text-xs text-zinc-600 uppercase tracking-wider">failed cases</span>
      <span class="text-xs text-zinc-600">${failures.length} failure${failures.length !== 1 ? "s" : ""}</span>
    </div>
    <div>
      ${rows}
    </div>
  </div>`;
}
/**
 * Build the full `<section>` for one agent group.
 *
 * The section contains the group heading followed by the grouped bar chart,
 * the scatter plot, the radar chart, an optional single-run summary and the
 * failed-cases panel. The single-run summary is rendered, from the first
 * run's report, only when none of the three charts produced any markup and
 * the group has at least one run.
 *
 * @param {object} group - Agent group; reads `group.label` and `group.runs`.
 * @returns {string} HTML for the section.
 */
function renderAgentSection(group) {
    // Renderers are invoked in the same order as before, in case any of them
    // depends on call order.
    const radarHtml = renderRadarChart(group);
    const chartHtml = renderGroupedBarChart(group);
    const scatterHtml = renderScatterPlot(group);
    const debugHtml = renderDebugPanel(group);

    // When there are no comparative charts, fall back to a single-run card.
    const hasAnyChart = Boolean(chartHtml || scatterHtml || radarHtml);
    let singleRunHtml = "";
    if (!hasAnyChart && group.runs.length > 0) {
        singleRunHtml = renderSingleRun(group.runs[0].report);
    }

    return `
  <section class="mb-12">
    <h2 class="text-base font-semibold mb-4 text-zinc-400 uppercase tracking-widest">${escHtml(group.label)}</h2>

    ${chartHtml}
    ${scatterHtml}
    ${radarHtml}
    ${singleRunHtml}
    ${debugHtml}
  </section>`;
}
660
1126
  // ---------------------------------------------------------------------------
@@ -672,16 +1138,57 @@ function generateHTML(groups, totalReports) {
672
1138
  <script src="https://cdn.tailwindcss.com"></script>
673
1139
  <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
674
1140
  <script>
1141
+ window.__agestCharts = {};
1142
+ window.__agestWilson = false;
1143
+
1144
+ function toggleWilson() {
1145
+ window.__agestWilson = !window.__agestWilson;
1146
+ var useWilson = window.__agestWilson;
1147
+ var btn = document.getElementById('wilson-toggle');
1148
+ if (btn) {
1149
+ btn.className = useWilson
1150
+ ? 'px-3 py-1 text-xs rounded-full border transition-colors bg-violet-600 border-violet-500 text-violet-100'
1151
+ : 'px-3 py-1 text-xs rounded-full border transition-colors bg-zinc-800/50 border-zinc-700 text-zinc-500 hover:text-zinc-300';
1152
+ btn.textContent = useWilson ? 'Wilson CI (95%)' : 'Raw';
1153
+ }
1154
+ // Update all Chart.js instances
1155
+ Object.values(window.__agestCharts).forEach(function(chart) {
1156
+ chart.data.datasets.forEach(function(ds) {
1157
+ if (ds._rawData && ds._wilsonData) {
1158
+ ds.data = useWilson ? ds._wilsonData : ds._rawData;
1159
+ }
1160
+ // Scatter plot: swap y values
1161
+ if (ds._rawScatter && ds._wilsonScatter) {
1162
+ ds.data = useWilson ? ds._wilsonScatter : ds._rawScatter;
1163
+ }
1164
+ });
1165
+ chart.update();
1166
+ });
1167
+ // Update matrix view cells
1168
+ document.querySelectorAll('[data-wilson]').forEach(function(el) {
1169
+ el.textContent = useWilson ? el.getAttribute('data-wilson') : el.getAttribute('data-raw');
1170
+ });
1171
+ }
1172
+
1173
+ function switchView(agent, dim) {
1174
+ var barEl = document.querySelector('.bar-view[data-agent="' + agent + '"][data-dim="' + dim + '"]');
1175
+ var tableEl = document.querySelector('.table-view[data-agent="' + agent + '"][data-dim="' + dim + '"]');
1176
+ var btn = document.querySelector('.view-toggle[data-agent="' + agent + '"][data-dim="' + dim + '"]');
1177
+ if (!barEl || !tableEl || !btn) return;
1178
+ var showingTable = tableEl.style.display !== 'none';
1179
+ barEl.style.display = showingTable ? 'block' : 'none';
1180
+ tableEl.style.display = showingTable ? 'none' : 'block';
1181
+ btn.textContent = showingTable ? 'Table' : 'Chart';
1182
+ }
1183
+
675
1184
  function switchDim(agent, dim) {
676
1185
  document.querySelectorAll('.chart-view[data-agent="' + agent + '"]').forEach(el => {
677
1186
  el.style.display = el.dataset.dim === dim ? 'block' : 'none';
678
1187
  });
679
- document.querySelectorAll('.dim-tab[data-agent="' + agent + '"]').forEach(el => {
680
- if (el.dataset.dim === dim) {
681
- el.className = el.className.replace('bg-zinc-800/50 text-zinc-500', 'bg-zinc-700 text-zinc-200');
682
- } else {
683
- el.className = el.className.replace('bg-zinc-700 text-zinc-200', 'bg-zinc-800/50 text-zinc-500');
684
- }
1188
+ }
1189
+ function filterRadarModel(agent, model) {
1190
+ document.querySelectorAll('.radar-model-view[data-agent="' + agent + '"]').forEach(el => {
1191
+ el.style.display = el.dataset.model === model ? 'block' : 'none';
685
1192
  });
686
1193
  }
687
1194
  </script>
@@ -691,7 +1198,11 @@ function generateHTML(groups, totalReports) {
691
1198
 
692
1199
  <header class="mb-10">
693
1200
  <h1 class="text-2xl font-bold tracking-tight">agest</h1>
694
- <p class="text-zinc-500 text-sm mt-1">${totalReports} report${totalReports !== 1 ? "s" : ""} &middot; generated ${generated}</p>
1201
+ <div class="flex items-center gap-3 mt-1">
1202
+ <p class="text-zinc-500 text-sm">${totalReports} report${totalReports !== 1 ? "s" : ""} &middot; generated ${generated}</p>
1203
+ <button id="wilson-toggle" onclick="toggleWilson()" title="Wilson score lower bound (95% CI) — adjusts for sample size"
1204
+ class="px-3 py-1 text-xs rounded-full border transition-colors bg-zinc-800/50 border-zinc-700 text-zinc-500 hover:text-zinc-300">Raw</button>
1205
+ </div>
695
1206
  </header>
696
1207
 
697
1208
  ${sections}
@@ -763,6 +1274,7 @@ async function main() {
763
1274
  runs,
764
1275
  varyingDims,
765
1276
  controlledPairs,
1277
+ diffEntries,
766
1278
  };
767
1279
  }));
768
1280
  const html = generateHTML(groups, reports.length);
@@ -771,7 +1283,4 @@ async function main() {
771
1283
  console.log(`\n Preview: ${tmpPath}\n`);
772
1284
  openBrowser(tmpPath);
773
1285
  }
774
- main().catch((err) => {
775
- console.error("Error:", err.message);
776
- process.exit(1);
777
- });
1286
+ export { main };