@sebastiantuyu/agest 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +3 -1
- package/dist/context.d.ts +14 -1
- package/dist/context.js +95 -11
- package/dist/index.d.ts +7 -2
- package/dist/index.js +26 -1
- package/dist/preview.js +670 -158
- package/dist/reporter.js +46 -0
- package/dist/reports.d.ts +20 -0
- package/dist/reports.js +99 -3
- package/dist/runner.js +69 -14
- package/dist/stats.js +33 -1
- package/dist/types.d.ts +14 -0
- package/package.json +1 -1
package/dist/preview.js
CHANGED
|
@@ -2,7 +2,7 @@ import { readFile, writeFile } from "fs/promises";
|
|
|
2
2
|
import { join, relative } from "path";
|
|
3
3
|
import os from "os";
|
|
4
4
|
import { exec } from "child_process";
|
|
5
|
-
import { parseReport, findReports, loadDiffEntry, computeDiff, ensureDimensions, findVaryingDimensions, groupByDimension, findControlledPairs, diffConfigs, } from "./reports.js";
|
|
5
|
+
import { parseReport, findReports, loadDiffEntry, wilsonLowerBound, computeDiff, formatDuration, ensureDimensions, findVaryingDimensions, groupByDimension, findControlledPairs, diffConfigs, } from "./reports.js";
|
|
6
6
|
// ---------------------------------------------------------------------------
|
|
7
7
|
// Helpers
|
|
8
8
|
// ---------------------------------------------------------------------------
|
|
@@ -59,6 +59,109 @@ function formatTimestamp(ts) {
|
|
|
59
59
|
}
|
|
60
60
|
}
|
|
61
61
|
// ---------------------------------------------------------------------------
|
|
62
|
+
// Smart dimension labels
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
/**
 * Build human-readable labels for each unique value of each dimension.
 *
 * Labelling rules per dimension name:
 * - "model":  values longer than 16 chars are shortened to the segment after
 *             the last "/", capped at 16 chars; shorter values are kept as-is.
 * - "tools":  "none" becomes "no tools"; one or two comma-separated names are
 *             listed; three or more collapse to "<count> tools".
 * - "prompt": the first unique value is labelled with the first non-blank
 *             line of its system prompt (or "baseline"); later values are
 *             labelled with up to two change snippets from computeDiff()
 *             against the previous unique value, falling back to "v<N>".
 * - anything else: "v1", "v2", ... in order of first appearance.
 *
 * A missing dimension value is treated as "?".
 *
 * @param {Array<{dimensions?: Object}>} sorted - Reports in the order labels
 *   should be assigned (the visible caller sorts them by timestamp).
 * @param {Iterable<string>} allDims - Dimension names to label.
 * @param {Array<Object|undefined>} [diffEntries] - diffEntries[i] is read as
 *   the diff entry for sorted[i]; keeping the two aligned is the caller's
 *   responsibility. May be omitted, in which case prompt labels fall back to
 *   "baseline" / "v<N>".
 * @returns {Map<string, Map<*, string>>} dimension name -> (raw value -> label).
 */
function buildSmartLabels(sorted, allDims, diffEntries) {
    // Tolerate groups without recorded diff entries instead of throwing on
    // `diffEntries[i]`.
    const entries = diffEntries ?? [];
    const valueOf = (report, dim) => report.dimensions?.[dim] ?? "?";
    const labels = new Map();
    for (const dim of allDims) {
        const dimLabels = new Map();
        if (dim === "model") {
            for (const report of sorted) {
                const val = valueOf(report, dim);
                if (dimLabels.has(val)) {
                    continue;
                }
                if (val.length > 16) {
                    // A value ending in "/" has an empty last segment; fall
                    // back to the full value rather than an empty label.
                    const tail = val.split("/").pop();
                    dimLabels.set(val, (tail || val).slice(0, 16));
                }
                else {
                    dimLabels.set(val, val);
                }
            }
        }
        else if (dim === "tools") {
            for (const report of sorted) {
                const val = valueOf(report, dim);
                if (dimLabels.has(val)) {
                    continue;
                }
                if (val === "none") {
                    dimLabels.set(val, "no tools");
                    continue;
                }
                const toolList = val.split(",");
                dimLabels.set(val, toolList.length <= 2 ? toolList.join(", ") : `${toolList.length} tools`);
            }
        }
        else if (dim === "prompt") {
            // Collect unique prompt hashes in first-seen order, and remember
            // the first available diff entry for each hash (not only the one
            // attached to the hash's first run).
            const uniqueHashes = [];
            const seenHashes = new Set();
            const hashToDiff = new Map();
            for (let i = 0; i < sorted.length; i++) {
                const val = valueOf(sorted[i], dim);
                if (!seenHashes.has(val)) {
                    seenHashes.add(val);
                    uniqueHashes.push(val);
                }
                const diff = entries[i];
                if (diff && !hashToDiff.has(val)) {
                    hashToDiff.set(val, diff);
                }
            }
            for (let j = 0; j < uniqueHashes.length; j++) {
                const hash = uniqueHashes[j];
                const diff = hashToDiff.get(hash);
                if (j === 0) {
                    // First prompt: truncated first non-blank line, or "baseline".
                    if (diff?.systemPrompt) {
                        const firstLine = (diff.systemPrompt.split("\n").find((l) => l.trim()) ?? "").trim();
                        dimLabels.set(hash, firstLine.length > 28 ? firstLine.slice(0, 27) + "…" : firstLine || "baseline");
                    }
                    else {
                        dimLabels.set(hash, "baseline");
                    }
                    continue;
                }
                // Subsequent prompts: describe what changed vs the previous one.
                const prevDiff = hashToDiff.get(uniqueHashes[j - 1]);
                const fallback = `v${j + 1}`;
                if (!diff || !prevDiff) {
                    dimLabels.set(hash, fallback);
                    continue;
                }
                const changes = computeDiff(prevDiff, diff);
                const promptChanges = changes
                    .filter((l) => l.startsWith("prompt:"))
                    .map((l) => l.replace(/^prompt:\s*/, "").slice(0, 30));
                const toolChanges = changes
                    .filter((l) => l.startsWith("tools:"))
                    .map((l) => l.replace(/^tools:\s*/, "").slice(0, 30));
                const snippets = [...promptChanges, ...toolChanges].slice(0, 2);
                dimLabels.set(hash, snippets.length > 0 ? snippets.join(", ") : fallback);
            }
        }
        else {
            // Generic fallback: version numbering in order of first appearance.
            let idx = 1;
            for (const report of sorted) {
                const val = valueOf(report, dim);
                if (!dimLabels.has(val)) {
                    dimLabels.set(val, `v${idx}`);
                    idx++;
                }
            }
        }
        labels.set(dim, dimLabels);
    }
    return labels;
}
|
|
164
|
+
// ---------------------------------------------------------------------------
|
|
62
165
|
// Rendering
|
|
63
166
|
// ---------------------------------------------------------------------------
|
|
64
167
|
function renderTools(tools) {
|
|
@@ -133,6 +236,195 @@ function renderRunRow(entry, idx) {
|
|
|
133
236
|
</div>`;
|
|
134
237
|
}
|
|
135
238
|
// ---------------------------------------------------------------------------
|
|
239
|
+
// Radar Chart (suite breakdown)
|
|
240
|
+
// ---------------------------------------------------------------------------
|
|
241
|
+
/**
 * Colour palette for radar datasets; callers cycle through it by index.
 * Each entry pairs a solid `border` hex colour with a `fill` that is the
 * same colour expressed as rgba() at 0.15 alpha.
 */
const RADAR_COLORS = [
    "#f87171",
    "#38bdf8",
    "#4ade80",
    "#facc15",
    "#a78bfa",
    "#fb923c",
    "#f472b6",
    "#2dd4bf",
].map((hex) => {
    // Decode the #rrggbb channels so the fill is derived from the border.
    const [red, green, blue] = [1, 3, 5].map((at) => Number.parseInt(hex.slice(at, at + 2), 16));
    return { border: hex, fill: `rgba(${red},${green},${blue},0.15)` };
});
|
|
251
|
+
/**
 * Serialize a value as JSON that is safe to inline inside a <script> element.
 * Every "<" is emitted as the JSON escape \u003c so that data containing
 * "</script>" or "<!--" cannot terminate or corrupt the surrounding script.
 */
function radarScriptJson(value) {
    return JSON.stringify(value).replace(/</g, "\\u003c");
}
/**
 * Build one Chart.js radar dataset for a model from a single report.
 * Suites the report does not contain are plotted as 0.
 */
function buildRadarDataset(model, report, suiteNames, color) {
    // First occurrence wins, matching a linear search over report.suites.
    const suiteByName = new Map();
    for (const suite of report.suites) {
        if (!suiteByName.has(suite.name)) {
            suiteByName.set(suite.name, suite);
        }
    }
    const toPercent = (fraction) => +(fraction * 100).toFixed(1);
    const rawData = suiteNames.map((name) => {
        const suite = suiteByName.get(name);
        return suite ? toPercent(suite.successRate) : 0;
    });
    const wilsonData = suiteNames.map((name) => {
        const suite = suiteByName.get(name);
        return suite ? toPercent(wilsonLowerBound(suite.successRate, suite.totalCases)) : 0;
    });
    // A model id ending in "/" has an empty last segment; use the full id then.
    const tail = model.split("/").pop();
    return {
        label: (tail || model).slice(0, 24),
        data: rawData,
        _rawData: rawData,
        _wilsonData: wilsonData,
        borderColor: color.border,
        backgroundColor: color.fill,
        pointBackgroundColor: color.border,
        pointBorderColor: "#18181b",
        pointRadius: 4,
        borderWidth: 2,
    };
}
/**
 * Render one radar view: a canvas plus the inline script that creates the
 * chart and stores it in window.__agestCharts under the canvas id.
 */
function renderRadarCanvasView(agentId, modelAttr, canvasId, visible, suiteNames, datasets, radarOptions) {
    return `
    <div class="radar-model-view" data-agent="${agentId}" data-model="${modelAttr}" style="display:${visible ? "block" : "none"}">
      <div style="position:relative;height:400px">
        <canvas id="${canvasId}"></canvas>
      </div>
      <script>
        (function() {
          var chart = new Chart(document.getElementById('${canvasId}'), {
            type: 'radar',
            data: {
              labels: ${radarScriptJson(suiteNames)},
              datasets: ${radarScriptJson(datasets)}
            },
            options: ${radarOptions}
          });
          window.__agestCharts['${canvasId}'] = chart;
        })();
      </script>
    </div>`;
}
/**
 * Render the "suite breakdown" radar card for one agent group.
 *
 * One dataset is drawn per model, using the last report listed for that
 * model in group.runs (assumes group.runs is in chronological order — TODO
 * confirm against the caller). When more than one model is present, a hidden
 * per-model view and a model <select> (wired to the page-level
 * filterRadarModel function) are emitted as well.
 *
 * @param {{label: string, runs: Array<{report: Object}>}} group - Agent group;
 *   each report may carry `model` and `suites` ({name, successRate, totalCases}).
 * @returns {string} HTML for the card, or "" when no report has suites or
 *   fewer than three distinct suite names exist (a radar needs >= 3 axes).
 */
function renderRadarChart(group) {
    const reportsWithSuites = group.runs
        .map((run) => run.report)
        .filter((report) => report.suites && report.suites.length > 0);
    if (reportsWithSuites.length === 0) {
        return "";
    }
    // Collect all unique suite names; these become the radar axes.
    const allSuiteNames = [
        ...new Set(reportsWithSuites.flatMap((report) => report.suites.map((s) => s.name))),
    ];
    if (allSuiteNames.length < 3) {
        return ""; // Radar needs at least 3 axes
    }
    // Group by model — each model gets its own dataset.
    const byModel = new Map();
    for (const report of reportsWithSuites) {
        const model = report.model ?? "unknown";
        const bucket = byModel.get(model) ?? [];
        bucket.push(report);
        byModel.set(model, bucket);
    }
    const agentId = escHtml(group.label).replace(/\s+/g, "-").toLowerCase();
    const modelEntries = [...byModel.entries()];
    const allModels = modelEntries.map(([model]) => model);
    const showToggle = allModels.length > 1;
    // One dataset per model, shared by the combined view and per-model views.
    const allDatasets = modelEntries.map(([model, reports], i) => buildRadarDataset(model, reports[reports.length - 1], allSuiteNames, RADAR_COLORS[i % RADAR_COLORS.length]));
    // Emitted verbatim as a JS object literal (it contains callbacks, so it
    // cannot go through JSON).
    const radarOptions = `{
            responsive: true,
            maintainAspectRatio: false,
            plugins: {
              legend: {
                labels: {
                  color: '#a1a1aa',
                  font: { family: 'ui-monospace, monospace', size: 10 },
                  boxWidth: 12,
                  padding: 16
                }
              },
              tooltip: {
                callbacks: {
                  label: function(ctx) { return ctx.dataset.label + ': ' + ctx.parsed.r + '%'; }
                }
              }
            },
            scales: {
              r: {
                min: 0,
                max: 100,
                ticks: {
                  color: '#71717a',
                  backdropColor: 'transparent',
                  font: { family: 'ui-monospace, monospace', size: 9 },
                  callback: function(v) { return v + '%'; }
                },
                pointLabels: {
                  color: '#a1a1aa',
                  font: { family: 'ui-monospace, monospace', size: 11 }
                },
                grid: { color: '#27272a' },
                angleLines: { color: '#27272a' }
              }
            }
          }`;
    // Combined "all models" view, visible by default.
    let canvasesHtml = renderRadarCanvasView(agentId, "__all__", `radar-all-${agentId}`, true, allSuiteNames, allDatasets, radarOptions);
    // Per-model views (hidden by default), only when there is a choice.
    if (showToggle) {
        modelEntries.forEach(([model], i) => {
            canvasesHtml += renderRadarCanvasView(agentId, escHtml(model), `radar-${agentId}-${i}`, false, allSuiteNames, [allDatasets[i]], radarOptions);
        });
    }
    // Model selector dropdown (only when multiple models).
    const modelSelector = showToggle
        ? `<select class="radar-model-select bg-zinc-800 text-zinc-300 text-xs border border-zinc-700 rounded px-2 py-1"
          data-agent="${agentId}"
          onchange="filterRadarModel('${agentId}', this.value)">
          <option value="__all__">All Models</option>
          ${allModels.map((m) => `<option value="${escHtml(m)}">${escHtml((m.split("/").pop() || m).slice(0, 30))}</option>`).join("\n")}
        </select>`
        : "";
    return `
    <div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">
      <div class="flex items-center justify-between mb-4">
        <span class="text-xs text-zinc-600 uppercase tracking-wider">suite breakdown</span>
        ${modelSelector}
      </div>
      ${canvasesHtml}
    </div>`;
}
|
|
427
|
+
// ---------------------------------------------------------------------------
|
|
136
428
|
// Grouped Bar Chart (benchmark-style)
|
|
137
429
|
// ---------------------------------------------------------------------------
|
|
138
430
|
const SERIES_COLORS = [
|
|
@@ -145,7 +437,7 @@ const SERIES_COLORS = [
|
|
|
145
437
|
{ bg: "#f472b6", text: "#f9a8d4" }, // pink
|
|
146
438
|
{ bg: "#2dd4bf", text: "#5eead4" }, // teal
|
|
147
439
|
];
|
|
148
|
-
function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel
|
|
440
|
+
function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel) {
|
|
149
441
|
const otherDims = allDims.filter((d) => d !== groupDim);
|
|
150
442
|
// Column dimension: prefer "model", else first other dim
|
|
151
443
|
const colDim = otherDims.includes("model") ? "model" : otherDims[0];
|
|
@@ -202,6 +494,7 @@ function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agen
|
|
|
202
494
|
return `<td class="px-4 py-2"><span class="text-xs text-zinc-700">—</span></td>`;
|
|
203
495
|
}
|
|
204
496
|
const pct = r.successRate * 100;
|
|
497
|
+
const wilsonPct = wilsonLowerBound(r.successRate, r.totalCases) * 100;
|
|
205
498
|
const color = barColor(r.successRate);
|
|
206
499
|
const tc = rateClass(r.successRate);
|
|
207
500
|
return `<td class="px-4 py-2">
|
|
@@ -209,7 +502,7 @@ function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agen
|
|
|
209
502
|
<div class="flex-1 bg-zinc-800 rounded h-2 overflow-hidden" style="min-width:80px">
|
|
210
503
|
<div class="h-2 rounded" style="width:${pct.toFixed(1)}%;background:${color}"></div>
|
|
211
504
|
</div>
|
|
212
|
-
<span class="text-sm font-medium ${tc} w-12 text-right">${pct.toFixed(0)}%</span>
|
|
505
|
+
<span class="text-sm font-medium ${tc} w-12 text-right" data-raw="${pct.toFixed(0)}%" data-wilson="${wilsonPct.toFixed(0)}%">${pct.toFixed(0)}%</span>
|
|
213
506
|
</div>
|
|
214
507
|
</td>`;
|
|
215
508
|
})
|
|
@@ -239,10 +532,7 @@ function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agen
|
|
|
239
532
|
})
|
|
240
533
|
.filter(Boolean)
|
|
241
534
|
.join("\n");
|
|
242
|
-
return
|
|
243
|
-
<div class="mb-4">
|
|
244
|
-
<div class="text-xs text-zinc-600 uppercase tracking-wider mb-1">grouped by ${escHtml(groupDim)}</div>
|
|
245
|
-
</div>
|
|
535
|
+
return `
|
|
246
536
|
<div class="overflow-x-auto">
|
|
247
537
|
<table class="w-full">
|
|
248
538
|
<thead>
|
|
@@ -258,8 +548,7 @@ function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agen
|
|
|
258
548
|
</div>
|
|
259
549
|
<div class="mt-4 pt-3 border-t border-zinc-800/50 space-y-1">
|
|
260
550
|
${versionRef}
|
|
261
|
-
</div
|
|
262
|
-
</div>`;
|
|
551
|
+
</div>`;
|
|
263
552
|
}
|
|
264
553
|
function renderGroupedBarChart(group) {
|
|
265
554
|
const reports = group.runs.map((r) => r.report);
|
|
@@ -274,42 +563,19 @@ function renderGroupedBarChart(group) {
|
|
|
274
563
|
if (varying.length < 1)
|
|
275
564
|
return "";
|
|
276
565
|
const agentId = escHtml(group.label).replace(/\s+/g, "-").toLowerCase();
|
|
277
|
-
// Build version labels for each dimension: first unique value seen = v1, etc.
|
|
278
|
-
const versionMaps = new Map();
|
|
279
566
|
const sorted = [...reports].sort((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime());
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
let idx = 1;
|
|
283
|
-
for (const r of sorted) {
|
|
284
|
-
const val = r.dimensions?.[dim] ?? "?";
|
|
285
|
-
if (!seen.has(val)) {
|
|
286
|
-
seen.set(val, `v${idx}`);
|
|
287
|
-
idx++;
|
|
288
|
-
}
|
|
289
|
-
}
|
|
290
|
-
versionMaps.set(dim, seen);
|
|
291
|
-
}
|
|
567
|
+
// Build smart labels using diff entries for prompt/tools readability
|
|
568
|
+
const versionMaps = buildSmartLabels(sorted, allDims, group.diffEntries);
|
|
292
569
|
const dimLabel = (dim, val) => {
|
|
293
|
-
|
|
294
|
-
const version = vMap?.get(val) ?? "?";
|
|
295
|
-
// For model, show short model name. For others, show version tag.
|
|
296
|
-
if (dim === "model") {
|
|
297
|
-
const short = val.length > 16 ? val.split("/").pop()?.slice(0, 16) ?? val.slice(0, 16) : val;
|
|
298
|
-
return short;
|
|
299
|
-
}
|
|
300
|
-
// For tools, show "none" directly instead of a version tag
|
|
301
|
-
if (dim === "tools" && val === "none") {
|
|
302
|
-
return "none";
|
|
303
|
-
}
|
|
304
|
-
return version;
|
|
570
|
+
return versionMaps.get(dim)?.get(val) ?? val;
|
|
305
571
|
};
|
|
306
572
|
// Build a chart for each possible grouping dimension
|
|
307
573
|
const charts = varying.map((groupDim, dimIdx) => {
|
|
308
574
|
const isActive = dimIdx === 0;
|
|
309
|
-
//
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
575
|
+
// Also render a matrix/table view for non-model dimensions (hidden by default)
|
|
576
|
+
const matrixHtml = groupDim !== "model"
|
|
577
|
+
? renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel)
|
|
578
|
+
: "";
|
|
313
579
|
const otherDims = allDims.filter((d) => d !== groupDim);
|
|
314
580
|
// Group runs by the grouping dimension
|
|
315
581
|
const groupVals = [...new Set(sorted.map((r) => r.dimensions?.[groupDim] ?? "?"))];
|
|
@@ -330,12 +596,17 @@ function renderGroupedBarChart(group) {
|
|
|
330
596
|
const color = SERIES_COLORS[ci % SERIES_COLORS.length];
|
|
331
597
|
const parts = ck.split("|");
|
|
332
598
|
const cfgLabel = otherDims.map((d, i) => `${d}: ${dimLabel(d, parts[i] ?? "?")}`).join(", ");
|
|
333
|
-
const
|
|
599
|
+
const rawData = groupVals.map((gv) => {
|
|
334
600
|
const groupRuns = grouped.get(gv) ?? [];
|
|
335
601
|
const match = groupRuns.find((r) => configKey(r) === ck);
|
|
336
602
|
return match ? +(match.successRate * 100).toFixed(1) : null;
|
|
337
603
|
});
|
|
338
|
-
|
|
604
|
+
const wilsonData = groupVals.map((gv) => {
|
|
605
|
+
const groupRuns = grouped.get(gv) ?? [];
|
|
606
|
+
const match = groupRuns.find((r) => configKey(r) === ck);
|
|
607
|
+
return match ? +(wilsonLowerBound(match.successRate, match.totalCases) * 100).toFixed(1) : null;
|
|
608
|
+
});
|
|
609
|
+
return { label: cfgLabel, data: rawData, _rawData: rawData, _wilsonData: wilsonData, backgroundColor: color.bg, borderColor: color.bg, borderWidth: 0, borderRadius: 4 };
|
|
339
610
|
});
|
|
340
611
|
const canvasId = `bar-${agentId}-${escHtml(groupDim)}`;
|
|
341
612
|
// Version reference
|
|
@@ -354,55 +625,66 @@ function renderGroupedBarChart(group) {
|
|
|
354
625
|
})
|
|
355
626
|
.filter(Boolean)
|
|
356
627
|
.join("\n");
|
|
628
|
+
const viewToggle = matrixHtml ? `<button
|
|
629
|
+
class="view-toggle text-[10px] text-zinc-600 hover:text-zinc-400 transition-colors"
|
|
630
|
+
data-agent="${agentId}" data-dim="${escHtml(groupDim)}"
|
|
631
|
+
onclick="switchView('${agentId}', '${escHtml(groupDim)}')"
|
|
632
|
+
>Table</button>` : "";
|
|
357
633
|
return `<div class="chart-view" data-agent="${agentId}" data-dim="${escHtml(groupDim)}" style="display:${isActive ? "block" : "none"}">
|
|
358
|
-
<div class="mb-4">
|
|
359
|
-
<div class="text-xs text-zinc-600 uppercase tracking-wider
|
|
360
|
-
|
|
361
|
-
<div style="position:relative;height:280px">
|
|
362
|
-
<canvas id="${canvasId}"></canvas>
|
|
634
|
+
<div class="flex items-center justify-between mb-4">
|
|
635
|
+
<div class="text-xs text-zinc-600 uppercase tracking-wider">grouped by ${escHtml(groupDim)}</div>
|
|
636
|
+
${viewToggle}
|
|
363
637
|
</div>
|
|
364
|
-
<
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
638
|
+
<div class="bar-view" data-agent="${agentId}" data-dim="${escHtml(groupDim)}">
|
|
639
|
+
<div style="position:relative;height:280px">
|
|
640
|
+
<canvas id="${canvasId}"></canvas>
|
|
641
|
+
</div>
|
|
642
|
+
<script>
|
|
643
|
+
(function() {
|
|
644
|
+
var chart = new Chart(document.getElementById('${canvasId}'), {
|
|
645
|
+
type: 'bar',
|
|
646
|
+
data: { labels: ${JSON.stringify(labels)}, datasets: ${JSON.stringify(datasets)} },
|
|
647
|
+
options: {
|
|
648
|
+
responsive: true,
|
|
649
|
+
maintainAspectRatio: false,
|
|
650
|
+
plugins: {
|
|
651
|
+
legend: { labels: { color: '#a1a1aa', font: { family: 'ui-monospace, monospace', size: 10 }, boxWidth: 12, padding: 16 } },
|
|
652
|
+
tooltip: { callbacks: { label: function(ctx) { return ctx.dataset.label + ': ' + ctx.parsed.y + '%'; } } }
|
|
653
|
+
},
|
|
654
|
+
scales: {
|
|
655
|
+
x: { ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 } }, grid: { color: '#27272a' } },
|
|
656
|
+
y: { min: 0, max: 100, ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 }, callback: function(v) { return v + '%'; } }, grid: { color: '#27272a' } }
|
|
657
|
+
}
|
|
658
|
+
}
|
|
659
|
+
});
|
|
660
|
+
window.__agestCharts['${canvasId}'] = chart;
|
|
661
|
+
})();
|
|
662
|
+
</script>
|
|
663
|
+
<div class="mt-4 pt-3 border-t border-zinc-800/50 space-y-1">
|
|
664
|
+
${versionRef}
|
|
665
|
+
</div>
|
|
384
666
|
</div>
|
|
667
|
+
${matrixHtml ? `<div class="table-view" data-agent="${agentId}" data-dim="${escHtml(groupDim)}" style="display:none">${matrixHtml}</div>` : ""}
|
|
385
668
|
</div>`;
|
|
386
669
|
});
|
|
387
|
-
//
|
|
388
|
-
const
|
|
389
|
-
.map((dim
|
|
390
|
-
const active = i === 0;
|
|
391
|
-
return `<button
|
|
392
|
-
class="dim-tab px-3 py-1.5 text-xs rounded-md transition-colors ${active ? "bg-zinc-700 text-zinc-200" : "bg-zinc-800/50 text-zinc-500 hover:text-zinc-300"}"
|
|
393
|
-
data-agent="${agentId}"
|
|
394
|
-
data-dim="${escHtml(dim)}"
|
|
395
|
-
onclick="switchDim('${agentId}', '${escHtml(dim)}')"
|
|
396
|
-
>${escHtml(dim)}</button>`;
|
|
397
|
-
})
|
|
670
|
+
// Primary dimension selector
|
|
671
|
+
const dimOptions = varying
|
|
672
|
+
.map((dim) => `<option value="${escHtml(dim)}">${escHtml(dim)}</option>`)
|
|
398
673
|
.join("\n");
|
|
674
|
+
const dimSelector = varying.length > 1
|
|
675
|
+
? `<div class="flex items-center gap-2">
|
|
676
|
+
<span class="text-[10px] text-zinc-600 uppercase tracking-wider">Group by</span>
|
|
677
|
+
<select class="bg-zinc-800 text-zinc-300 text-xs border border-zinc-700 rounded px-2 py-1"
|
|
678
|
+
onchange="switchDim('${agentId}', this.value)">
|
|
679
|
+
${dimOptions}
|
|
680
|
+
</select>
|
|
681
|
+
</div>`
|
|
682
|
+
: `<span class="text-[10px] text-zinc-600">grouped by ${escHtml(varying[0])}</span>`;
|
|
399
683
|
return `
|
|
400
684
|
<div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">
|
|
401
685
|
<div class="flex items-center justify-between mb-5">
|
|
402
686
|
<span class="text-xs text-zinc-600 uppercase tracking-wider">success rate</span>
|
|
403
|
-
|
|
404
|
-
${tabs}
|
|
405
|
-
</div>
|
|
687
|
+
${dimSelector}
|
|
406
688
|
</div>
|
|
407
689
|
${charts.join("\n")}
|
|
408
690
|
</div>`;
|
|
@@ -530,28 +812,33 @@ function renderScatterPlot(group) {
|
|
|
530
812
|
const model = r.dimensions?.["model"] ?? r.model ?? "?";
|
|
531
813
|
const avgDurSec = r.totalCases > 0 ? +(r.duration / r.totalCases / 1000).toFixed(2) : 0;
|
|
532
814
|
const accuracy = +(r.successRate * 100).toFixed(1);
|
|
815
|
+
const wilsonAccuracy = +(wilsonLowerBound(r.successRate, r.totalCases) * 100).toFixed(1);
|
|
533
816
|
const configLabel = allDims
|
|
534
817
|
.filter((d) => d !== "model")
|
|
535
818
|
.map((d) => `${d}: ${r.dimensions?.[d] ?? "?"}`)
|
|
536
819
|
.join(", ");
|
|
537
|
-
const
|
|
538
|
-
|
|
539
|
-
|
|
820
|
+
const entry = byModel.get(model) ?? { raw: [], wilson: [] };
|
|
821
|
+
entry.raw.push({ x: avgDurSec, y: accuracy, label: configLabel });
|
|
822
|
+
entry.wilson.push({ x: avgDurSec, y: wilsonAccuracy, label: configLabel });
|
|
823
|
+
byModel.set(model, entry);
|
|
540
824
|
}
|
|
541
825
|
const uniqueModels = [...byModel.keys()];
|
|
542
826
|
const datasets = uniqueModels.map((model, i) => {
|
|
543
827
|
const color = SERIES_COLORS[i % SERIES_COLORS.length];
|
|
544
828
|
const short = model.split("/").pop()?.slice(0, 24) ?? model.slice(0, 24);
|
|
829
|
+
const entry = byModel.get(model);
|
|
545
830
|
return {
|
|
546
831
|
label: short,
|
|
547
|
-
data:
|
|
832
|
+
data: entry.raw,
|
|
833
|
+
_rawScatter: entry.raw,
|
|
834
|
+
_wilsonScatter: entry.wilson,
|
|
548
835
|
backgroundColor: color.bg,
|
|
549
836
|
borderColor: color.text,
|
|
550
837
|
pointRadius: 7,
|
|
551
838
|
pointHoverRadius: 9,
|
|
552
839
|
};
|
|
553
840
|
});
|
|
554
|
-
const allX = [...byModel.values()].
|
|
841
|
+
const allX = [...byModel.values()].flatMap((e) => e.raw).map((p) => p.x);
|
|
555
842
|
const midX = allX.length > 0 ? +((Math.min(...allX) + Math.max(...allX)) / 2).toFixed(2) : 0;
|
|
556
843
|
const agentId = escHtml(group.label).replace(/\s+/g, "-").toLowerCase();
|
|
557
844
|
const canvasId = `scatter-${agentId}`;
|
|
@@ -564,97 +851,276 @@ function renderScatterPlot(group) {
|
|
|
564
851
|
<canvas id="${canvasId}"></canvas>
|
|
565
852
|
</div>
|
|
566
853
|
<script>
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
854
|
+
(function() {
|
|
855
|
+
var chart = new Chart(document.getElementById('${canvasId}'), {
|
|
856
|
+
type: 'scatter',
|
|
857
|
+
data: { datasets: ${JSON.stringify(datasets)} },
|
|
858
|
+
options: {
|
|
859
|
+
responsive: true,
|
|
860
|
+
maintainAspectRatio: false,
|
|
861
|
+
plugins: {
|
|
862
|
+
legend: { labels: { color: '#a1a1aa', font: { family: 'ui-monospace, monospace', size: 10 }, boxWidth: 12, padding: 16 } },
|
|
863
|
+
tooltip: {
|
|
864
|
+
callbacks: {
|
|
865
|
+
label: function(ctx) {
|
|
866
|
+
var p = ctx.raw;
|
|
867
|
+
var lines = [ctx.dataset.label + ': ' + p.y + '% accuracy, ' + p.x.toFixed(1) + 's/case'];
|
|
868
|
+
if (p.label) lines.push(p.label);
|
|
869
|
+
return lines;
|
|
870
|
+
}
|
|
582
871
|
}
|
|
583
872
|
}
|
|
584
|
-
}
|
|
585
|
-
},
|
|
586
|
-
scales: {
|
|
587
|
-
x: {
|
|
588
|
-
title: { display: true, text: 'avg duration per case (s)', color: '#71717a', font: { family: 'ui-monospace, monospace', size: 11 } },
|
|
589
|
-
ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 } },
|
|
590
|
-
grid: { color: '#27272a' }
|
|
591
873
|
},
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
874
|
+
scales: {
|
|
875
|
+
x: {
|
|
876
|
+
title: { display: true, text: 'avg duration per case (s)', color: '#71717a', font: { family: 'ui-monospace, monospace', size: 11 } },
|
|
877
|
+
ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 } },
|
|
878
|
+
grid: { color: '#27272a' }
|
|
879
|
+
},
|
|
880
|
+
y: {
|
|
881
|
+
min: 0, max: 100,
|
|
882
|
+
title: { display: true, text: 'accuracy (%)', color: '#71717a', font: { family: 'ui-monospace, monospace', size: 11 } },
|
|
883
|
+
ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 }, callback: function(v) { return v + '%'; } },
|
|
884
|
+
grid: { color: '#27272a' }
|
|
885
|
+
}
|
|
597
886
|
}
|
|
598
|
-
}
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
var midYPx = yScale.getPixelForValue(50);
|
|
887
|
+
},
|
|
888
|
+
plugins: [{
|
|
889
|
+
id: 'quadrantLines',
|
|
890
|
+
afterDraw: function(chart) {
|
|
891
|
+
var ctx = chart.ctx;
|
|
892
|
+
var area = chart.chartArea;
|
|
893
|
+
var xScale = chart.scales.x;
|
|
894
|
+
var yScale = chart.scales.y;
|
|
895
|
+
var midXPx = xScale.getPixelForValue(${midX});
|
|
896
|
+
var midYPx = yScale.getPixelForValue(50);
|
|
609
897
|
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
898
|
+
ctx.save();
|
|
899
|
+
ctx.setLineDash([6, 4]);
|
|
900
|
+
ctx.lineWidth = 1;
|
|
901
|
+
ctx.strokeStyle = 'rgba(113, 113, 122, 0.4)';
|
|
614
902
|
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
903
|
+
ctx.beginPath();
|
|
904
|
+
ctx.moveTo(midXPx, area.top);
|
|
905
|
+
ctx.lineTo(midXPx, area.bottom);
|
|
906
|
+
ctx.stroke();
|
|
619
907
|
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
908
|
+
ctx.beginPath();
|
|
909
|
+
ctx.moveTo(area.left, midYPx);
|
|
910
|
+
ctx.lineTo(area.right, midYPx);
|
|
911
|
+
ctx.stroke();
|
|
624
912
|
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
913
|
+
ctx.setLineDash([]);
|
|
914
|
+
ctx.font = '10px ui-monospace, monospace';
|
|
915
|
+
ctx.fillStyle = 'rgba(113, 113, 122, 0.5)';
|
|
628
916
|
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
917
|
+
ctx.textAlign = 'left';
|
|
918
|
+
ctx.textBaseline = 'top';
|
|
919
|
+
ctx.fillText('Ideal', area.left + 8, area.top + 8);
|
|
632
920
|
|
|
633
|
-
|
|
634
|
-
|
|
921
|
+
ctx.textAlign = 'right';
|
|
922
|
+
ctx.fillText('Smart but slow', area.right - 8, area.top + 8);
|
|
635
923
|
|
|
636
|
-
|
|
637
|
-
|
|
924
|
+
ctx.textBaseline = 'bottom';
|
|
925
|
+
ctx.fillText('Dumb and slow', area.right - 8, area.bottom - 8);
|
|
638
926
|
|
|
639
|
-
|
|
640
|
-
|
|
927
|
+
ctx.textAlign = 'left';
|
|
928
|
+
ctx.fillText('Dumb and fast', area.left + 8, area.bottom - 8);
|
|
641
929
|
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
930
|
+
ctx.restore();
|
|
931
|
+
}
|
|
932
|
+
}]
|
|
933
|
+
});
|
|
934
|
+
window.__agestCharts['${canvasId}'] = chart;
|
|
935
|
+
})();
|
|
646
936
|
</script>
|
|
647
937
|
</div>`;
|
|
648
938
|
}
|
|
939
|
+
/**
 * Render the summary card for one report: headline success rate, pass count,
 * duration, a grid of metadata (model, timestamp, optional token averages and
 * tools) and, when there are failures, a table of failed prompts and reasons.
 *
 * @param {object} report - Report to summarise. Reads `successRate`,
 *   `totalCases`, `failedCasesCount`, `duration`, `model`, `timestamp`, and the
 *   optional `failedCases`, `averageInputTokensPerCase`,
 *   `averageOutputTokensPerCase` and `tools`.
 * @returns {string} HTML fragment for the card.
 */
function renderSingleRun(report) {
    // Tolerate a report without a failedCases array instead of throwing.
    const failedCases = report.failedCases ?? [];
    const pct = (report.successRate * 100).toFixed(0);
    const passed = report.totalCases - report.failedCasesCount;
    const color = rateClass(report.successRate);
    const dur = formatDuration(report.duration);
    // One label/value cell of the metadata grid. `valueHtml` must already be
    // safe to embed (escaped text or a number).
    const stat = (label, valueHtml) => `<div>
          <span class="text-zinc-500">${label}</span>
          <p class="text-zinc-300">${valueHtml}</p>
        </div>`;
    const failedRows = failedCases
        .map((fc) => `
          <tr class="border-t border-zinc-800/50">
            <td class="py-2 pr-4 text-zinc-300 text-xs">${escHtml(fc.prompt)}</td>
            <td class="py-2 text-zinc-500 text-xs">${escHtml(fc.reason ?? "")}</td>
          </tr>`)
        .join("");
    const failedSection = failedCases.length > 0
        ? `
      <div class="mt-6">
        <h4 class="text-xs text-zinc-500 uppercase tracking-widest mb-2">Failed Cases</h4>
        <table class="w-full text-left">
          <thead><tr class="text-zinc-600 text-xs">
            <th class="pb-1 pr-4">Prompt</th>
            <th class="pb-1">Reason</th>
          </tr></thead>
          <tbody>${failedRows}</tbody>
        </table>
      </div>`
        : "";
    // Every other report-derived string in this card goes through escHtml; the
    // formatted timestamp is escaped as well so a malformed value cannot
    // inject markup.
    const timestampHtml = escHtml(formatTimestamp(report.timestamp));
    return `
    <div class="bg-zinc-900/50 rounded-lg border border-zinc-800 p-6">
      <div class="flex items-baseline gap-6 mb-4">
        <span class="${color} text-3xl font-bold">${pct}%</span>
        <span class="text-zinc-500 text-sm">${passed}/${report.totalCases} passed</span>
        <span class="text-zinc-600 text-sm">${dur}</span>
      </div>

      <div class="grid grid-cols-2 gap-4 text-sm">
        ${stat("Model", escHtml(report.model))}
        ${stat("Timestamp", timestampHtml)}
        ${report.averageInputTokensPerCase != null
        ? stat("Avg Input Tokens", Math.round(report.averageInputTokensPerCase))
        : ""}
        ${report.averageOutputTokensPerCase != null
        ? stat("Avg Output Tokens", Math.round(report.averageOutputTokensPerCase))
        : ""}
        ${report.tools && report.tools.length > 0
        ? stat("Tools", escHtml(report.tools.join(", ")))
        : ""}
      </div>
      ${failedSection}
    </div>`;
}
|
|
1003
|
+
/**
 * Render the "failed cases" debug panel for an agent group.
 *
 * Collects every failed case from each run's top-level `failedCases` and from
 * each suite's `failedCases`, buckets them by suite name, and emits one
 * collapsible <details> row per failure showing the prompt, the captured
 * response and the failure reason.
 *
 * @param {{ runs: Array<{ report: object }> }} group - Agent group; only
 *   `runs[].report` (`dimensions`, `failedCases`, `suites`) is read here.
 * @returns {string} HTML for the panel, or "" when no run has a failure.
 */
function renderDebugPanel(group) {
    // Bucket key for failures that never receive a suite name. A Symbol cannot
    // collide with a real suite name the way a magic string could.
    const NO_SUITE = Symbol("no suite");
    const failures = [];
    for (const run of group.runs) {
        const r = run.report;
        // Compact "dim:value" tags shown next to each failure of this run.
        const dimTags = Object.entries(r.dimensions ?? {})
            .map(([k, v]) => {
                const short = v.length > 20 ? v.slice(0, 19) + "…" : v;
                return `${k}:${short}`;
            })
            .join(" ");
        // First entry per prompt for THIS run only. Scoping the lookup to the
        // run keeps suite-level cases from enriching (or being swallowed by) an
        // entry that belongs to a different run with identical dimension tags,
        // and replaces a repeated linear scan with a constant-time lookup.
        const firstByPrompt = new Map();
        // Top-level failed cases are always listed.
        for (const fc of r.failedCases ?? []) {
            const entry = { prompt: fc.prompt, reason: fc.reason, response: fc.response, dims: dimTags };
            failures.push(entry);
            if (!firstByPrompt.has(fc.prompt)) {
                firstByPrompt.set(fc.prompt, entry);
            }
        }
        if (!r.suites) {
            continue;
        }
        // Suite-level failed cases may repeat top-level ones: add the new ones,
        // and use the repeats to fill in suite name / response on the entry
        // already recorded for this run.
        for (const s of r.suites) {
            for (const fc of s.failedCases ?? []) {
                const existing = firstByPrompt.get(fc.prompt);
                if (existing === undefined) {
                    const entry = { prompt: fc.prompt, reason: fc.reason, response: fc.response, suite: s.name, dims: dimTags };
                    failures.push(entry);
                    firstByPrompt.set(fc.prompt, entry);
                    continue;
                }
                if (!existing.suite) {
                    existing.suite = s.name;
                }
                if (!existing.response && fc.response) {
                    existing.response = fc.response;
                }
            }
        }
    }
    if (failures.length === 0) {
        return "";
    }
    // Group by suite; Map iteration follows insertion order, so suites appear
    // in the order their first failure was collected.
    const bySuite = new Map();
    for (const f of failures) {
        const key = f.suite ?? NO_SUITE;
        if (!bySuite.has(key)) {
            bySuite.set(key, []);
        }
        bySuite.get(key).push(f);
    }
    const renderFailure = (f) => {
        // The summary line shows shortened prompt/reason; the expanded body
        // shows them in full.
        const promptShort = f.prompt.length > 70 ? f.prompt.slice(0, 67) + "…" : f.prompt;
        const reasonShort = f.reason
            ? `<span class="text-red-400/60 text-[10px] ml-2">${escHtml(f.reason.length > 50 ? f.reason.slice(0, 47) + "…" : f.reason)}</span>`
            : "";
        // Escape first, then turn newlines into <br> so line breaks survive.
        const responseHtml = f.response
            ? escHtml(f.response).replace(/\n/g, "<br>")
            : `<span class="text-zinc-700">no response captured</span>`;
        return `
        <details class="border-t border-zinc-800/50">
          <summary class="py-2.5 cursor-pointer select-none hover:bg-zinc-800/30 rounded px-2 -mx-2 flex items-center gap-2">
            <span class="text-red-400 text-xs shrink-0">FAIL</span>
            <span class="text-xs text-zinc-300 truncate flex-1">${escHtml(promptShort)}</span>
            ${reasonShort}
            <span class="text-[10px] text-zinc-700">${escHtml(f.dims)}</span>
          </summary>
          <div class="pb-3 px-2 -mx-2 space-y-2">
            <div>
              <div class="text-[10px] text-zinc-600 uppercase mb-1">Input</div>
              <div class="text-xs text-zinc-300 bg-zinc-800/50 rounded px-3 py-2">${escHtml(f.prompt)}</div>
            </div>
            <div>
              <div class="text-[10px] text-zinc-600 uppercase mb-1">Output</div>
              <div class="text-xs text-zinc-400 bg-zinc-800/50 rounded px-3 py-2 max-h-48 overflow-y-auto">${responseHtml}</div>
            </div>
            ${f.reason ? `<div><div class="text-[10px] text-zinc-600 uppercase mb-1">Reason</div><div class="text-xs text-red-400/80">${escHtml(f.reason)}</div></div>` : ""}
          </div>
        </details>`;
    };
    const rows = [...bySuite].map(([key, items]) => {
        const label = key === NO_SUITE ? "no suite" : key;
        return `
        <div class="mb-3 last:mb-0">
          <div class="flex items-center gap-2 mb-1">
            <span class="text-[10px] text-zinc-500 uppercase tracking-wider font-medium">${escHtml(label)}</span>
            <span class="text-[10px] text-zinc-700">${items.length}</span>
          </div>
          <div class="pl-2 border-l border-zinc-800">
            ${items.map(renderFailure).join("")}
          </div>
        </div>`;
    }).join("");
    return `
    <div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">
      <div class="flex items-center justify-between mb-4">
        <span class="text-xs text-zinc-600 uppercase tracking-wider">failed cases</span>
        <span class="text-xs text-zinc-600">${failures.length} failure${failures.length !== 1 ? "s" : ""}</span>
      </div>
      <div>
        ${rows}
      </div>
    </div>`;
}
|
|
649
1106
|
/**
 * Build the <section> for one agent group: heading, then the grouped bar
 * chart, scatter plot and radar chart, then either a single-run summary card
 * (only when none of the three charts produced output and the group has at
 * least one run) and finally the failed-cases debug panel.
 *
 * @param {object} group - Agent group; `label` and `runs` are read here and
 *   the group is passed on to the individual renderers.
 * @returns {string} HTML for the section.
 */
function renderAgentSection(group) {
    // Renderers are invoked in the same order as before: radar, bars,
    // scatter, debug panel.
    const radar = renderRadarChart(group);
    const bars = renderGroupedBarChart(group);
    const scatter = renderScatterPlot(group);
    const debug = renderDebugPanel(group);
    // Fall back to the first run's summary card when there is nothing
    // comparative to show.
    const hasChart = Boolean(bars || scatter || radar);
    let singleRun = "";
    if (!hasChart && group.runs.length > 0) {
        singleRun = renderSingleRun(group.runs[0].report);
    }
    const heading = escHtml(group.label);
    return `
    <section class="mb-12">
      <h2 class="text-base font-semibold mb-4 text-zinc-400 uppercase tracking-widest">${heading}</h2>

      ${bars}
      ${scatter}
      ${radar}
      ${singleRun}
      ${debug}
    </section>`;
}
|
|
660
1126
|
// ---------------------------------------------------------------------------
|
|
@@ -672,16 +1138,57 @@ function generateHTML(groups, totalReports) {
|
|
|
672
1138
|
<script src="https://cdn.tailwindcss.com"></script>
|
|
673
1139
|
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
|
674
1140
|
<script>
|
|
1141
|
+
window.__agestCharts = {};
|
|
1142
|
+
window.__agestWilson = false;
|
|
1143
|
+
|
|
1144
|
+
function toggleWilson() {
|
|
1145
|
+
window.__agestWilson = !window.__agestWilson;
|
|
1146
|
+
var useWilson = window.__agestWilson;
|
|
1147
|
+
var btn = document.getElementById('wilson-toggle');
|
|
1148
|
+
if (btn) {
|
|
1149
|
+
btn.className = useWilson
|
|
1150
|
+
? 'px-3 py-1 text-xs rounded-full border transition-colors bg-violet-600 border-violet-500 text-violet-100'
|
|
1151
|
+
: 'px-3 py-1 text-xs rounded-full border transition-colors bg-zinc-800/50 border-zinc-700 text-zinc-500 hover:text-zinc-300';
|
|
1152
|
+
btn.textContent = useWilson ? 'Wilson CI (95%)' : 'Raw';
|
|
1153
|
+
}
|
|
1154
|
+
// Update all Chart.js instances
|
|
1155
|
+
Object.values(window.__agestCharts).forEach(function(chart) {
|
|
1156
|
+
chart.data.datasets.forEach(function(ds) {
|
|
1157
|
+
if (ds._rawData && ds._wilsonData) {
|
|
1158
|
+
ds.data = useWilson ? ds._wilsonData : ds._rawData;
|
|
1159
|
+
}
|
|
1160
|
+
// Scatter plot: swap y values
|
|
1161
|
+
if (ds._rawScatter && ds._wilsonScatter) {
|
|
1162
|
+
ds.data = useWilson ? ds._wilsonScatter : ds._rawScatter;
|
|
1163
|
+
}
|
|
1164
|
+
});
|
|
1165
|
+
chart.update();
|
|
1166
|
+
});
|
|
1167
|
+
// Update matrix view cells
|
|
1168
|
+
document.querySelectorAll('[data-wilson]').forEach(function(el) {
|
|
1169
|
+
el.textContent = useWilson ? el.getAttribute('data-wilson') : el.getAttribute('data-raw');
|
|
1170
|
+
});
|
|
1171
|
+
}
|
|
1172
|
+
|
|
1173
|
+
function switchView(agent, dim) {
|
|
1174
|
+
var barEl = document.querySelector('.bar-view[data-agent="' + agent + '"][data-dim="' + dim + '"]');
|
|
1175
|
+
var tableEl = document.querySelector('.table-view[data-agent="' + agent + '"][data-dim="' + dim + '"]');
|
|
1176
|
+
var btn = document.querySelector('.view-toggle[data-agent="' + agent + '"][data-dim="' + dim + '"]');
|
|
1177
|
+
if (!barEl || !tableEl || !btn) return;
|
|
1178
|
+
var showingTable = tableEl.style.display !== 'none';
|
|
1179
|
+
barEl.style.display = showingTable ? 'block' : 'none';
|
|
1180
|
+
tableEl.style.display = showingTable ? 'none' : 'block';
|
|
1181
|
+
btn.textContent = showingTable ? 'Table' : 'Chart';
|
|
1182
|
+
}
|
|
1183
|
+
|
|
675
1184
|
function switchDim(agent, dim) {
|
|
676
1185
|
document.querySelectorAll('.chart-view[data-agent="' + agent + '"]').forEach(el => {
|
|
677
1186
|
el.style.display = el.dataset.dim === dim ? 'block' : 'none';
|
|
678
1187
|
});
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
el.className = el.className.replace('bg-zinc-700 text-zinc-200', 'bg-zinc-800/50 text-zinc-500');
|
|
684
|
-
}
|
|
1188
|
+
}
|
|
1189
|
+
function filterRadarModel(agent, model) {
|
|
1190
|
+
document.querySelectorAll('.radar-model-view[data-agent="' + agent + '"]').forEach(el => {
|
|
1191
|
+
el.style.display = el.dataset.model === model ? 'block' : 'none';
|
|
685
1192
|
});
|
|
686
1193
|
}
|
|
687
1194
|
</script>
|
|
@@ -691,7 +1198,11 @@ function generateHTML(groups, totalReports) {
|
|
|
691
1198
|
|
|
692
1199
|
<header class="mb-10">
|
|
693
1200
|
<h1 class="text-2xl font-bold tracking-tight">agest</h1>
|
|
694
|
-
<
|
|
1201
|
+
<div class="flex items-center gap-3 mt-1">
|
|
1202
|
+
<p class="text-zinc-500 text-sm">${totalReports} report${totalReports !== 1 ? "s" : ""} · generated ${generated}</p>
|
|
1203
|
+
<button id="wilson-toggle" onclick="toggleWilson()" title="Wilson score lower bound (95% CI) — adjusts for sample size"
|
|
1204
|
+
class="px-3 py-1 text-xs rounded-full border transition-colors bg-zinc-800/50 border-zinc-700 text-zinc-500 hover:text-zinc-300">Raw</button>
|
|
1205
|
+
</div>
|
|
695
1206
|
</header>
|
|
696
1207
|
|
|
697
1208
|
${sections}
|
|
@@ -763,6 +1274,7 @@ async function main() {
|
|
|
763
1274
|
runs,
|
|
764
1275
|
varyingDims,
|
|
765
1276
|
controlledPairs,
|
|
1277
|
+
diffEntries,
|
|
766
1278
|
};
|
|
767
1279
|
}));
|
|
768
1280
|
const html = generateHTML(groups, reports.length);
|