@sebastiantuyu/agest 0.1.6 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,777 @@
1
+ import { readFile, writeFile } from "fs/promises";
2
+ import { join, relative } from "path";
3
+ import os from "os";
4
+ import { exec } from "child_process";
5
+ import { parseReport, findReports, loadDiffEntry, computeDiff, ensureDimensions, findVaryingDimensions, groupByDimension, findControlledPairs, diffConfigs, } from "./reports.js";
6
+ // ---------------------------------------------------------------------------
7
+ // Helpers
8
+ // ---------------------------------------------------------------------------
9
/**
 * Ask the operating system to open a file with its default handler.
 *
 * The launcher command is chosen from `process.platform`: `start ""` on
 * win32, `open` on darwin and `xdg-open` everywhere else. The call is
 * fire-and-forget; a launch failure is only reported on stderr.
 *
 * @param {string} filepath Path of the file to open.
 * @returns {void}
 */
function openBrowser(filepath) {
    let launcher;
    switch (process.platform) {
        case "win32":
            // The empty "" is the window title argument expected by `start`.
            launcher = `start ""`;
            break;
        case "darwin":
            launcher = "open";
            break;
        default:
            launcher = "xdg-open";
    }
    exec(`${launcher} "${filepath}"`, (failure) => {
        if (failure) {
            console.error(" Could not open browser:", failure.message);
        }
    });
}
20
/**
 * Escape a value for safe inclusion in HTML text and quoted attributes.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with their entity forms. `null` and
 * `undefined` become the empty string; any other non-string value is
 * converted with `String()` first.
 *
 * @param {*} str Value to escape.
 * @returns {string} The escaped text.
 */
function escHtml(str) {
    return String(str ?? "")
        // "&" must be handled first so the entities added below are not re-escaped.
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&#39;");
}
27
/**
 * Pick the bar fill colour for a success rate.
 *
 * @param {number} rate Success rate as a fraction.
 * @returns {string} Green hex at 0.8 and above, yellow at 0.5 and above, red otherwise.
 */
function barColor(rate) {
    return rate >= 0.8 ? "#4ade80" : rate >= 0.5 ? "#facc15" : "#f87171";
}
34
/**
 * Pick the text colour class for a success rate.
 *
 * @param {number} rate Success rate as a fraction.
 * @returns {string} Class name of the first tier whose floor the rate reaches,
 *   or the red class when it reaches none.
 */
function rateClass(rate) {
    // Tiers are ordered from highest floor to lowest; the first match wins.
    const tiers = [
        [0.8, "text-green-400"],
        [0.5, "text-yellow-400"],
    ];
    const hit = tiers.find(([floor]) => rate >= floor);
    return hit ? hit[1] : "text-red-400";
}
41
/**
 * Pick the text colour class for a change in success rate.
 *
 * @param {number} delta Signed change.
 * @returns {string} Green class for a positive delta, red for a negative one,
 *   neutral otherwise.
 */
function deltaClass(delta) {
    const rising = delta > 0;
    const falling = delta < 0;
    if (!rising && !falling) {
        return "text-zinc-500";
    }
    return rising ? "text-green-400" : "text-red-400";
}
48
/**
 * Format a percentage-point delta for display.
 *
 * @param {number} d Signed delta.
 * @returns {string} "=" when the magnitude is below 0.5, otherwise the value
 *   rounded to a whole number with a "%" suffix and a leading "+" when positive.
 */
function formatDelta(d) {
    if (Math.abs(d) < 0.5) {
        return "=";
    }
    const rounded = d.toFixed(0);
    return d > 0 ? `+${rounded}%` : `${rounded}%`;
}
53
/**
 * Format a timestamp with the runtime's default locale settings.
 *
 * `new Date(x).toLocaleString()` does not throw for unparseable input; it
 * yields the string "Invalid Date". The validity of the parsed date is
 * therefore checked explicitly so that the raw input is returned whenever it
 * cannot be formatted.
 *
 * @param {string|number|Date} ts Timestamp to format.
 * @returns {string|number|Date} The localized date/time string, or `ts`
 *   unchanged when it does not represent a valid date.
 */
function formatTimestamp(ts) {
    const parsed = new Date(ts);
    if (Number.isNaN(parsed.getTime())) {
        return ts;
    }
    try {
        return parsed.toLocaleString();
    }
    catch {
        return ts;
    }
}
61
+ // ---------------------------------------------------------------------------
62
+ // Rendering
63
+ // ---------------------------------------------------------------------------
64
/**
 * Render a list of tool names as pill-shaped tags.
 *
 * @param {string[]} tools Tool names.
 * @returns {string} One escaped `<span>` per tool, separated by single spaces.
 */
function renderTools(tools) {
    const toChip = (tool) => `<span class="text-xs bg-zinc-800 border border-zinc-700 px-2 py-0.5 rounded-full text-zinc-400">${escHtml(tool)}</span>`;
    return tools.map((tool) => toChip(tool)).join(" ");
}
69
/**
 * Render failed test cases as a collapsible `<details>` list.
 *
 * @param {Array<{prompt: string, reason?: string}>} cases Failed cases.
 * @returns {string} HTML fragment, or "" when there are no failed cases.
 */
function renderFailedCases(cases) {
    const total = cases.length;
    if (total === 0) {
        return "";
    }
    const listItems = cases.map((failure) => {
        const promptHtml = `<div class="text-sm text-zinc-300">&ldquo;${escHtml(failure.prompt)}&rdquo;</div>`;
        // The reason line is optional; an empty string keeps the line slot.
        const reasonHtml = failure.reason
            ? `<div class="text-xs text-zinc-500 mt-1 break-words">${escHtml(failure.reason)}</div>`
            : "";
        return [
            "",
            '<li class="pl-3 border-l border-zinc-700">',
            promptHtml,
            reasonHtml,
            "</li>",
        ].join("\n");
    });
    const plural = total !== 1 ? "s" : "";
    return [
        "",
        '<details class="mt-4">',
        '<summary class="text-xs text-red-400 cursor-pointer hover:text-red-300 select-none">',
        `${total} failed case${plural}`,
        "</summary>",
        '<ul class="mt-3 space-y-3">',
        listItems.join("\n"),
        "</ul>",
        "</details>",
    ].join("\n");
}
89
/**
 * Render one run of a timeline: index, success-rate bar, percentage, delta
 * against the previous run, dimension tags and optional diff lines.
 *
 * @param {{report: object, delta: (number|undefined), diffLines: string[]}} entry
 *   The run; `report.successRate` and the optional `report.dimensions` are read.
 * @param {number} idx Zero-based position, displayed one-based.
 * @returns {string} HTML fragment.
 */
function renderRunRow(entry, idx) {
    const { report, delta, diffLines } = entry;
    const rate = report.successRate;
    const pct = rate * 100;
    const fill = barColor(rate);
    const pctClass = rateClass(rate);

    // Dimension values are clipped to 16 characters; the full value goes in the tooltip.
    const tagFor = ([name, value]) => {
        const clipped = value.length > 16 ? value.slice(0, 15) + "…" : value;
        return `<span class="text-xs text-zinc-600" title="${escHtml(value)}">${escHtml(name)}:${escHtml(clipped)}</span>`;
    };
    const dimTags = Object.entries(report.dimensions ?? {})
        .map((pair) => tagFor(pair))
        .join(" ");

    let deltaHtml;
    if (delta === undefined) {
        deltaHtml = `<span class="w-14 text-right text-zinc-700 text-xs">&mdash;</span>`;
    }
    else {
        deltaHtml = `<span class="w-14 text-right text-xs ${deltaClass(delta)}">${formatDelta(delta)}</span>`;
    }

    // A line containing ": +" is an addition and ": -" a removal; this also
    // covers lines that start with "tools: +" / "tools: -".
    const lineClass = (line) => {
        if (line.includes(": +")) {
            return "text-green-600";
        }
        if (line.includes(": -")) {
            return "text-red-600";
        }
        return "text-zinc-600";
    };
    let diffHtml = "";
    if (diffLines.length !== 0) {
        const diffBody = diffLines
            .map((line) => `<div class="text-xs ${lineClass(line)}">${escHtml(line)}</div>`)
            .join("\n");
        diffHtml = [
            '<div class="ml-10 mt-1 mb-2 pl-3 border-l border-zinc-800 space-y-0.5">',
            diffBody,
            "</div>",
        ].join("\n");
    }

    return [
        "",
        "<div>",
        '<div class="flex items-center gap-3 py-1">',
        `<span class="text-xs text-zinc-600 w-6 text-right select-none">#${idx + 1}</span>`,
        '<div class="flex-1 bg-zinc-800 rounded h-2.5 overflow-hidden">',
        `<div class="h-2.5 rounded" style="width:${pct.toFixed(1)}%;background:${fill}"></div>`,
        "</div>",
        `<span class="text-sm font-bold ${pctClass} w-12 text-right">${pct.toFixed(0)}%</span>`,
        deltaHtml,
        "</div>",
        `<div class="ml-10 mt-0.5 flex gap-3 flex-wrap">${dimTags}</div>`,
        diffHtml,
        "</div>",
    ].join("\n");
}
135
+ // ---------------------------------------------------------------------------
136
+ // Grouped Bar Chart (benchmark-style)
137
+ // ---------------------------------------------------------------------------
138
// Colour cycle for chart series. Each entry pairs a fill colour (`bg`) with a
// lighter companion (`text`); callers index it modulo its length.
const SERIES_COLORS = [
    ["#f87171", "#fca5a5"], // red
    ["#fb923c", "#fdba74"], // orange
    ["#facc15", "#fde047"], // yellow
    ["#4ade80", "#86efac"], // green
    ["#38bdf8", "#7dd3fc"], // sky
    ["#a78bfa", "#c4b5fd"], // violet
    ["#f472b6", "#f9a8d4"], // pink
    ["#2dd4bf", "#5eead4"], // teal
].map(([bg, text]) => ({ bg, text }));
148
/**
 * Render a table ("matrix") view of success rates for one grouping dimension.
 *
 * Rows are the unique combinations of the grouping dimension and every
 * dimension that is neither the grouping nor the column dimension; columns
 * are the values of the column dimension ("model" when available, otherwise
 * the first remaining dimension). When several reports fall into the same
 * cell, the one that comes last in `sorted` is shown.
 *
 * @param {object[]} sorted Reports in the order they should be considered
 *   (the caller in this file passes them oldest first).
 * @param {string} groupDim Dimension used for the rows.
 * @param {string[]} allDims Every dimension name present in the reports.
 * @param {Map<string, Map<string, string>>} versionMaps Per dimension, value -> version tag.
 * @param {(dim: string, val: string) => string} dimLabel Display-label callback.
 * @param {string} agentId Identifier written verbatim into `data-agent`.
 * @param {boolean} isActive Whether the view starts visible.
 * @returns {string} HTML fragment, or "" when there is no column dimension.
 */
function renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agentId, isActive) {
    const remainingDims = allDims.filter((d) => d !== groupDim);
    const colDim = remainingDims.includes("model") ? "model" : remainingDims[0];
    if (!colDim) {
        return "";
    }
    const tagDims = remainingDims.filter((d) => d !== colDim);
    const valueOf = (report, dim) => report.dimensions?.[dim] ?? "?";
    const rowKeyOf = (report) => [valueOf(report, groupDim), ...tagDims.map((td) => valueOf(report, td))].join("|");

    // Single pass: column values and rows in first-appearance order, plus the
    // cell lookup in which a later report overwrites an earlier one.
    const columnSet = new Set();
    const rowsByKey = new Map();
    const cellByKey = new Map();
    for (const report of sorted) {
        const cv = valueOf(report, colDim);
        const key = rowKeyOf(report);
        columnSet.add(cv);
        if (!rowsByKey.has(key)) {
            const tagVals = {};
            for (const td of tagDims) {
                tagVals[td] = valueOf(report, td);
            }
            rowsByKey.set(key, { groupVal: valueOf(report, groupDim), tagVals, key });
        }
        cellByKey.set(`${key}||${cv}`, report);
    }
    const colVals = [...columnSet];

    const headerCells = [];
    for (const cv of colVals) {
        const label = dimLabel(colDim, cv);
        headerCells.push(`<th class="px-4 py-2 text-xs text-zinc-400 font-medium text-left" title="${escHtml(cv)}">${escHtml(label)}</th>`);
    }
    const colHeaders = headerCells.join("\n");

    const renderCell = (row, cv) => {
        const hit = cellByKey.get(`${row.key}||${cv}`);
        if (!hit) {
            return `<td class="px-4 py-2"><span class="text-xs text-zinc-700">&mdash;</span></td>`;
        }
        const pct = hit.successRate * 100;
        const fill = barColor(hit.successRate);
        const pctClass = rateClass(hit.successRate);
        return [
            '<td class="px-4 py-2">',
            '<div class="flex items-center gap-3">',
            '<div class="flex-1 bg-zinc-800 rounded h-2 overflow-hidden" style="min-width:80px">',
            `<div class="h-2 rounded" style="width:${pct.toFixed(1)}%;background:${fill}"></div>`,
            "</div>",
            `<span class="text-sm font-medium ${pctClass} w-12 text-right">${pct.toFixed(0)}%</span>`,
            "</div>",
            "</td>",
        ].join("\n");
    };
    const renderRow = (row) => {
        const groupLabel = dimLabel(groupDim, row.groupVal);
        const tagHtml = tagDims
            .map((td) => {
                const tagLabel = dimLabel(td, row.tagVals[td]);
                return `<span class="text-[10px] text-zinc-600">${escHtml(td)}: ${escHtml(tagLabel)}</span>`;
            })
            .join(" ");
        const cells = colVals.map((cv) => renderCell(row, cv)).join("\n");
        return [
            '<tr class="border-t border-zinc-800/50">',
            '<td class="px-4 py-2.5">',
            `<div class="text-xs text-zinc-300 font-medium">${escHtml(groupLabel)}</div>`,
            tagHtml ? `<div class="flex gap-2 mt-0.5">${tagHtml}</div>` : "",
            "</td>",
            cells,
            "</tr>",
        ].join("\n");
    };
    const rows = [...rowsByKey.values()].map((row) => renderRow(row)).join("\n");

    // Legend mapping version tags back to (clipped) raw values; dimensions
    // with at most one value are omitted.
    const legendLines = [];
    for (const dim of allDims) {
        const vMap = versionMaps.get(dim);
        if (vMap.size <= 1) {
            continue;
        }
        const entries = [...vMap.entries()]
            .map(([val, version]) => {
                const clipped = val.length > 28 ? val.slice(0, 27) + "…" : val;
                return `<span class="text-zinc-600">${escHtml(version)}</span> <span class="text-zinc-700">${escHtml(clipped)}</span>`;
            })
            .join("&nbsp;&nbsp;&middot;&nbsp;&nbsp;");
        legendLines.push(`<div class="text-[10px]"><span class="text-zinc-500">${escHtml(dim)}:</span> ${entries}</div>`);
    }
    const versionRef = legendLines.join("\n");

    const display = isActive ? "block" : "none";
    return `<div class="chart-view" data-agent="${agentId}" data-dim="${escHtml(groupDim)}" style="display:${display}">
<div class="mb-4">
<div class="text-xs text-zinc-600 uppercase tracking-wider mb-1">grouped by ${escHtml(groupDim)}</div>
</div>
<div class="overflow-x-auto">
<table class="w-full">
<thead>
<tr class="border-b border-zinc-800">
<th class="px-4 py-2 text-xs text-zinc-500 font-medium text-left">${escHtml(groupDim)}</th>
${colHeaders}
</tr>
</thead>
<tbody>
${rows}
</tbody>
</table>
</div>
<div class="mt-4 pt-3 border-t border-zinc-800/50 space-y-1">
${versionRef}
</div>
</div>`;
}
264
/**
 * Render the "success rate" card for one agent: a tab per varying dimension
 * and, for each tab, either a Chart.js grouped bar chart (for the "model"
 * dimension) or a matrix table (every other dimension).
 *
 * Safety notes:
 *  - Data inlined into `<script>` is serialized with every "<" written as
 *    "\u003c", so a value containing "</script>" cannot end the script.
 *  - The agent identifier is reduced to characters that are safe inside HTML
 *    attributes, single-quoted JavaScript strings and quoted CSS attribute
 *    selectors, because the same identifier is used in all three places.
 *  - Series configurations are tracked as arrays of values instead of being
 *    re-split from a "|"-joined key, so values containing "|" are labelled
 *    correctly.
 *
 * @param {{label: string, runs: Array<{report: object}>, varyingDims: string[]}} group
 *   Agent group; each report's `dimensions`, `timestamp` and `successRate` are read.
 * @returns {string} HTML fragment, or "" when no dimension varies.
 */
function renderGroupedBarChart(group) {
    const varying = [...group.varyingDims];
    if (varying.length < 1) {
        return "";
    }
    // Prefer "model" as default tab — it has human-readable labels
    const modelIdx = varying.indexOf("model");
    if (modelIdx > 0) {
        varying.splice(modelIdx, 1);
        varying.unshift("model");
    }
    const reports = group.runs.map((r) => r.report);
    const allDims = [...new Set(reports.flatMap((r) => Object.keys(r.dimensions ?? {})))];
    // Collapse whitespace and every character that would need escaping in an
    // attribute value, a JS string literal or a CSS selector into "-".
    const agentId = String(group.label).replace(/[\s"'\\<>&]+/g, "-").toLowerCase();
    // JSON that is safe to place inside a <script> element.
    const inlineJson = (value) => JSON.stringify(value).replace(/</g, "\\u003c");
    const valueOf = (report, dim) => report.dimensions?.[dim] ?? "?";

    // Build version labels for each dimension: first unique value seen = v1, etc.
    const sorted = [...reports].sort((a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime());
    const versionMaps = new Map();
    for (const dim of allDims) {
        const seen = new Map();
        for (const r of sorted) {
            const val = valueOf(r, dim);
            if (!seen.has(val)) {
                seen.set(val, `v${seen.size + 1}`);
            }
        }
        versionMaps.set(dim, seen);
    }
    const dimLabel = (dim, val) => {
        // For model, show a short model name; for others, the version tag.
        if (dim === "model") {
            return val.length > 16 ? val.split("/").pop()?.slice(0, 16) ?? val.slice(0, 16) : val;
        }
        // For tools, show "none" directly instead of a version tag
        if (dim === "tools" && val === "none") {
            return "none";
        }
        return versionMaps.get(dim)?.get(val) ?? "?";
    };
    // Legend mapping version tags back to (clipped) raw values; dimensions
    // with at most one value are omitted.
    const renderVersionRef = () => allDims
        .map((dim) => {
            const vMap = versionMaps.get(dim);
            if (vMap.size <= 1) {
                return "";
            }
            const entries = [...vMap.entries()]
                .map(([val, version]) => {
                    const short = val.length > 28 ? val.slice(0, 27) + "…" : val;
                    return `<span class="text-zinc-600">${escHtml(version)}</span> <span class="text-zinc-700">${escHtml(short)}</span>`;
                })
                .join("&nbsp;&nbsp;&middot;&nbsp;&nbsp;");
            return `<div class="text-[10px]"><span class="text-zinc-500">${escHtml(dim)}:</span> ${entries}</div>`;
        })
        .filter(Boolean)
        .join("\n");

    // Build a chart for each possible grouping dimension
    const charts = varying.map((groupDim, dimIdx) => {
        const isActive = dimIdx === 0;
        // For non-model dimensions, render a matrix view instead of a bar chart
        if (groupDim !== "model") {
            return renderMatrixView(sorted, groupDim, allDims, versionMaps, dimLabel, agentId, isActive);
        }
        const otherDims = allDims.filter((d) => d !== groupDim);
        // Group runs by the grouping dimension
        const groupVals = [...new Set(sorted.map((r) => valueOf(r, groupDim)))];
        const grouped = new Map();
        for (const r of sorted) {
            const gv = valueOf(r, groupDim);
            const arr = grouped.get(gv) ?? [];
            arr.push(r);
            grouped.set(gv, arr);
        }
        // A configuration is the combination of all non-grouping dimension
        // values. The values are kept as an array next to the key so they
        // never have to be recovered by splitting the key.
        const configValues = (r) => otherDims.map((d) => valueOf(r, d));
        const configKey = (r) => JSON.stringify(configValues(r));
        const configs = new Map();
        for (const r of sorted) {
            const key = configKey(r);
            if (!configs.has(key)) {
                configs.set(key, configValues(r));
            }
        }
        // X-axis labels: model short names
        const labels = groupVals.map((gv) => dimLabel(groupDim, gv));
        // Build Chart.js datasets: one per unique config
        const datasets = [...configs.entries()].map(([key, values], ci) => {
            const color = SERIES_COLORS[ci % SERIES_COLORS.length];
            const cfgLabel = otherDims.map((d, i) => `${d}: ${dimLabel(d, values[i])}`).join(", ");
            const data = groupVals.map((gv) => {
                const groupRuns = grouped.get(gv) ?? [];
                // NOTE(review): when several runs share a model and a
                // configuration, the first (oldest) one is charted, while
                // renderMatrixView keeps the last — confirm which is intended.
                const match = groupRuns.find((r) => configKey(r) === key);
                return match ? +(match.successRate * 100).toFixed(1) : null;
            });
            return { label: cfgLabel, data, backgroundColor: color.bg, borderColor: color.bg, borderWidth: 0, borderRadius: 4 };
        });
        const canvasId = `bar-${agentId}-${escHtml(groupDim)}`;
        const versionRef = renderVersionRef();
        return `<div class="chart-view" data-agent="${agentId}" data-dim="${escHtml(groupDim)}" style="display:${isActive ? "block" : "none"}">
<div class="mb-4">
<div class="text-xs text-zinc-600 uppercase tracking-wider mb-1">grouped by ${escHtml(groupDim)}</div>
</div>
<div style="position:relative;height:280px">
<canvas id="${canvasId}"></canvas>
</div>
<script>
new Chart(document.getElementById('${canvasId}'), {
type: 'bar',
data: { labels: ${inlineJson(labels)}, datasets: ${inlineJson(datasets)} },
options: {
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: { labels: { color: '#a1a1aa', font: { family: 'ui-monospace, monospace', size: 10 }, boxWidth: 12, padding: 16 } },
tooltip: { callbacks: { label: function(ctx) { return ctx.dataset.label + ': ' + ctx.parsed.y + '%'; } } }
},
scales: {
x: { ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 } }, grid: { color: '#27272a' } },
y: { min: 0, max: 100, ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 }, callback: function(v) { return v + '%'; } }, grid: { color: '#27272a' } }
}
}
});
</script>
<div class="mt-4 pt-3 border-t border-zinc-800/50 space-y-1">
${versionRef}
</div>
</div>`;
    });
    // Dimension toggle tabs
    const tabs = varying
        .map((dim, i) => {
            const active = i === 0;
            return `<button
class="dim-tab px-3 py-1.5 text-xs rounded-md transition-colors ${active ? "bg-zinc-700 text-zinc-200" : "bg-zinc-800/50 text-zinc-500 hover:text-zinc-300"}"
data-agent="${agentId}"
data-dim="${escHtml(dim)}"
onclick="switchDim('${agentId}', '${escHtml(dim)}')"
>${escHtml(dim)}</button>`;
        })
        .join("\n");
    return `
<div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">
<div class="flex items-center justify-between mb-5">
<span class="text-xs text-zinc-600 uppercase tracking-wider">success rate</span>
<div class="flex gap-1.5">
${tabs}
</div>
</div>
${charts.join("\n")}
</div>`;
}
410
+ // ---------------------------------------------------------------------------
411
+ // Attribution Cards
412
+ // ---------------------------------------------------------------------------
413
/**
 * Render "dimension impact" cards from the group's controlled pairs.
 *
 * Pairs are bucketed by `variedDimension`; buckets are ordered by their
 * largest absolute delta, descending. Each card shows the average delta, the
 * number of comparisons and up to three example transitions. Deltas are
 * multiplied by 100 for display.
 *
 * @param {{controlledPairs: Array<{variedDimension: string, delta: number, variedFrom: string, variedTo: string}>}} group
 * @returns {string} HTML fragment, or "" when there are no controlled pairs.
 */
function renderAttribution(group) {
    const pairs = group.controlledPairs;
    if (pairs.length === 0) {
        return "";
    }
    // Bucket pairs per varied dimension, keeping first-seen order.
    const byDim = new Map();
    for (const pair of pairs) {
        const bucket = byDim.get(pair.variedDimension) ?? { deltas: [], pairs: [] };
        bucket.deltas.push(pair.delta);
        bucket.pairs.push(pair);
        byDim.set(pair.variedDimension, bucket);
    }
    const peak = (bucket) => Math.max(...bucket.deltas.map(Math.abs));
    const ranked = [...byDim.entries()].sort((a, b) => peak(b[1]) - peak(a[1]));

    const signOf = (value) => (value > 0 ? "+" : "");
    const clip = (text) => (text.length > 20 ? text.slice(0, 19) + "…" : text);
    const renderExample = (pair) => {
        const shown = (pair.delta * 100).toFixed(0);
        let tone = "text-zinc-600";
        if (pair.delta > 0) {
            tone = "text-green-600";
        }
        else if (pair.delta < 0) {
            tone = "text-red-600";
        }
        return `<div class="text-xs ${tone}">${escHtml(clip(pair.variedFrom))} &rarr; ${escHtml(clip(pair.variedTo))}: ${signOf(pair.delta)}${shown}%</div>`;
    };
    const renderCard = ([dim, { deltas, pairs: dimPairs }]) => {
        const avgDelta = deltas.reduce((sum, d) => sum + d, 0) / deltas.length;
        const avgStr = `${signOf(avgDelta)}${(avgDelta * 100).toFixed(0)}%`;
        let tone = "text-zinc-500";
        if (avgDelta > 0) {
            tone = "text-green-400";
        }
        else if (avgDelta < 0) {
            tone = "text-red-400";
        }
        const examples = dimPairs.slice(0, 3).map((pair) => renderExample(pair)).join("\n");
        const plural = deltas.length !== 1 ? "s" : "";
        return [
            "",
            '<div class="rounded-lg border border-zinc-800 bg-zinc-900/50 p-4">',
            '<div class="flex items-center justify-between mb-2">',
            `<span class="text-sm text-zinc-300 font-medium">${escHtml(dim)}</span>`,
            `<span class="text-lg font-bold ${tone}">${avgStr} avg</span>`,
            "</div>",
            `<div class="text-xs text-zinc-500 mb-2">${deltas.length} controlled comparison${plural}</div>`,
            `<div class="space-y-1">${examples}</div>`,
            "</div>",
        ].join("\n");
    };
    const cards = ranked.map((entry) => renderCard(entry)).join("\n");
    return [
        "",
        '<div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">',
        '<div class="mb-4">',
        '<span class="text-xs text-zinc-600 uppercase tracking-wider">dimension impact (single-variable comparisons)</span>',
        "</div>",
        '<div class="grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-3 gap-3">',
        cards,
        "</div>",
        "</div>",
    ].join("\n");
}
461
+ // ---------------------------------------------------------------------------
462
+ // Per-group evolution (grouped by primary dimension)
463
+ // ---------------------------------------------------------------------------
464
/**
 * Render one timeline card per value of the group's first varying dimension.
 *
 * Within each card, runs are ordered by timestamp (oldest first). Every run
 * after the first carries its percentage-point delta against the previous run
 * and up to four "dimension: from → to" lines describing what changed. When
 * no dimension varies, the flat timeline is rendered instead.
 *
 * @param {{runs: Array<{report: object}>, varyingDims: string[]}} group
 * @returns {string} HTML fragment.
 */
function renderGroupedEvolution(group) {
    const reports = group.runs.map((r) => r.report);
    const { varyingDims } = group;
    if (varyingDims.length === 0) {
        // No varying dims — flat timeline
        return renderFlatEvolution(group);
    }
    const [primaryDim] = varyingDims;
    const byOldestFirst = (a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime();

    const renderCard = (dimVal, dimReports) => {
        const sorted = [...dimReports].sort(byOldestFirst);
        const entries = sorted.map((report, i) => {
            if (i === 0) {
                // The first run has nothing to be compared against.
                return { report, delta: undefined, diffLines: [] };
            }
            const prev = sorted[i - 1];
            const delta = (report.successRate - prev.successRate) * 100;
            const diff = diffConfigs(prev.dimensions ?? {}, report.dimensions ?? {});
            const diffLines = Object.entries(diff.varied)
                .map(([k, v]) => `${k}: ${v.from} → ${v.to}`)
                .slice(0, 4);
            return { report, delta, diffLines };
        });
        const rows = entries.map((e, i) => renderRunRow(e, i)).join("\n");
        const heading = dimVal.length > 30 ? dimVal.slice(0, 29) + "…" : dimVal;
        const plural = sorted.length !== 1 ? "s" : "";
        return [
            "",
            '<div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">',
            '<div class="flex items-center justify-between mb-4">',
            `<span class="text-xs text-zinc-600 uppercase tracking-wider">${escHtml(primaryDim)}: ${escHtml(heading)}</span>`,
            `<span class="text-xs text-zinc-600">${sorted.length} run${plural}</span>`,
            "</div>",
            '<div class="space-y-0">',
            rows,
            "</div>",
            "</div>",
        ].join("\n");
    };

    const dimGroups = groupByDimension(reports, primaryDim);
    const cards = [];
    for (const [dimVal, dimReports] of dimGroups.entries()) {
        cards.push(renderCard(dimVal, dimReports));
    }
    return cards.join("\n");
}
506
/**
 * Render every run of a group as a single timeline card, in the order the
 * runs are stored on the group.
 *
 * @param {{runs: object[]}} group Each run is passed to `renderRunRow` with its index.
 * @returns {string} HTML fragment.
 */
function renderFlatEvolution(group) {
    const { runs } = group;
    const rowHtml = [];
    runs.forEach((run, position) => {
        rowHtml.push(renderRunRow(run, position));
    });
    return [
        "",
        '<div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">',
        '<div class="flex items-center justify-between mb-4">',
        `<span class="text-xs text-zinc-600 uppercase tracking-wider">success rate &middot; ${runs.length} runs</span>`,
        '<span class="text-xs text-zinc-600">oldest &rarr; newest</span>',
        "</div>",
        '<div class="space-y-0">',
        rowHtml.join("\n"),
        "</div>",
        "</div>",
    ].join("\n");
}
519
+ // ---------------------------------------------------------------------------
520
+ // Agent section
521
+ // ---------------------------------------------------------------------------
522
/**
 * Render the "accuracy vs speed" scatter plot card for one agent.
 *
 * Each report becomes a point: x is the average duration per case in seconds
 * (`duration / totalCases / 1000`, or 0 when there are no cases) and y is the
 * success rate in percent. Points are grouped into one Chart.js dataset per
 * model. Dashed quadrant lines are drawn at the midpoint of the observed x
 * range and at 50% accuracy.
 *
 * Safety notes:
 *  - Data inlined into `<script>` is serialized with every "<" written as
 *    "\u003c", so a value containing "</script>" cannot end the script.
 *  - The canvas id is built from the label with whitespace and every
 *    character that would need escaping in an attribute value or in a
 *    single-quoted JavaScript string collapsed to "-", so the id written to
 *    the markup and the one looked up by the script are always identical.
 *
 * @param {{label: string, runs: Array<{report: object}>}} group Agent group.
 * @returns {string} HTML fragment, or "" when there are fewer than two reports.
 */
function renderScatterPlot(group) {
    const reports = group.runs.map((r) => r.report);
    if (reports.length < 2) {
        return "";
    }
    // JSON that is safe to place inside a <script> element.
    const inlineJson = (value) => JSON.stringify(value).replace(/</g, "\\u003c");
    const allDims = [...new Set(reports.flatMap((r) => Object.keys(r.dimensions ?? {})))];
    // Group data points by model
    const byModel = new Map();
    for (const r of reports) {
        const model = r.dimensions?.["model"] ?? r.model ?? "?";
        const avgDurSec = r.totalCases > 0 ? +(r.duration / r.totalCases / 1000).toFixed(2) : 0;
        const accuracy = +(r.successRate * 100).toFixed(1);
        // Tooltip line listing every non-model dimension of this run.
        const configLabel = allDims
            .filter((d) => d !== "model")
            .map((d) => `${d}: ${r.dimensions?.[d] ?? "?"}`)
            .join(", ");
        const arr = byModel.get(model) ?? [];
        arr.push({ x: avgDurSec, y: accuracy, label: configLabel });
        byModel.set(model, arr);
    }
    const uniqueModels = [...byModel.keys()];
    const datasets = uniqueModels.map((model, i) => {
        const color = SERIES_COLORS[i % SERIES_COLORS.length];
        // Keep the segment after the last "/" and clip it to 24 characters.
        const short = model.split("/").pop()?.slice(0, 24) ?? model.slice(0, 24);
        return {
            label: short,
            data: byModel.get(model),
            backgroundColor: color.bg,
            borderColor: color.text,
            pointRadius: 7,
            pointHoverRadius: 9,
        };
    });
    const allX = [...byModel.values()].flat().map((p) => p.x);
    const midX = allX.length > 0 ? +((Math.min(...allX) + Math.max(...allX)) / 2).toFixed(2) : 0;
    const agentId = String(group.label).replace(/[\s"'\\<>&]+/g, "-").toLowerCase();
    const canvasId = `scatter-${agentId}`;
    return `
<div class="rounded-xl border border-zinc-800 bg-zinc-900 p-5 mb-4">
<div class="mb-4">
<span class="text-xs text-zinc-600 uppercase tracking-wider">accuracy vs speed</span>
</div>
<div style="position:relative;height:320px">
<canvas id="${canvasId}"></canvas>
</div>
<script>
new Chart(document.getElementById('${canvasId}'), {
type: 'scatter',
data: { datasets: ${inlineJson(datasets)} },
options: {
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: { labels: { color: '#a1a1aa', font: { family: 'ui-monospace, monospace', size: 10 }, boxWidth: 12, padding: 16 } },
tooltip: {
callbacks: {
label: function(ctx) {
var p = ctx.raw;
var lines = [ctx.dataset.label + ': ' + p.y + '% accuracy, ' + p.x.toFixed(1) + 's/case'];
if (p.label) lines.push(p.label);
return lines;
}
}
}
},
scales: {
x: {
title: { display: true, text: 'avg duration per case (s)', color: '#71717a', font: { family: 'ui-monospace, monospace', size: 11 } },
ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 } },
grid: { color: '#27272a' }
},
y: {
min: 0, max: 100,
title: { display: true, text: 'accuracy (%)', color: '#71717a', font: { family: 'ui-monospace, monospace', size: 11 } },
ticks: { color: '#71717a', font: { family: 'ui-monospace, monospace', size: 10 }, callback: function(v) { return v + '%'; } },
grid: { color: '#27272a' }
}
}
},
plugins: [{
id: 'quadrantLines',
afterDraw: function(chart) {
var ctx = chart.ctx;
var area = chart.chartArea;
var xScale = chart.scales.x;
var yScale = chart.scales.y;
var midXPx = xScale.getPixelForValue(${midX});
var midYPx = yScale.getPixelForValue(50);

ctx.save();
ctx.setLineDash([6, 4]);
ctx.lineWidth = 1;
ctx.strokeStyle = 'rgba(113, 113, 122, 0.4)';

ctx.beginPath();
ctx.moveTo(midXPx, area.top);
ctx.lineTo(midXPx, area.bottom);
ctx.stroke();

ctx.beginPath();
ctx.moveTo(area.left, midYPx);
ctx.lineTo(area.right, midYPx);
ctx.stroke();

ctx.setLineDash([]);
ctx.font = '10px ui-monospace, monospace';
ctx.fillStyle = 'rgba(113, 113, 122, 0.5)';

ctx.textAlign = 'left';
ctx.textBaseline = 'top';
ctx.fillText('Ideal', area.left + 8, area.top + 8);

ctx.textAlign = 'right';
ctx.fillText('Smart but slow', area.right - 8, area.top + 8);

ctx.textBaseline = 'bottom';
ctx.fillText('Dumb and slow', area.right - 8, area.bottom - 8);

ctx.textAlign = 'left';
ctx.fillText('Dumb and fast', area.left + 8, area.bottom - 8);

ctx.restore();
}
}]
});
</script>
</div>`;
}
649
/**
 * Render the page section for one agent: a heading with the group label,
 * followed by the grouped bar chart card and the scatter plot card.
 *
 * @param {{label: string}} group Agent group; passed through to both chart renderers.
 * @returns {string} HTML `<section>` fragment.
 */
function renderAgentSection(group) {
    const barCard = renderGroupedBarChart(group);
    const scatterCard = renderScatterPlot(group);
    const heading = `<h2 class="text-base font-semibold mb-4 text-zinc-400 uppercase tracking-widest">${escHtml(group.label)}</h2>`;
    return [
        "",
        '<section class="mb-12">',
        heading,
        "",
        barCard,
        scatterCard,
        "</section>",
    ].join("\n");
}
660
+ // ---------------------------------------------------------------------------
661
+ // Full HTML page
662
+ // ---------------------------------------------------------------------------
663
/**
 * Build the complete preview page.
 *
 * The page loads Tailwind and Chart.js from their CDNs, defines the
 * client-side `switchDim` tab handler, and contains one section per agent
 * group between a header (report count and generation time) and a footer.
 *
 * @param {object[]} groups Agent groups, rendered in the given order.
 * @param {number} totalReports Number of reports, shown in the header.
 * @returns {string} A full HTML document.
 */
function generateHTML(groups, totalReports) {
    const sectionHtml = [];
    for (const agentGroup of groups) {
        sectionHtml.push(renderAgentSection(agentGroup));
    }
    const sections = sectionHtml.join("\n");
    const generated = new Date().toLocaleString();
    const plural = totalReports !== 1 ? "s" : "";
    // Client-side handler: shows the chart view of the chosen dimension and
    // swaps the active/inactive class pairs on the tabs of the same agent.
    const clientScript = `function switchDim(agent, dim) {
document.querySelectorAll('.chart-view[data-agent="' + agent + '"]').forEach(el => {
el.style.display = el.dataset.dim === dim ? 'block' : 'none';
});
document.querySelectorAll('.dim-tab[data-agent="' + agent + '"]').forEach(el => {
if (el.dataset.dim === dim) {
el.className = el.className.replace('bg-zinc-800/50 text-zinc-500', 'bg-zinc-700 text-zinc-200');
} else {
el.className = el.className.replace('bg-zinc-700 text-zinc-200', 'bg-zinc-800/50 text-zinc-500');
}
});
}`;
    return `<!DOCTYPE html>
<html lang="en" class="bg-zinc-950 text-zinc-100">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>agest preview</title>
<script src="https://cdn.tailwindcss.com"></script>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script>
${clientScript}
</script>
</head>
<body class="min-h-screen font-mono p-8">
<div class="max-w-4xl mx-auto">

<header class="mb-10">
<h1 class="text-2xl font-bold tracking-tight">agest</h1>
<p class="text-zinc-500 text-sm mt-1">${totalReports} report${plural} &middot; generated ${generated}</p>
</header>

${sections}

<footer class="mt-16 border-t border-zinc-800 pt-6 text-xs text-zinc-600">
agest by <a href="https://sebastiantuyu.com" target="_blank" class="text-zinc-500 hover:text-zinc-300 transition-colors">sebastiantuyu</a>
</footer>

</div>
</body>
</html>`;
}
707
+ // ---------------------------------------------------------------------------
708
+ // Main
709
+ // ---------------------------------------------------------------------------
710
/**
 * Entry point: collect the reports below the current working directory,
 * group them by agent name, render the preview page to a temporary file and
 * open it.
 *
 * Steps:
 *  1. Find and parse every report; stop with a hint when none exist.
 *  2. Run `ensureDimensions` on each report (backward compatibility).
 *  3. Group by `name` (reports without a name share one "Unnamed" group that
 *     is listed last), ordering each group oldest to newest.
 *  4. For each group compute varying dimensions, controlled pairs, and per-run
 *     deltas and prompt diffs against the preceding run.
 *  5. Write the HTML to the OS temp directory and open it.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const UNNAMED = "__unnamed__";
    const cwd = process.cwd();
    const files = await findReports(cwd);
    if (files.length === 0) {
        console.log("\n No reports found. Run some agent tests first.\n");
        return;
    }
    const loadReport = async (file) => {
        const content = await readFile(file, "utf-8");
        return parseReport(content, relative(cwd, file));
    };
    const reports = await Promise.all(files.map((file) => loadReport(file)));
    // Ensure all reports have dimensions (backward compat)
    await Promise.all(reports.map((report) => ensureDimensions(report)));

    // Bucket by agent name, then order every bucket oldest -> newest.
    const byOldestFirst = (a, b) => new Date(a.timestamp).getTime() - new Date(b.timestamp).getTime();
    const groupMap = new Map();
    for (const report of reports) {
        const key = report.name ?? UNNAMED;
        if (!groupMap.has(key)) {
            groupMap.set(key, []);
        }
        groupMap.get(key).push(report);
    }
    groupMap.forEach((bucket) => {
        bucket.sort(byOldestFirst);
    });

    // Named groups in default sort order, the unnamed group (if any) last.
    const orderedKeys = [...groupMap.keys()].filter((k) => k !== UNNAMED).sort();
    if (groupMap.has(UNNAMED)) {
        orderedKeys.push(UNNAMED);
    }

    const buildGroup = async (key) => {
        const sorted = groupMap.get(key);
        const varyingDims = findVaryingDimensions(sorted);
        const controlledPairs = findControlledPairs(sorted);
        // Load diff entries for consecutive run diffs; runs without a prompt
        // hash resolve to null.
        const diffEntries = await Promise.all(sorted.map((report) => (report.systemPromptHash ? loadDiffEntry(report.systemPromptHash) : null)));
        const runs = sorted.map((report, i) => {
            if (i === 0) {
                // The first run has no predecessor to compare against.
                return { report, delta: undefined, diffLines: [] };
            }
            const before = diffEntries[i - 1];
            const after = diffEntries[i];
            return {
                report,
                delta: (report.successRate - sorted[i - 1].successRate) * 100,
                diffLines: before && after ? computeDiff(before, after) : [],
            };
        });
        return {
            label: key === UNNAMED ? "Unnamed" : key,
            runs,
            varyingDims,
            controlledPairs,
        };
    };
    const groups = await Promise.all(orderedKeys.map((key) => buildGroup(key)));

    const html = generateHTML(groups, reports.length);
    const tmpPath = join(os.tmpdir(), `agest-preview-${Date.now()}.html`);
    await writeFile(tmpPath, html, "utf-8");
    console.log(`\n Preview: ${tmpPath}\n`);
    openBrowser(tmpPath);
}
774
// Run the CLI; on any rejection print the error message and exit with status 1.
main().then(undefined, (failure) => {
    console.error("Error:", failure.message);
    process.exit(1);
});