agent-duelist 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -49,14 +49,154 @@ function bold(s5) {
49
49
  function dim(s5) {
50
50
  return `${dimCode}${s5}${reset}`;
51
51
  }
52
- function colorScore(value) {
53
- const pct = Math.round(value * 100);
54
- const str = `${pct}%`;
55
- if (value >= 0.8) return `${green}${str}${reset}`;
56
- if (value >= 0.5) return `${yellow}${str}${reset}`;
57
- return `${red}${str}${reset}`;
58
- }
59
- function consoleReporter(results) {
52
+ function stripAnsi(s5) {
53
+ return s5.replace(/\x1b\[[0-9;]*m/g, "");
54
+ }
55
+ function displayWidth(s5) {
56
+ const stripped = stripAnsi(s5);
57
+ let width = 0;
58
+ for (const ch of stripped) {
59
+ const code = ch.codePointAt(0) ?? 0;
60
+ if (code >= 126976) width += 2;
61
+ else if (code >= 9728 && code <= 10175) width += 2;
62
+ else width += 1;
63
+ }
64
+ return width;
65
+ }
66
+ function padCell(str, targetWidth, align) {
67
+ const dw = displayWidth(str);
68
+ const padding = Math.max(0, targetWidth - dw);
69
+ if (align === "right") return " ".repeat(padding) + str;
70
+ return str + " ".repeat(padding);
71
+ }
72
+ function sparkBar(ratio, width = 8) {
73
+ const clamped = Math.max(0, Math.min(1, ratio));
74
+ const fillLen = Math.round(clamped * width);
75
+ const fill = "\u2593".repeat(fillLen);
76
+ const track = "\u2591".repeat(width - fillLen);
77
+ return { fill, track };
78
+ }
79
+ function drawTableLine(widths, position) {
80
+ const totalInner = widths.reduce((sum, w4) => sum + w4 + 2, 0) + widths.length - 1;
81
+ if (position === "bottom") {
82
+ return dim(`\u2514${"\u2500".repeat(totalInner)}\u2518`);
83
+ }
84
+ if (position === "merge") {
85
+ return dim(`\u251C${"\u2500".repeat(totalInner)}\u2524`);
86
+ }
87
+ const segments = widths.map((w4) => "\u2500".repeat(w4 + 2));
88
+ if (position === "top") {
89
+ return dim(`\u250C${segments.join("\u252C")}\u2510`);
90
+ }
91
+ return dim(`\u251C${segments.join("\u253C")}\u2524`);
92
+ }
93
+ function drawTableRow(cells, widths, aligns) {
94
+ const parts = cells.map(
95
+ (cell, i7) => " " + padCell(cell, widths[i7], aligns[i7]) + " "
96
+ );
97
+ return dim("\u2502") + parts.join(dim("\u2502")) + dim("\u2502");
98
+ }
99
+ function drawSpanRow(content, widths) {
100
+ const totalInner = widths.reduce((sum, w4) => sum + w4 + 2, 0) + widths.length - 1;
101
+ const dw = displayWidth(content);
102
+ const padding = Math.max(0, totalInner - dw - 1);
103
+ return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
104
+ }
105
+ function computeColumnStats(providerData, scorerNames) {
106
+ const stats = /* @__PURE__ */ new Map();
107
+ const valid = providerData.filter((p5) => !p5.allErrors);
108
+ if (scorerNames.includes("latency")) {
109
+ const values = /* @__PURE__ */ new Map();
110
+ for (const p5 of providerData) {
111
+ values.set(p5.providerId, p5.allErrors ? void 0 : p5.latencyMs);
112
+ }
113
+ const nums = valid.map((p5) => p5.latencyMs).filter((v4) => v4 !== void 0);
114
+ stats.set("latency", {
115
+ values,
116
+ best: nums.length > 0 ? Math.min(...nums) : void 0,
117
+ worst: nums.length > 0 ? Math.max(...nums) : void 0
118
+ });
119
+ }
120
+ if (scorerNames.includes("cost")) {
121
+ const costValues = /* @__PURE__ */ new Map();
122
+ const tokenValues = /* @__PURE__ */ new Map();
123
+ for (const p5 of providerData) {
124
+ costValues.set(p5.providerId, p5.allErrors ? void 0 : p5.avgDetails.costUsd);
125
+ tokenValues.set(p5.providerId, p5.allErrors ? void 0 : p5.avgDetails.totalTokens);
126
+ }
127
+ const costNums = valid.map((p5) => p5.avgDetails.costUsd).filter((v4) => v4 !== void 0);
128
+ const tokenNums = valid.map((p5) => p5.avgDetails.totalTokens).filter((v4) => v4 !== void 0);
129
+ stats.set("cost", {
130
+ values: costValues,
131
+ best: costNums.length > 0 ? Math.min(...costNums) : void 0,
132
+ worst: costNums.length > 0 ? Math.max(...costNums) : void 0
133
+ });
134
+ stats.set("tokens", {
135
+ values: tokenValues,
136
+ best: tokenNums.length > 0 ? Math.min(...tokenNums) : void 0,
137
+ worst: tokenNums.length > 0 ? Math.max(...tokenNums) : void 0
138
+ });
139
+ }
140
+ for (const name of scorerNames) {
141
+ if (name === "latency" || name === "cost") continue;
142
+ const values = /* @__PURE__ */ new Map();
143
+ for (const p5 of providerData) {
144
+ values.set(p5.providerId, p5.allErrors ? void 0 : p5.avgScores[name]);
145
+ }
146
+ const nums = valid.map((p5) => p5.avgScores[name]).filter((v4) => v4 !== void 0);
147
+ stats.set(name, {
148
+ values,
149
+ best: nums.length > 0 ? Math.max(...nums) : void 0,
150
+ worst: nums.length > 0 ? Math.min(...nums) : void 0
151
+ });
152
+ }
153
+ return stats;
154
+ }
155
+ function colorByRank(text, value, colStats, providerCount) {
156
+ if (value === void 0) return dim("\u2014");
157
+ if (providerCount < 2) return text;
158
+ if (colStats.best === void 0 || colStats.worst === void 0) return text;
159
+ if (colStats.best === colStats.worst) return text;
160
+ if (value === colStats.best) return `${brightGreen}${boldCode}${text}${reset}`;
161
+ if (value === colStats.worst) return `${red}${text}${reset}`;
162
+ return `${yellow}${text}${reset}`;
163
+ }
164
+ function computeMedals(columnStats, providerIds) {
165
+ const medals = /* @__PURE__ */ new Map();
166
+ if (providerIds.length < 2) {
167
+ for (const id of providerIds) medals.set(id, "");
168
+ return medals;
169
+ }
170
+ const wins = /* @__PURE__ */ new Map();
171
+ for (const id of providerIds) wins.set(id, 0);
172
+ for (const [, colStats] of columnStats) {
173
+ if (colStats.best === void 0) continue;
174
+ for (const [providerId, value] of colStats.values) {
175
+ if (value !== void 0 && value === colStats.best) {
176
+ wins.set(providerId, (wins.get(providerId) ?? 0) + 1);
177
+ }
178
+ }
179
+ }
180
+ const totalWins = [...wins.values()].reduce((a7, b3) => a7 + b3, 0);
181
+ if (totalWins === 0) {
182
+ for (const id of providerIds) medals.set(id, "");
183
+ return medals;
184
+ }
185
+ const sorted = [...wins.entries()].sort(
186
+ (a7, b3) => b3[1] - a7[1] || a7[0].localeCompare(b3[0])
187
+ );
188
+ const medalList = ["\u{1F947}", "\u{1F948}", "\u{1F949}"];
189
+ let rank = 0;
190
+ for (let i7 = 0; i7 < sorted.length; i7++) {
191
+ if (i7 > 0 && sorted[i7][1] < sorted[i7 - 1][1]) {
192
+ rank = i7;
193
+ }
194
+ medals.set(sorted[i7][0], rank < medalList.length ? medalList[rank] : "");
195
+ }
196
+ return medals;
197
+ }
198
+ function consoleReporter(results, options) {
199
+ const showSparklines = options?.sparklines ?? true;
60
200
  if (results.length === 0) {
61
201
  console.log("\nNo results to display.\n");
62
202
  return;
@@ -66,78 +206,155 @@ function consoleReporter(results) {
66
206
  const scorerNames = [...new Set(results.flatMap((r3) => r3.scores.map((s5) => s5.name)))];
67
207
  const hasCost = scorerNames.includes("cost");
68
208
  const hasErrors = results.some((r3) => r3.error);
209
+ const multi = providers.length >= 2;
69
210
  const runsPerCell = Math.max(...results.map((r3) => r3.run));
70
- const runLabel = runsPerCell > 1 ? ` (${runsPerCell} runs each)` : "";
211
+ const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
71
212
  console.log("");
72
- console.log(` ${bold(`\u2B21 Agent Duelist Results${runLabel}`)}`);
73
- console.log(` ${dim("\u2500".repeat(70))}`);
213
+ console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
214
+ console.log(` ${dim("\u2501".repeat(72))}`);
74
215
  console.log("");
75
216
  for (const task of tasks) {
76
217
  console.log(` ${bold(`Task: ${task}`)}`);
77
- const cols = [{ label: "Provider", width: 22, align: "left" }];
78
- for (const name of scorerNames) {
79
- if (name === "latency") cols.push({ label: "Latency", width: 10, align: "right" });
80
- else if (name === "cost") {
81
- cols.push({ label: "Cost", width: 12, align: "right" });
82
- cols.push({ label: "Tokens", width: 9, align: "right" });
83
- } else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
84
- else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
85
- else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
86
- else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
87
- else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
88
- else cols.push({ label: name, width: 10, align: "right" });
89
- }
90
- if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
91
- const totalWidth = cols.reduce((sum, c3) => sum + c3.width + 2, 0);
92
- console.log(` ${dim(cols.map((c3) => pad(c3.label, c3.width + 2, c3.align)).join(""))}`);
93
- console.log(` ${dim("\u2500".repeat(totalWidth))}`);
94
- for (const provider of providers) {
95
- const taskResults = results.filter(
96
- (r3) => r3.taskName === task && r3.providerId === provider
97
- );
218
+ console.log("");
219
+ const providerData = providers.map((providerId) => {
220
+ const taskResults = results.filter((r3) => r3.taskName === task && r3.providerId === providerId);
98
221
  const errorResults2 = taskResults.filter((r3) => r3.error);
99
222
  const successResults = taskResults.filter((r3) => !r3.error);
100
- if (successResults.length === 0 && errorResults2.length > 0) {
101
- const cells2 = [pad(provider, 24, "left")];
102
- for (const name of scorerNames) {
103
- if (name === "cost") {
104
- cells2.push(pad("\u2014", 14, "right"));
105
- cells2.push(pad("\u2014", 11, "right"));
106
- } else cells2.push(pad("\u2014", cols.find((c3) => c3.label !== "Provider").width + 2, "right"));
107
- }
108
- if (hasErrors) cells2.push(` ${red}FAIL${reset}`);
109
- console.log(` ${cells2.join("")}`);
110
- continue;
223
+ if (successResults.length === 0) {
224
+ return {
225
+ providerId,
226
+ avgScores: {},
227
+ avgDetails: { costUsd: void 0, totalTokens: void 0 },
228
+ latencyMs: void 0,
229
+ allErrors: errorResults2.length > 0,
230
+ errorCount: errorResults2.length
231
+ };
111
232
  }
112
- const avgScores = averageScores(successResults);
113
- const avgDetails = averageDetails(successResults);
114
- const latencyMs = average(successResults.map((r3) => r3.raw.latencyMs));
115
- const cells = [pad(provider, 24, "left")];
116
- for (const name of scorerNames) {
117
- if (name === "latency") {
118
- cells.push(pad(latencyMs !== void 0 ? `${Math.round(latencyMs)}ms` : "\u2014", 12, "right"));
119
- } else if (name === "cost") {
120
- cells.push(pad(formatCost(avgDetails.costUsd), 14, "right"));
121
- cells.push(pad(avgDetails.totalTokens !== void 0 ? `${avgDetails.totalTokens}` : "\u2014", 11, "right"));
122
- } else {
123
- const val = avgScores[name];
124
- if (val === void 0) cells.push(pad("\u2014", 10, "right"));
125
- else cells.push(pad(colorScore(val), 10 + colorLen(colorScore(val)), "right"));
233
+ return {
234
+ providerId,
235
+ avgScores: averageScores(successResults),
236
+ avgDetails: averageDetails(successResults),
237
+ latencyMs: average(successResults.map((r3) => r3.raw.latencyMs)),
238
+ allErrors: false,
239
+ errorCount: errorResults2.length
240
+ };
241
+ });
242
+ const columnStats = computeColumnStats(providerData, scorerNames);
243
+ const medals = computeMedals(columnStats, providers);
244
+ const maxProviderLen = Math.max(...providers.map((id) => id.length));
245
+ const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
246
+ const cols = [
247
+ { label: "Provider", width: providerWidth, align: "left" }
248
+ ];
249
+ for (const name of scorerNames) {
250
+ if (name === "latency") {
251
+ cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
252
+ } else if (name === "cost") {
253
+ cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
254
+ cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
255
+ } else {
256
+ const label = name === "correctness" ? "Match" : name === "schema-correctness" ? "Schema" : name === "fuzzy-similarity" ? "Fuzzy" : name === "llm-judge-correctness" ? "Judge" : name === "tool-usage" ? "Tool" : name;
257
+ cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
258
+ }
259
+ }
260
+ if (hasErrors) {
261
+ cols.push({ label: "Status", width: 8, align: "left" });
262
+ }
263
+ const widths = cols.map((c3) => c3.width);
264
+ const aligns = cols.map((c3) => c3.align);
265
+ console.log(` ${drawTableLine(widths, "top")}`);
266
+ const headerCells = cols.map((c3) => bold(c3.label));
267
+ console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
268
+ console.log(` ${drawTableLine(widths, "header")}`);
269
+ for (const pd of providerData) {
270
+ const medal = medals.get(pd.providerId) ?? "";
271
+ const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
272
+ const cells = [providerCell];
273
+ if (pd.allErrors) {
274
+ for (const col of cols.slice(1)) {
275
+ if (col.label === "Status") {
276
+ cells.push(`${red}FAIL${reset}`);
277
+ } else {
278
+ cells.push(dim("\u2014"));
279
+ }
280
+ }
281
+ } else {
282
+ for (const col of cols.slice(1)) {
283
+ if (col.label === "Status") {
284
+ cells.push(
285
+ pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
286
+ );
287
+ continue;
288
+ }
289
+ const statsKey = col.statsKey;
290
+ const colStats = columnStats.get(statsKey);
291
+ if (statsKey === "latency") {
292
+ const ms = pd.latencyMs;
293
+ if (ms === void 0) {
294
+ cells.push(dim("\u2014"));
295
+ } else {
296
+ const text = `${Math.round(ms)}ms`;
297
+ cells.push(colStats ? colorByRank(text, ms, colStats, providers.length) : text);
298
+ }
299
+ } else if (statsKey === "cost") {
300
+ const cost = pd.avgDetails.costUsd;
301
+ if (cost === void 0) {
302
+ cells.push(dim("\u2014"));
303
+ } else {
304
+ const text = formatCost(cost);
305
+ cells.push(colStats ? colorByRank(text, cost, colStats, providers.length) : text);
306
+ }
307
+ } else if (statsKey === "tokens") {
308
+ const tokens = pd.avgDetails.totalTokens;
309
+ if (tokens === void 0) {
310
+ cells.push(dim("\u2014"));
311
+ } else {
312
+ const text = `${tokens}`;
313
+ cells.push(colStats ? colorByRank(text, tokens, colStats, providers.length) : text);
314
+ }
315
+ } else {
316
+ const val = pd.avgScores[statsKey];
317
+ if (val === void 0) {
318
+ cells.push(dim("\u2014"));
319
+ } else {
320
+ const pctStr = `${Math.round(val * 100)}%`.padStart(4);
321
+ let coloredPct;
322
+ if (multi && colStats) {
323
+ coloredPct = colorByRank(pctStr, val, colStats, providers.length);
324
+ } else {
325
+ if (val >= 0.8) coloredPct = `${green}${pctStr}${reset}`;
326
+ else if (val >= 0.5) coloredPct = `${yellow}${pctStr}${reset}`;
327
+ else coloredPct = `${red}${pctStr}${reset}`;
328
+ }
329
+ if (showSparklines) {
330
+ const { fill, track } = sparkBar(val);
331
+ const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
332
+ cells.push(`${coloredPct} ${barColor}${fill}${reset}${dim(track)}`);
333
+ } else {
334
+ cells.push(coloredPct);
335
+ }
336
+ }
337
+ }
126
338
  }
127
339
  }
128
- if (hasErrors) {
129
- const failCount = errorResults2.length;
130
- cells.push(failCount > 0 ? ` ${yellow}${failCount} err${reset}` : ` ${green}OK${reset}`);
340
+ console.log(` ${drawTableRow(cells, widths, aligns)}`);
341
+ }
342
+ if (multi && providerData.some((p5) => !p5.allErrors)) {
343
+ const winnerId = [...medals.entries()].find(([, m8]) => m8 === "\u{1F947}")?.[0];
344
+ if (winnerId) {
345
+ console.log(` ${drawTableLine(widths, "merge")}`);
346
+ const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
347
+ console.log(` ${drawSpanRow(winnerText, widths)}`);
131
348
  }
132
- console.log(` ${cells.join("")}`);
133
349
  }
350
+ console.log(` ${drawTableLine(widths, "bottom")}`);
134
351
  console.log("");
135
352
  }
136
353
  printSummary(results, providers);
137
354
  const errorResults = results.filter((r3) => r3.error);
138
355
  if (errorResults.length > 0) {
139
356
  console.log(` ${bold("Errors")}`);
140
- console.log(` ${dim("\u2500".repeat(70))}`);
357
+ console.log(` ${dim("\u2501".repeat(72))}`);
141
358
  const seen = /* @__PURE__ */ new Set();
142
359
  for (const r3 of errorResults) {
143
360
  const key = `${r3.providerId}::${r3.error}`;
@@ -145,7 +362,7 @@ function consoleReporter(results) {
145
362
  seen.add(key);
146
363
  const count = errorResults.filter((e5) => e5.providerId === r3.providerId && e5.error === r3.error).length;
147
364
  const suffix = count > 1 ? ` (\xD7${count})` : "";
148
- console.log(` ${red}\u2717${reset} ${r3.providerId}: ${r3.error}${suffix}`);
365
+ console.log(` ${red}\u2716${reset} ${r3.providerId}: ${r3.error}${suffix}`);
149
366
  const hint = apiKeyHint(r3.providerId, r3.error ?? "");
150
367
  if (hint) console.log(` ${dim(hint)}`);
151
368
  }
@@ -159,15 +376,20 @@ function consoleReporter(results) {
159
376
  function printSummary(results, providers) {
160
377
  const successResults = results.filter((r3) => !r3.error);
161
378
  if (successResults.length === 0) return;
162
- console.log(` ${dim("\u2500".repeat(70))}`);
163
379
  console.log(` ${bold("Summary")}`);
380
+ console.log(` ${dim("\u2501".repeat(72))}`);
164
381
  console.log("");
165
382
  const single = providers.length === 1;
166
383
  const correctnessKey = successResults.some((r3) => r3.scores.some((s5) => s5.name === "llm-judge-correctness" && s5.value >= 0)) ? "llm-judge-correctness" : "correctness";
167
384
  const byCorrectness = rankProviders(successResults, providers, correctnessKey);
168
385
  if (byCorrectness) {
169
- const label = single ? "Avg correctness" : `Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))}`;
170
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${colorScore(byCorrectness.avg)})`);
386
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
387
+ const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
388
+ if (single) {
389
+ console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
390
+ } else {
391
+ console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
392
+ }
171
393
  }
172
394
  const byLatency = providers.map((id) => {
173
395
  const runs = successResults.filter((r3) => r3.providerId === id);
@@ -175,8 +397,13 @@ function printSummary(results, providers) {
175
397
  return { id, avg: avg ?? Infinity };
176
398
  }).sort((a7, b3) => a7.avg - b3.avg)[0];
177
399
  if (byLatency && byLatency.avg !== Infinity) {
178
- const label = single ? "Avg latency" : `Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}`;
179
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${Math.round(byLatency.avg)}ms)`);
400
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
401
+ const msStr = `${Math.round(byLatency.avg)}ms`;
402
+ if (single) {
403
+ console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
404
+ } else {
405
+ console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
406
+ }
180
407
  }
181
408
  const byCost = providers.map((id) => {
182
409
  const runs = successResults.filter((r3) => r3.providerId === id);
@@ -188,8 +415,32 @@ function printSummary(results, providers) {
188
415
  return { id, avg };
189
416
  }).filter((p5) => p5.avg !== void 0).sort((a7, b3) => a7.avg - b3.avg)[0];
190
417
  if (byCost?.avg !== void 0) {
191
- const label = single ? "Avg cost" : `Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}`;
192
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${formatCost(byCost.avg)})`);
418
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
419
+ const costStr = formatCost(byCost.avg);
420
+ if (single) {
421
+ console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
422
+ } else {
423
+ console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
424
+ }
425
+ }
426
+ if (!single) {
427
+ const wins = /* @__PURE__ */ new Map();
428
+ for (const id of providers) wins.set(id, 0);
429
+ if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
430
+ if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
431
+ if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
432
+ const maxWins = Math.max(...wins.values());
433
+ if (maxWins > 0) {
434
+ const topProviders = [...wins.entries()].filter(([, w4]) => w4 === maxWins);
435
+ console.log("");
436
+ if (topProviders.length === 1) {
437
+ const [winnerId, winCount] = topProviders[0];
438
+ console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
439
+ } else {
440
+ const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
441
+ console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
442
+ }
443
+ }
193
444
  }
194
445
  console.log("");
195
446
  }
@@ -251,14 +502,6 @@ function formatCost(usd) {
251
502
  const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
252
503
  return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
253
504
  }
254
- function pad(str, width, align) {
255
- if (align === "right") return str.padStart(width);
256
- return str.padEnd(width);
257
- }
258
- function colorLen(str) {
259
- const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
260
- return str.length - stripped.length;
261
- }
262
505
  function apiKeyHint(providerId, error) {
263
506
  const lower = error.toLowerCase();
264
507
  const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
@@ -328,7 +571,7 @@ function providerLabel(providerId) {
328
571
  return `(${prefix})`;
329
572
  }
330
573
  }
331
- var reset, boldCode, dimCode, green, red, yellow, cyan;
574
+ var reset, boldCode, dimCode, green, red, yellow, cyan, brightGreen, brightWhite;
332
575
  var init_console = __esm({
333
576
  "src/reporter/console.ts"() {
334
577
  "use strict";
@@ -339,6 +582,8 @@ var init_console = __esm({
339
582
  red = "\x1B[31m";
340
583
  yellow = "\x1B[33m";
341
584
  cyan = "\x1B[36m";
585
+ brightGreen = "\x1B[92m";
586
+ brightWhite = "\x1B[97m";
342
587
  }
343
588
  });
344
589
 
@@ -375,6 +620,425 @@ var init_json = __esm({
375
620
  }
376
621
  });
377
622
 
623
+ // src/ci.ts
624
+ var ci_exports = {};
625
+ __export(ci_exports, {
626
+ compareResults: () => compareResults,
627
+ computeCostSummary: () => computeCostSummary,
628
+ computeScorerStats: () => computeScorerStats,
629
+ computeStats: () => computeStats,
630
+ loadBaseline: () => loadBaseline,
631
+ saveBaseline: () => saveBaseline
632
+ });
633
+ import { readFileSync, writeFileSync, mkdirSync } from "fs";
634
+ import { dirname } from "path";
635
+ function tCritical(df) {
636
+ if (df <= 0) return 1.96;
637
+ if (T_CRITICAL_95[df] !== void 0) return T_CRITICAL_95[df];
638
+ const keys = Object.keys(T_CRITICAL_95).map(Number).sort((a7, b3) => a7 - b3);
639
+ if (df > keys[keys.length - 1]) return 1.96;
640
+ for (let i7 = 0; i7 < keys.length - 1; i7++) {
641
+ if (df > keys[i7] && df < keys[i7 + 1]) {
642
+ const low = keys[i7], high = keys[i7 + 1];
643
+ const ratio = (df - low) / (high - low);
644
+ return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
645
+ }
646
+ }
647
+ return 1.96;
648
+ }
649
+ function computeScorerStats(samples) {
650
+ const n3 = samples.length;
651
+ if (n3 === 0) {
652
+ return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
653
+ }
654
+ const mean = samples.reduce((a7, b3) => a7 + b3, 0) / n3;
655
+ if (n3 === 1) {
656
+ return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
657
+ }
658
+ const variance = samples.reduce((sum, x) => sum + (x - mean) ** 2, 0) / (n3 - 1);
659
+ const stddev = Math.sqrt(variance);
660
+ const cv = mean !== 0 ? stddev / Math.abs(mean) : 0;
661
+ const se4 = stddev / Math.sqrt(n3);
662
+ const t3 = tCritical(n3 - 1);
663
+ return {
664
+ mean,
665
+ stddev,
666
+ cv,
667
+ n: n3,
668
+ ci95Lower: mean - t3 * se4,
669
+ ci95Upper: mean + t3 * se4
670
+ };
671
+ }
672
+ function groupKey(providerId, taskName, scorerName) {
673
+ return `${providerId}::${taskName}::${scorerName}`;
674
+ }
675
+ function computeStats(results) {
676
+ const grouped = /* @__PURE__ */ new Map();
677
+ for (const r3 of results) {
678
+ if (r3.error) continue;
679
+ for (const score of r3.scores) {
680
+ if (score.value < 0) continue;
681
+ const key = groupKey(r3.providerId, r3.taskName, score.name);
682
+ if (!grouped.has(key)) grouped.set(key, []);
683
+ grouped.get(key).push(score.value);
684
+ }
685
+ }
686
+ const stats = /* @__PURE__ */ new Map();
687
+ for (const [key, samples] of grouped) {
688
+ stats.set(key, computeScorerStats(samples));
689
+ }
690
+ return stats;
691
+ }
692
+ function computeCostSummary(results, budget) {
693
+ let totalUsd = 0;
694
+ const perProvider = /* @__PURE__ */ new Map();
695
+ for (const r3 of results) {
696
+ if (r3.error) continue;
697
+ const costScore = r3.scores.find((s5) => s5.name === "cost");
698
+ if (!costScore || costScore.value < 0) continue;
699
+ const details = costScore.details;
700
+ const usd = details?.estimatedUsd ?? 0;
701
+ if (usd <= 0) continue;
702
+ totalUsd += usd;
703
+ perProvider.set(r3.providerId, (perProvider.get(r3.providerId) ?? 0) + usd);
704
+ }
705
+ return {
706
+ totalUsd,
707
+ perProvider,
708
+ budget,
709
+ overBudget: budget !== void 0 && totalUsd > budget
710
+ };
711
+ }
712
+ function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
713
+ const comparisons = [];
714
+ const failureReasons = [];
715
+ for (const [key, current] of currentStats) {
716
+ const [providerId, taskName, scorerName] = key.split("::");
717
+ const baseline = baselineStats?.get(key) ?? null;
718
+ let delta = null;
719
+ let regressed = false;
720
+ let improved = false;
721
+ if (baseline) {
722
+ delta = current.mean - baseline.mean;
723
+ const threshold = thresholds.get(scorerName);
724
+ if (threshold !== void 0) {
725
+ const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
726
+ regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
727
+ improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
728
+ }
729
+ }
730
+ const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
731
+ comparisons.push({
732
+ providerId,
733
+ taskName,
734
+ scorerName,
735
+ baseline,
736
+ current,
737
+ delta,
738
+ regressed,
739
+ improved,
740
+ flaky
741
+ });
742
+ }
743
+ const cost = computeCostSummary(currentResults ?? [], budget);
744
+ const regressions = comparisons.filter((c3) => c3.regressed);
745
+ if (regressions.length > 0) {
746
+ for (const r3 of regressions) {
747
+ failureReasons.push(
748
+ `${r3.providerId} \xD7 ${r3.taskName}: ${r3.scorerName} regressed by ${formatDelta(r3.delta)}`
749
+ );
750
+ }
751
+ }
752
+ if (cost.overBudget) {
753
+ failureReasons.push(
754
+ `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
755
+ );
756
+ }
757
+ const flakyResults = comparisons.filter((c3) => c3.flaky);
758
+ const failed = failureReasons.length > 0;
759
+ return { comparisons, cost, failed, flakyResults, failureReasons };
760
+ }
761
+ function detectRegression(baseline, current, threshold, lowerIsBetter) {
762
+ if (baseline.n === 1 && current.n === 1) {
763
+ const delta = current.mean - baseline.mean;
764
+ if (lowerIsBetter) return delta > threshold;
765
+ return delta < -threshold;
766
+ }
767
+ if (lowerIsBetter) {
768
+ return current.ci95Lower - baseline.ci95Upper > threshold;
769
+ }
770
+ return baseline.ci95Upper - current.ci95Lower > threshold && current.mean < baseline.mean;
771
+ }
772
+ function detectImprovement(baseline, current, threshold, lowerIsBetter) {
773
+ if (baseline.n === 1 && current.n === 1) {
774
+ const delta = current.mean - baseline.mean;
775
+ if (lowerIsBetter) return delta < -threshold;
776
+ return delta > threshold;
777
+ }
778
+ if (lowerIsBetter) {
779
+ return baseline.ci95Lower - current.ci95Upper > threshold;
780
+ }
781
+ return current.ci95Lower - baseline.ci95Upper > threshold;
782
+ }
783
+ function formatDelta(delta) {
784
+ const sign = delta >= 0 ? "+" : "";
785
+ return `${sign}${delta.toFixed(4)}`;
786
+ }
787
+ function loadBaseline(path) {
788
+ try {
789
+ const raw = readFileSync(path, "utf-8");
790
+ const data = JSON.parse(raw);
791
+ const results = data.results ?? data;
792
+ if (!Array.isArray(results)) return null;
793
+ return {
794
+ timestamp: data.timestamp ?? "unknown",
795
+ results
796
+ };
797
+ } catch {
798
+ return null;
799
+ }
800
+ }
801
+ function saveBaseline(path, results) {
802
+ mkdirSync(dirname(path), { recursive: true });
803
+ const data = {
804
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
805
+ results
806
+ };
807
+ writeFileSync(path, JSON.stringify(data, null, 2));
808
+ }
809
+ var LOWER_IS_BETTER, FLAKY_CV_THRESHOLD, T_CRITICAL_95;
810
+ var init_ci = __esm({
811
+ "src/ci.ts"() {
812
+ "use strict";
813
+ LOWER_IS_BETTER = /* @__PURE__ */ new Set(["cost"]);
814
+ FLAKY_CV_THRESHOLD = 0.3;
815
+ T_CRITICAL_95 = {
816
+ 1: 12.706,
817
+ 2: 4.303,
818
+ 3: 3.182,
819
+ 4: 2.776,
820
+ 5: 2.571,
821
+ 6: 2.447,
822
+ 7: 2.365,
823
+ 8: 2.306,
824
+ 9: 2.262,
825
+ 10: 2.228,
826
+ 15: 2.131,
827
+ 20: 2.086,
828
+ 25: 2.06,
829
+ 30: 2.042
830
+ };
831
+ }
832
+ });
833
+
834
+ // src/reporter/markdown.ts
835
+ var markdown_exports = {};
836
+ __export(markdown_exports, {
837
+ COMMENT_MARKER: () => COMMENT_MARKER,
838
+ markdownComparisonTable: () => markdownComparisonTable,
839
+ markdownCostSummary: () => markdownCostSummary,
840
+ markdownReporter: () => markdownReporter
841
+ });
842
/**
 * Render a CI report as the Markdown body of a pull-request comment.
 *
 * The output always starts with COMMENT_MARKER and ends with a footer; the
 * comparison table, cost summary, flaky-result list and failure-reason list
 * are each emitted only when the report has data for them.
 *
 * @param {object} report - Report with `failed`, `comparisons`, `cost`,
 *   `flakyResults` and `failureReasons` fields.
 * @param {unknown} _current - Unused by this function.
 * @returns {string} Markdown text joined with "\n".
 */
function markdownReporter(report, _current) {
  const verdict = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${verdict}`, ""];

  if (report.comparisons.length > 0) {
    out.push(markdownComparisonTable(report.comparisons), "");
  }

  // Show the cost section when money was spent or a budget was configured.
  if (report.cost.totalUsd > 0 || report.cost.budget !== void 0) {
    out.push(markdownCostSummary(report.cost), "");
  }

  if (report.flakyResults.length > 0) {
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      ""
    );
    report.flakyResults.forEach((entry) => {
      out.push(`- **${entry.providerId}** \xD7 ${entry.taskName} \u2192 ${entry.scorerName} (CV = ${entry.current.cv.toFixed(2)})`);
    });
    out.push("");
  }

  if (report.failureReasons.length > 0) {
    out.push("### Failure Reasons", "");
    report.failureReasons.forEach((reason) => {
      out.push(`- ${reason}`);
    });
    out.push("");
  }

  out.push("---", "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
  return out.join("\n");
}
877
/**
 * Render baseline-vs-current comparisons as a Markdown table.
 *
 * Provider, task and scorer names are free-form strings, so they are escaped
 * before being placed in a cell: a literal `|` would otherwise start a new
 * column and a line break would end the row.
 *
 * @param {Array<object>} comparisons - Entries with `providerId`, `taskName`,
 *   `scorerName`, `baseline`, `current` and `delta` fields.
 * @returns {string} Table (header, separator, one row per entry) joined with "\n".
 */
function markdownComparisonTable(comparisons) {
  // Keep arbitrary text inside a single table cell.
  const cell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const lines = [];
  lines.push("| Provider | Task | Scorer | Baseline | Current | Delta | Status |");
  lines.push("|----------|------|--------|----------|---------|-------|--------|");
  for (const c3 of comparisons) {
    const baselineStr = c3.baseline ? formatStats(c3.baseline) : "\u2014";
    const currentStr = formatStats(c3.current);
    // `!= null` also covers an undefined delta, which formatDelta2 cannot format.
    const deltaStr = c3.delta != null ? formatDelta2(c3.delta) : "\u2014";
    const status = statusIndicator(c3);
    lines.push(`| ${cell(c3.providerId)} | ${cell(c3.taskName)} | ${cell(c3.scorerName)} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
  }
  return lines.join("\n");
}
890
/**
 * Render the cost section of the CI report as Markdown.
 *
 * Always prints the total. Prints a budget line when `cost.budget` is defined
 * (a non-positive budget is shown as "∞% used"). Prints a per-provider
 * table only when more than one provider is present.
 *
 * @param {object} cost - Object with `totalUsd`, optional `budget`,
 *   `overBudget` and a `perProvider` collection of [provider, usd] pairs.
 * @returns {string} Markdown text joined with "\n".
 */
function markdownCostSummary(cost) {
  const out = [
    "### \u{1F4B0} Cost Summary",
    "",
    `**Total:** $${cost.totalUsd.toFixed(4)}`
  ];

  if (cost.budget !== void 0) {
    // Avoid dividing by a zero or negative budget.
    const usedPct = cost.budget > 0 ? (cost.totalUsd / cost.budget * 100).toFixed(0) : "\u221E";
    const light = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    out.push(`**Budget:** $${cost.budget.toFixed(2)} (${usedPct}% used) ${light}`);
  }

  if (cost.perProvider.size > 1) {
    out.push("", "| Provider | Cost |", "|----------|------|");
    for (const [providerName, amountUsd] of cost.perProvider) {
      out.push(`| ${providerName} | $${amountUsd.toFixed(4)} |`);
    }
  }

  return out.join("\n");
}
910
/**
 * Format a stats record as "mean" or "mean ± margin" with three decimals.
 *
 * The margin is half the width of the [ci95Lower, ci95Upper] interval and is
 * shown only when the record has more than one sample (`n > 1`).
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} stats
 * @returns {string}
 */
function formatStats(stats) {
  const meanText = stats.mean.toFixed(3);
  if (!(stats.n > 1)) {
    return meanText;
  }
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${meanText} \xB1 ${halfWidth.toFixed(3)}`;
}
917
/**
 * Format a score delta with three decimals and an explicit "+" for
 * non-negative values.
 *
 * @param {number} delta
 * @returns {string}
 */
function formatDelta2(delta) {
  const fixed = delta.toFixed(3);
  return delta >= 0 ? `+${fixed}` : fixed;
}
921
/**
 * Pick the status label for one comparison row.
 *
 * Precedence: regressed, then improved, then "new" (baseline is exactly
 * null), otherwise unchanged.
 *
 * @param {{regressed: boolean, improved: boolean, baseline: object|null}} c3
 * @returns {string}
 */
function statusIndicator(c3) {
  // Ordered [condition, label] pairs; the first truthy condition wins.
  const rules = [
    [c3.regressed, "\u{1F534} regressed"],
    [c3.improved, "\u{1F7E2} improved"],
    [c3.baseline === null, "\u{1F195} new"]
  ];
  const match = rules.find(([applies]) => applies);
  return match ? match[1] : "\u26AA unchanged";
}
927
+ var COMMENT_MARKER;
928
+ var init_markdown = __esm({
929
+ "src/reporter/markdown.ts"() {
930
+ "use strict";
931
+ COMMENT_MARKER = "<!-- duelist-ci-report -->";
932
+ }
933
+ });
934
+
935
+ // src/github.ts
936
+ var github_exports = {};
937
+ __export(github_exports, {
938
+ detectGitHubContext: () => detectGitHubContext,
939
+ findExistingComment: () => findExistingComment,
940
+ upsertPrComment: () => upsertPrComment
941
+ });
942
+ import { readFileSync as readFileSync2 } from "fs";
943
/**
 * Build the GitHub context needed to comment on a pull request from the
 * process environment.
 *
 * Reads GITHUB_TOKEN and GITHUB_REPOSITORY ("owner/repo"). The PR number is
 * taken from the JSON file at GITHUB_EVENT_PATH: `pull_request.number` first,
 * otherwise `issue.number` when the issue carries a `pull_request` field.
 * DUELIST_PR_NUMBER is the fallback when the event yields no number.
 *
 * @returns {{token: string, owner: string, repo: string, prNumber: number} | null}
 *   The context, or null when the token, repository or PR number is missing.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath
  } = process.env;
  if (!token || !repository) return null;

  const [owner, repo] = repository.split("/");
  if (!owner || !repo) return null;

  let prNumber;
  if (eventPath) {
    try {
      const payload = JSON.parse(readFileSync2(eventPath, "utf-8"));
      const pullRequest = payload.pull_request;
      if (pullRequest && typeof pullRequest === "object") {
        prNumber = pullRequest.number;
      }
      const issue = payload.issue;
      if (!prNumber && issue && typeof issue === "object" && issue.pull_request) {
        prNumber = issue.number;
      }
    } catch {
      // Best effort: an unreadable or malformed event file falls through to
      // the DUELIST_PR_NUMBER override below.
    }
  }

  const override = process.env.DUELIST_PR_NUMBER;
  if (!prNumber && override) {
    prNumber = parseInt(override, 10);
  }

  return prNumber ? { token, owner, repo, prNumber } : null;
}
973
/**
 * Find the id of the first PR comment whose body contains `marker`.
 *
 * Pages through the issue-comments endpoint 50 at a time and stops at the
 * first match, at an empty page, or at a page shorter than the page size.
 *
 * A failed request (non-2xx) or a response that is not a JSON array is
 * reported with console.warn and treated as "not found", so the lookup stays
 * best-effort instead of failing silently or throwing while iterating.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} marker - Substring to look for in each comment body.
 * @returns {Promise<number|null>} The comment id, or null when none is found.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  let page = 1;
  while (true) {
    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
    const res = await fetch(url, {
      headers: {
        Authorization: `Bearer ${ctx.token}`,
        Accept: "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28"
      }
    });
    if (!res.ok) {
      console.warn(`Failed to list PR comments: ${res.status}`);
      return null;
    }
    const comments = await res.json();
    if (!Array.isArray(comments)) {
      console.warn("Failed to list PR comments: unexpected response shape");
      return null;
    }
    if (comments.length === 0) break;
    for (const comment of comments) {
      // `body` may be null or absent; optional chaining skips such comments.
      if (comment.body?.includes(marker)) {
        return comment.id;
      }
    }
    // A short page means there is nothing left to fetch.
    if (comments.length < perPage) break;
    page++;
  }
  return null;
}
998
/**
 * Create the PR comment, or update it in place when one containing `marker`
 * already exists.
 *
 * An existing comment (as found by findExistingComment) is PATCHed; otherwise
 * a new comment is POSTed to the pull request. A non-2xx response is reported
 * with console.warn and does not throw.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} body - Markdown body of the comment.
 * @param {string} marker - Substring identifying a previously posted comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const commentId = await findExistingComment(ctx, marker);
  const repoUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}`;

  // The two paths differ only in verb, target URL and warning text.
  const request = commentId
    ? {
        method: "PATCH",
        url: `${repoUrl}/issues/comments/${commentId}`,
        failure: "Failed to update PR comment"
      }
    : {
        method: "POST",
        url: `${repoUrl}/issues/${ctx.prNumber}/comments`,
        failure: "Failed to create PR comment"
      };

  const res = await fetch(request.url, {
    method: request.method,
    headers: {
      Authorization: `Bearer ${ctx.token}`,
      Accept: "application/vnd.github+json",
      "Content-Type": "application/json",
      "X-GitHub-Api-Version": "2022-11-28"
    },
    body: JSON.stringify({ body })
  });
  if (res.ok) return;

  const detail = await res.text();
  console.warn(`${request.failure}: ${res.status} ${detail}`);
}
1034
+ var API_BASE;
1035
+ var init_github = __esm({
1036
+ "src/github.ts"() {
1037
+ "use strict";
1038
+ API_BASE = "https://api.github.com";
1039
+ }
1040
+ });
1041
+
378
1042
  // node_modules/tsx/dist/temporary-directory-CwHp0_NW.mjs
379
1043
  import r from "path";
380
1044
  import o from "os";
@@ -6530,10 +7194,10 @@ var init_api = __esm({
6530
7194
  // src/cli.ts
6531
7195
  import "dotenv/config";
6532
7196
  import { Command } from "commander";
6533
- import { readFileSync, writeFileSync, existsSync } from "fs";
6534
- import { resolve, join, dirname } from "path";
7197
+ import { readFileSync as readFileSync3, writeFileSync as writeFileSync2, existsSync } from "fs";
7198
+ import { resolve, join, dirname as dirname2 } from "path";
6535
7199
  import { pathToFileURL, fileURLToPath } from "url";
6536
- var __dirname2 = dirname(fileURLToPath(import.meta.url));
7200
+ var __dirname2 = dirname2(fileURLToPath(import.meta.url));
6537
7201
  var program = new Command();
6538
7202
  program.name("duelist").description("Pit LLM providers against each other on agent tasks.").version(getVersion());
6539
7203
  program.command("init").description("Scaffold an arena.config.ts in the current directory").option("--force", "Overwrite existing config file").action((opts) => {
@@ -6545,11 +7209,11 @@ program.command("init").description("Scaffold an arena.config.ts in the current
6545
7209
  const templatePath = join(__dirname2, "..", "templates", "arena.config.ts");
6546
7210
  let template;
6547
7211
  if (existsSync(templatePath)) {
6548
- template = readFileSync(templatePath, "utf-8");
7212
+ template = readFileSync3(templatePath, "utf-8");
6549
7213
  } else {
6550
7214
  template = DEFAULT_TEMPLATE;
6551
7215
  }
6552
- writeFileSync(target, template);
7216
+ writeFileSync2(target, template);
6553
7217
  console.log(existsSync(target) && opts.force ? "Overwrote arena.config.ts" : "Created arena.config.ts");
6554
7218
  console.log("");
6555
7219
  console.log("Next steps:");
@@ -6557,17 +7221,119 @@ program.command("init").description("Scaffold an arena.config.ts in the current
6557
7221
  console.log(" 2. npx duelist run");
6558
7222
  });
6559
7223
// `duelist run`: load the arena config, run every benchmark and print the
// results with the selected reporter. Exits with code 1 on an unknown
// reporter, when the run throws, or when every result carries an error.
program
  .command("run")
  .description("Run benchmarks defined in your arena config")
  .option("-c, --config <path>", "Path to config file", "arena.config.ts")
  .option("--reporter <type>", "Output format: console or json", "console")
  .option("-q, --quiet", "Suppress per-result progress (show only final report)")
  .action(async (opts) => {
    const knownReporters = ["console", "json"];
    if (!knownReporters.includes(opts.reporter)) {
      console.error(`Unknown reporter "${opts.reporter}". Use "console" or "json".`);
      process.exit(1);
    }

    const typedArena = await loadArenaConfig(opts.config);
    try {
      // Per-result progress lines would corrupt JSON output, so they are
      // only shown for the console reporter.
      const wantsProgress = opts.reporter === "console" && !opts.quiet;
      const results = await typedArena.run({ onResult: wantsProgress ? logResult : void 0 });

      const { consoleReporter: consoleReporter2 } = await Promise.resolve().then(() => (init_console(), console_exports));
      const { jsonReporter: jsonReporter2 } = await Promise.resolve().then(() => (init_json(), json_exports));

      if (opts.reporter === "json") {
        console.log(jsonReporter2(results));
      } else {
        console.log("");
        consoleReporter2(results, { sparklines: typedArena.config?.sparklines });
      }

      const everyResultErrored = results.length > 0 && results.every((entry) => entry.error);
      if (everyResultErrored) process.exit(1);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`Benchmark failed: ${reason}`);
      process.exit(1);
    }
  });
7249
/**
 * Option parser for the repeatable `--threshold <scorer=delta>` flag.
 *
 * Splits on the first "=" only, so a stray second "=" ends up in the delta
 * text and is rejected instead of being silently dropped. The delta must be
 * a non-empty, finite number: `Number("")` is 0, so an empty delta is checked
 * explicitly rather than being accepted as a threshold of 0.
 *
 * On invalid input it prints an error and exits the process with code 1.
 *
 * @param {string} value - Raw option value, e.g. "correctness=0.1".
 * @param {Map<string, number>} previous - Thresholds collected so far; mutated.
 * @returns {Map<string, number>} The same map with the new entry set.
 */
function collectThreshold(value, previous) {
  const separatorAt = value.indexOf("=");
  const scorer = separatorAt === -1 ? "" : value.slice(0, separatorAt).trim();
  const rawDelta = separatorAt === -1 ? "" : value.slice(separatorAt + 1).trim();
  const delta = Number(rawDelta);
  if (!scorer || rawDelta === "" || !Number.isFinite(delta)) {
    console.error(`Invalid threshold format: "${value}". Expected scorer=delta (e.g., correctness=0.1)`);
    process.exit(1);
  }
  previous.set(scorer, delta);
  return previous;
}
7258
// `duelist ci`: run the benchmarks, compare them with the stored baseline,
// print the report, optionally post it as a PR comment and optionally save a
// new baseline. The process exits with 1 when the report is marked failed and
// 0 otherwise.
program
  .command("ci")
  .description("Run benchmarks, compare against baseline, and enforce quality gates")
  .option("-c, --config <path>", "Path to config file", "arena.config.ts")
  .option("--baseline <path>", "Baseline JSON file", ".duelist/baseline.json")
  .option("--budget <dollars>", "Max total cost in USD", parseFloat)
  .option("--threshold <scorer=delta>", "Regression threshold (repeatable)", collectThreshold, /* @__PURE__ */ new Map())
  .option("--update-baseline", "Save results as new baseline after passing")
  .option("--comment", "Post results as GitHub PR comment")
  .option("-q, --quiet", "Suppress per-result progress")
  .action(async (opts) => {
    const ciOpts = {
      configPath: opts.config,
      baselinePath: resolve(opts.baseline),
      budget: opts.budget,
      thresholds: opts.threshold,
      updateBaseline: opts.updateBaseline ?? false,
      comment: opts.comment ?? false,
      quiet: opts.quiet ?? false
    };

    // parseFloat yields NaN for non-numeric input, and every comparison with
    // NaN is false, so an unchecked value would silently disable the budget
    // gate. Reject it before any benchmark runs.
    if (ciOpts.budget !== void 0 && (Number.isNaN(ciOpts.budget) || ciOpts.budget < 0)) {
      console.error("Invalid --budget value. Expected a non-negative number of USD (e.g., --budget 0.50).");
      process.exit(1);
    }

    const typedArena = await loadArenaConfig(ciOpts.configPath);
    console.log("Running benchmarks...");
    const onResult = ciOpts.quiet ? void 0 : logResult;
    let results;
    try {
      results = await typedArena.run({ onResult });
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      console.error(`Benchmark failed: ${message}`);
      process.exit(1);
    }

    const { loadBaseline: loadBaseline2, saveBaseline: saveBaseline2, computeStats: computeStats2, compareResults: compareResults2 } = await Promise.resolve().then(() => (init_ci(), ci_exports));
    const baseline = loadBaseline2(ciOpts.baselinePath);
    const baselineStats = baseline ? computeStats2(baseline.results) : null;
    if (baseline) {
      console.log(`Loaded baseline from ${ciOpts.baselinePath} (${baseline.timestamp})`);
    } else {
      console.log("No baseline found \u2014 this run establishes the first baseline.");
    }

    const currentStats = computeStats2(results);
    const report = compareResults2(baselineStats, currentStats, ciOpts.thresholds, ciOpts.budget, results);

    const { consoleReporter: consoleReporter2 } = await Promise.resolve().then(() => (init_console(), console_exports));
    console.log("");
    consoleReporter2(results, { sparklines: typedArena.config?.sparklines });

    const { markdownReporter: markdownReporter2, COMMENT_MARKER: COMMENT_MARKER2 } = await Promise.resolve().then(() => (init_markdown(), markdown_exports));
    if (report.flakyResults.length > 0) {
      console.log(`\u26A0 ${report.flakyResults.length} flaky result(s) detected (high variance)`);
    }
    if (report.cost.overBudget) {
      console.log(`\u{1F534} Budget exceeded: $${report.cost.totalUsd.toFixed(4)} > $${report.cost.budget.toFixed(2)}`);
    }
    for (const reason of report.failureReasons) {
      console.log(`\u{1F534} ${reason}`);
    }
    if (!report.failed) {
      console.log("\u{1F7E2} CI passed");
    }

    if (ciOpts.comment) {
      const { detectGitHubContext: detectGitHubContext2, upsertPrComment: upsertPrComment2 } = await Promise.resolve().then(() => (init_github(), github_exports));
      const ghCtx = detectGitHubContext2();
      if (ghCtx) {
        const markdown = markdownReporter2(report, results);
        // Commenting is best-effort: a failure here must not change the CI verdict.
        try {
          await upsertPrComment2(ghCtx, markdown, COMMENT_MARKER2);
          console.log("Posted results to PR comment.");
        } catch (err) {
          console.warn(`Failed to post PR comment: ${err instanceof Error ? err.message : err}`);
        }
      } else {
        console.warn("--comment: not in a GitHub Actions PR context, skipping.");
      }
    }

    // Only a passing run may replace the baseline.
    if (ciOpts.updateBaseline && !report.failed) {
      saveBaseline2(ciOpts.baselinePath, results);
      console.log(`Baseline saved to ${ciOpts.baselinePath}`);
    } else if (ciOpts.updateBaseline && report.failed) {
      console.log("Baseline not updated (CI failed).");
    }

    process.exit(report.failed ? 1 : 0);
  });
7328
// The `run` and `ci` action handlers are async, so use parseAsync: it returns
// a promise that settles once the handler finishes. A rejection that escapes
// a handler is reported here as a one-line error and a non-zero exit instead
// of surfacing as an unhandled promise rejection.
program.parseAsync().catch((err) => {
  const message = err instanceof Error ? err.message : String(err);
  console.error(message);
  process.exit(1);
});
7329
+ async function loadArenaConfig(configOpt) {
7330
+ const configPath = resolve(configOpt);
6561
7331
  if (!existsSync(configPath)) {
6562
7332
  console.error(`Config not found: ${configPath}`);
6563
7333
  console.error("");
6564
7334
  console.error("Create one with: npx duelist init");
6565
7335
  process.exit(1);
6566
7336
  }
6567
- if (!["console", "json"].includes(opts.reporter)) {
6568
- console.error(`Unknown reporter "${opts.reporter}". Use "console" or "json".`);
6569
- process.exit(1);
6570
- }
6571
7337
  let mod;
6572
7338
  try {
6573
7339
  if (configPath.endsWith(".ts")) {
@@ -6591,35 +7357,16 @@ program.command("run").description("Run benchmarks defined in your arena config"
6591
7357
  console.error(`Loaded from: ${configPath}`);
6592
7358
  process.exit(1);
6593
7359
  }
6594
- try {
6595
- const typedArena = arena;
6596
- const showProgress = opts.reporter === "console" && !opts.quiet;
6597
- const onResult = showProgress ? (result) => {
6598
- if (result.error) {
6599
- console.log(` ${result.providerId} \xD7 ${result.taskName}: ERROR ${result.error}`);
6600
- } else {
6601
- const scores = result.scores.map((s5) => `${s5.name}=${formatScoreForLog(s5)}`).join(" ");
6602
- console.log(` ${result.providerId} \xD7 ${result.taskName}: ${scores}`);
6603
- }
6604
- } : void 0;
6605
- const results = await typedArena.run({ onResult });
6606
- const { consoleReporter: consoleReporter2 } = await Promise.resolve().then(() => (init_console(), console_exports));
6607
- const { jsonReporter: jsonReporter2 } = await Promise.resolve().then(() => (init_json(), json_exports));
6608
- if (opts.reporter === "json") {
6609
- console.log(jsonReporter2(results));
6610
- } else {
6611
- console.log("");
6612
- consoleReporter2(results);
6613
- }
6614
- const allFailed = results.length > 0 && results.every((r3) => r3.error);
6615
- if (allFailed) process.exit(1);
6616
- } catch (err) {
6617
- const message = err instanceof Error ? err.message : String(err);
6618
- console.error(`Benchmark failed: ${message}`);
6619
- process.exit(1);
7360
+ return arena;
7361
+ }
7362
/**
 * Print a one-line progress entry for a single benchmark result.
 *
 * Failed results print their error; successful ones print each score as
 * `name=value`, formatted by formatScoreForLog.
 *
 * @param {object} result - Result with `providerId`, `taskName`, and either
 *   `error` or a `scores` array.
 * @returns {void}
 */
function logResult(result) {
  const label = ` ${result.providerId} \xD7 ${result.taskName}`;
  if (result.error) {
    console.log(`${label}: ERROR ${result.error}`);
    return;
  }
  const parts = [];
  for (const score of result.scores) {
    parts.push(`${score.name}=${formatScoreForLog(score)}`);
  }
  console.log(`${label}: ${parts.join(" ")}`);
}
6623
7370
  async function importTypeScript(filePath) {
6624
7371
  try {
6625
7372
  await Promise.resolve().then(() => (init_api(), api_exports));
@@ -6652,7 +7399,7 @@ function formatScoreForLog(s5) {
6652
7399
  }
6653
7400
  function getVersion() {
6654
7401
  try {
6655
- const pkg = readFileSync(join(__dirname2, "..", "package.json"), "utf-8");
7402
+ const pkg = readFileSync3(join(__dirname2, "..", "package.json"), "utf-8");
6656
7403
  return JSON.parse(pkg).version ?? "0.0.0";
6657
7404
  } catch {
6658
7405
  return "0.0.0";
@@ -6674,12 +7421,12 @@ import { z } from 'zod'
6674
7421
 
6675
7422
  export default defineArena({
6676
7423
  providers: [
6677
- openai('gpt-4o-mini'),
7424
+ openai('gpt-5-mini'),
6678
7425
  // Add more providers to compare:
6679
- // openai('gpt-4o'),
6680
- // azureOpenai('gpt-4o-mini'),
6681
- // anthropic('claude-sonnet-4-20250514'),
6682
- // gemini('gemini-2.5-flash'),
7426
+ // openai('gpt-5.2'),
7427
+ // azureOpenai('gpt-5-mini'),
7428
+ // anthropic('claude-sonnet-4.6'),
7429
+ // gemini('gemini-3-flash-preview'),
6683
7430
  ],
6684
7431
 
6685
7432
  tasks: [