agent-duelist 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -38,343 +38,6 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
38
38
  mod
39
39
  ));
40
40
 
41
- // src/reporter/console.ts
42
- var console_exports = {};
43
- __export(console_exports, {
44
- consoleReporter: () => consoleReporter
45
- });
46
- function bold(s5) {
47
- return `${boldCode}${s5}${reset}`;
48
- }
49
- function dim(s5) {
50
- return `${dimCode}${s5}${reset}`;
51
- }
52
- function colorScore(value) {
53
- const pct = Math.round(value * 100);
54
- const str = `${pct}%`;
55
- if (value >= 0.8) return `${green}${str}${reset}`;
56
- if (value >= 0.5) return `${yellow}${str}${reset}`;
57
- return `${red}${str}${reset}`;
58
- }
59
- function consoleReporter(results) {
60
- if (results.length === 0) {
61
- console.log("\nNo results to display.\n");
62
- return;
63
- }
64
- const tasks = [...new Set(results.map((r3) => r3.taskName))];
65
- const providers = [...new Set(results.map((r3) => r3.providerId))];
66
- const scorerNames = [...new Set(results.flatMap((r3) => r3.scores.map((s5) => s5.name)))];
67
- const hasCost = scorerNames.includes("cost");
68
- const hasErrors = results.some((r3) => r3.error);
69
- const runsPerCell = Math.max(...results.map((r3) => r3.run));
70
- const runLabel = runsPerCell > 1 ? ` (${runsPerCell} runs each)` : "";
71
- console.log("");
72
- console.log(` ${bold(`\u2B21 Agent Duelist Results${runLabel}`)}`);
73
- console.log(` ${dim("\u2500".repeat(70))}`);
74
- console.log("");
75
- for (const task of tasks) {
76
- console.log(` ${bold(`Task: ${task}`)}`);
77
- const cols = [{ label: "Provider", width: 22, align: "left" }];
78
- for (const name of scorerNames) {
79
- if (name === "latency") cols.push({ label: "Latency", width: 10, align: "right" });
80
- else if (name === "cost") {
81
- cols.push({ label: "Cost", width: 12, align: "right" });
82
- cols.push({ label: "Tokens", width: 9, align: "right" });
83
- } else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
84
- else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
85
- else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
86
- else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
87
- else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
88
- else cols.push({ label: name, width: 10, align: "right" });
89
- }
90
- if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
91
- const totalWidth = cols.reduce((sum, c3) => sum + c3.width + 2, 0);
92
- console.log(` ${dim(cols.map((c3) => pad(c3.label, c3.width + 2, c3.align)).join(""))}`);
93
- console.log(` ${dim("\u2500".repeat(totalWidth))}`);
94
- for (const provider of providers) {
95
- const taskResults = results.filter(
96
- (r3) => r3.taskName === task && r3.providerId === provider
97
- );
98
- const errorResults2 = taskResults.filter((r3) => r3.error);
99
- const successResults = taskResults.filter((r3) => !r3.error);
100
- if (successResults.length === 0 && errorResults2.length > 0) {
101
- const cells2 = [pad(provider, 24, "left")];
102
- for (const name of scorerNames) {
103
- if (name === "cost") {
104
- cells2.push(pad("\u2014", 14, "right"));
105
- cells2.push(pad("\u2014", 11, "right"));
106
- } else cells2.push(pad("\u2014", cols.find((c3) => c3.label !== "Provider").width + 2, "right"));
107
- }
108
- if (hasErrors) cells2.push(` ${red}FAIL${reset}`);
109
- console.log(` ${cells2.join("")}`);
110
- continue;
111
- }
112
- const avgScores = averageScores(successResults);
113
- const avgDetails = averageDetails(successResults);
114
- const latencyMs = average(successResults.map((r3) => r3.raw.latencyMs));
115
- const cells = [pad(provider, 24, "left")];
116
- for (const name of scorerNames) {
117
- if (name === "latency") {
118
- cells.push(pad(latencyMs !== void 0 ? `${Math.round(latencyMs)}ms` : "\u2014", 12, "right"));
119
- } else if (name === "cost") {
120
- cells.push(pad(formatCost(avgDetails.costUsd), 14, "right"));
121
- cells.push(pad(avgDetails.totalTokens !== void 0 ? `${avgDetails.totalTokens}` : "\u2014", 11, "right"));
122
- } else {
123
- const val = avgScores[name];
124
- if (val === void 0) cells.push(pad("\u2014", 10, "right"));
125
- else cells.push(pad(colorScore(val), 10 + colorLen(colorScore(val)), "right"));
126
- }
127
- }
128
- if (hasErrors) {
129
- const failCount = errorResults2.length;
130
- cells.push(failCount > 0 ? ` ${yellow}${failCount} err${reset}` : ` ${green}OK${reset}`);
131
- }
132
- console.log(` ${cells.join("")}`);
133
- }
134
- console.log("");
135
- }
136
- printSummary(results, providers);
137
- const errorResults = results.filter((r3) => r3.error);
138
- if (errorResults.length > 0) {
139
- console.log(` ${bold("Errors")}`);
140
- console.log(` ${dim("\u2500".repeat(70))}`);
141
- const seen = /* @__PURE__ */ new Set();
142
- for (const r3 of errorResults) {
143
- const key = `${r3.providerId}::${r3.error}`;
144
- if (seen.has(key)) continue;
145
- seen.add(key);
146
- const count = errorResults.filter((e5) => e5.providerId === r3.providerId && e5.error === r3.error).length;
147
- const suffix = count > 1 ? ` (\xD7${count})` : "";
148
- console.log(` ${red}\u2717${reset} ${r3.providerId}: ${r3.error}${suffix}`);
149
- const hint = apiKeyHint(r3.providerId, r3.error ?? "");
150
- if (hint) console.log(` ${dim(hint)}`);
151
- }
152
- console.log("");
153
- }
154
- if (hasCost) {
155
- console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
156
- console.log("");
157
- }
158
- }
159
- function printSummary(results, providers) {
160
- const successResults = results.filter((r3) => !r3.error);
161
- if (successResults.length === 0) return;
162
- console.log(` ${dim("\u2500".repeat(70))}`);
163
- console.log(` ${bold("Summary")}`);
164
- console.log("");
165
- const single = providers.length === 1;
166
- const correctnessKey = successResults.some((r3) => r3.scores.some((s5) => s5.name === "llm-judge-correctness" && s5.value >= 0)) ? "llm-judge-correctness" : "correctness";
167
- const byCorrectness = rankProviders(successResults, providers, correctnessKey);
168
- if (byCorrectness) {
169
- const label = single ? "Avg correctness" : `Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))}`;
170
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${colorScore(byCorrectness.avg)})`);
171
- }
172
- const byLatency = providers.map((id) => {
173
- const runs = successResults.filter((r3) => r3.providerId === id);
174
- const avg = average(runs.map((r3) => r3.raw.latencyMs));
175
- return { id, avg: avg ?? Infinity };
176
- }).sort((a7, b3) => a7.avg - b3.avg)[0];
177
- if (byLatency && byLatency.avg !== Infinity) {
178
- const label = single ? "Avg latency" : `Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}`;
179
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${Math.round(byLatency.avg)}ms)`);
180
- }
181
- const byCost = providers.map((id) => {
182
- const runs = successResults.filter((r3) => r3.providerId === id);
183
- const costs = runs.map((r3) => {
184
- const s5 = r3.scores.find((s6) => s6.name === "cost");
185
- return s5 && s5.value >= 0 ? s5.value : void 0;
186
- }).filter((c3) => c3 !== void 0);
187
- const avg = costs.length > 0 ? costs.reduce((a7, b3) => a7 + b3, 0) / costs.length : void 0;
188
- return { id, avg };
189
- }).filter((p5) => p5.avg !== void 0).sort((a7, b3) => a7.avg - b3.avg)[0];
190
- if (byCost?.avg !== void 0) {
191
- const label = single ? "Avg cost" : `Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}`;
192
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${formatCost(byCost.avg)})`);
193
- }
194
- console.log("");
195
- }
196
- function rankProviders(results, providers, scorerName) {
197
- const ranked = providers.map((id) => {
198
- const runs = results.filter((r3) => r3.providerId === id);
199
- const scores = runs.flatMap((r3) => r3.scores.filter((s5) => s5.name === scorerName && s5.value >= 0)).map((s5) => s5.value);
200
- const avg = scores.length > 0 ? scores.reduce((a7, b3) => a7 + b3, 0) / scores.length : void 0;
201
- return { id, avg };
202
- }).filter((p5) => p5.avg !== void 0).sort((a7, b3) => b3.avg - a7.avg);
203
- return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
204
- }
205
- function averageScores(results) {
206
- const sums = {};
207
- const counts = {};
208
- for (const result of results) {
209
- for (const score of result.scores) {
210
- if (score.value < 0) continue;
211
- sums[score.name] = (sums[score.name] ?? 0) + score.value;
212
- counts[score.name] = (counts[score.name] ?? 0) + 1;
213
- }
214
- }
215
- const avgs = {};
216
- for (const name of Object.keys(sums)) {
217
- avgs[name] = sums[name] / counts[name];
218
- }
219
- return avgs;
220
- }
221
- function averageDetails(results) {
222
- let costSum = 0;
223
- let costCount = 0;
224
- let tokenSum = 0;
225
- let tokenCount = 0;
226
- for (const result of results) {
227
- const costScore = result.scores.find((s5) => s5.name === "cost");
228
- const details = costScore?.details;
229
- if (details?.estimatedUsd != null) {
230
- costSum += details.estimatedUsd;
231
- costCount++;
232
- }
233
- if (details?.totalTokens != null) {
234
- tokenSum += details.totalTokens;
235
- tokenCount++;
236
- }
237
- }
238
- return {
239
- costUsd: costCount > 0 ? costSum / costCount : void 0,
240
- totalTokens: tokenCount > 0 ? Math.round(tokenSum / tokenCount) : void 0
241
- };
242
- }
243
- function average(nums) {
244
- if (nums.length === 0) return void 0;
245
- return nums.reduce((a7, b3) => a7 + b3, 0) / nums.length;
246
- }
247
- function formatCost(usd) {
248
- if (usd === void 0) return "\u2014";
249
- if (usd === 0) return "$0.00";
250
- if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
251
- const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
252
- return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
253
- }
254
- function pad(str, width, align) {
255
- if (align === "right") return str.padStart(width);
256
- return str.padEnd(width);
257
- }
258
- function colorLen(str) {
259
- const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
260
- return str.length - stripped.length;
261
- }
262
- function apiKeyHint(providerId, error) {
263
- const lower = error.toLowerCase();
264
- const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
265
- if (!isAuthError) return void 0;
266
- const prefix = providerId.split("/")[0];
267
- switch (prefix) {
268
- case "openai":
269
- return "Set: export OPENAI_API_KEY=sk-...";
270
- case "azure":
271
- return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
272
- case "anthropic":
273
- return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
274
- case "google":
275
- return "Set: export GOOGLE_API_KEY=...";
276
- default:
277
- return `Check the API key for ${providerId}`;
278
- }
279
- }
280
- function providerLabel(providerId) {
281
- const prefix = providerId.split("/")[0];
282
- switch (prefix) {
283
- case "azure":
284
- return "(OpenAI via Azure)";
285
- case "openai":
286
- return "(OpenAI)";
287
- case "anthropic":
288
- return "(Anthropic)";
289
- case "google":
290
- return "(Google)";
291
- case "mistral":
292
- return "(Mistral)";
293
- case "meta":
294
- return "(Meta)";
295
- case "deepseek":
296
- return "(DeepSeek)";
297
- case "cohere":
298
- return "(Cohere)";
299
- case "qwen":
300
- return "(Qwen)";
301
- case "xai":
302
- return "(xAI)";
303
- case "minimax":
304
- return "(MiniMax)";
305
- case "moonshot":
306
- return "(Moonshot / Kimi)";
307
- case "perplexity":
308
- return "(Perplexity)";
309
- case "amazon":
310
- return "(Amazon)";
311
- case "nvidia":
312
- return "(NVIDIA)";
313
- case "microsoft":
314
- return "(Microsoft)";
315
- case "ai21":
316
- return "(AI21 Labs)";
317
- case "bytedance":
318
- return "(ByteDance)";
319
- case "together":
320
- return "(Together AI)";
321
- case "fireworks":
322
- return "(Fireworks AI)";
323
- case "groq":
324
- return "(Groq)";
325
- case "cerebras":
326
- return "(Cerebras)";
327
- default:
328
- return `(${prefix})`;
329
- }
330
- }
331
- var reset, boldCode, dimCode, green, red, yellow, cyan;
332
- var init_console = __esm({
333
- "src/reporter/console.ts"() {
334
- "use strict";
335
- reset = "\x1B[0m";
336
- boldCode = "\x1B[1m";
337
- dimCode = "\x1B[2m";
338
- green = "\x1B[32m";
339
- red = "\x1B[31m";
340
- yellow = "\x1B[33m";
341
- cyan = "\x1B[36m";
342
- }
343
- });
344
-
345
- // src/reporter/json.ts
346
- var json_exports = {};
347
- __export(json_exports, {
348
- jsonReporter: () => jsonReporter
349
- });
350
- function jsonReporter(results) {
351
- return JSON.stringify(
352
- {
353
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
354
- summary: buildSummary(results),
355
- results
356
- },
357
- null,
358
- 2
359
- );
360
- }
361
- function buildSummary(results) {
362
- const tasks = [...new Set(results.map((r3) => r3.taskName))];
363
- const providers = [...new Set(results.map((r3) => r3.providerId))];
364
- return {
365
- totalBenchmarks: results.length,
366
- tasks: tasks.length,
367
- providers: providers.length,
368
- providerIds: providers,
369
- taskNames: tasks
370
- };
371
- }
372
- var init_json = __esm({
373
- "src/reporter/json.ts"() {
374
- "use strict";
375
- }
376
- });
377
-
378
41
  // node_modules/tsx/dist/temporary-directory-CwHp0_NW.mjs
379
42
  import r from "path";
380
43
  import o from "os";
@@ -6530,44 +6193,1795 @@ var init_api = __esm({
6530
6193
  // src/cli.ts
6531
6194
  import "dotenv/config";
6532
6195
  import { Command } from "commander";
6533
- import { readFileSync, writeFileSync, existsSync } from "fs";
6534
- import { resolve, join, dirname } from "path";
6196
+ import { readFileSync as readFileSync3, writeFileSync as writeFileSync2, mkdirSync as mkdirSync2, existsSync } from "fs";
6197
+ import { resolve, join, dirname as dirname2 } from "path";
6535
6198
  import { pathToFileURL, fileURLToPath } from "url";
6536
- var __dirname2 = dirname(fileURLToPath(import.meta.url));
6537
- var program = new Command();
6538
- program.name("duelist").description("Pit LLM providers against each other on agent tasks.").version(getVersion());
6539
- program.command("init").description("Scaffold an arena.config.ts in the current directory").option("--force", "Overwrite existing config file").action((opts) => {
6540
- const target = resolve("arena.config.ts");
6541
- if (existsSync(target) && !opts.force) {
6542
- console.error("arena.config.ts already exists. Use --force to overwrite.");
6543
- process.exit(1);
6199
+
6200
+ // src/utils/format.ts
6201
+ var MAX_FRACTION_DIGITS = 100;
6202
+ function formatCost(usd) {
6203
+ if (usd === void 0) return "\u2014";
6204
+ if (usd === 0) return "$0.00";
6205
+ if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
6206
+ const digits = Math.min(
6207
+ MAX_FRACTION_DIGITS,
6208
+ Math.max(4, -Math.floor(Math.log10(Math.abs(usd))) + 1)
6209
+ );
6210
+ return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
6211
+ }
6212
+ function formatDelta(delta, precision = 4) {
6213
+ const sign = delta >= 0 ? "+" : "";
6214
+ return `${sign}${delta.toFixed(precision)}`;
6215
+ }
6216
+
6217
+ // src/reporter/shared.ts
6218
+ function groupResults(results) {
6219
+ const taskSet = /* @__PURE__ */ new Set();
6220
+ const providerSet = /* @__PURE__ */ new Set();
6221
+ const scorerSet = /* @__PURE__ */ new Set();
6222
+ const grouped = /* @__PURE__ */ new Map();
6223
+ const byProvider = /* @__PURE__ */ new Map();
6224
+ let hasErrors = false;
6225
+ let maxRun = 0;
6226
+ for (const r3 of results) {
6227
+ taskSet.add(r3.taskName);
6228
+ providerSet.add(r3.providerId);
6229
+ for (const s5 of r3.scores) scorerSet.add(s5.name);
6230
+ if (r3.error) hasErrors = true;
6231
+ if (r3.run > maxRun) maxRun = r3.run;
6232
+ const key = `${r3.taskName}::${r3.providerId}`;
6233
+ let group = grouped.get(key);
6234
+ if (!group) {
6235
+ group = [];
6236
+ grouped.set(key, group);
6237
+ }
6238
+ group.push(r3);
6239
+ let provGroup = byProvider.get(r3.providerId);
6240
+ if (!provGroup) {
6241
+ provGroup = [];
6242
+ byProvider.set(r3.providerId, provGroup);
6243
+ }
6244
+ provGroup.push(r3);
6245
+ }
6246
+ return {
6247
+ tasks: [...taskSet],
6248
+ providers: [...providerSet],
6249
+ scorerNames: [...scorerSet],
6250
+ grouped,
6251
+ byProvider,
6252
+ hasErrors,
6253
+ maxRun
6254
+ };
6255
+ }
6256
+ function aggregateProviderTask(providerId, grouped, task) {
6257
+ const taskResults = grouped.get(`${task}::${providerId}`) ?? [];
6258
+ const errorResults = taskResults.filter((r3) => r3.error);
6259
+ const successResults = taskResults.filter((r3) => !r3.error);
6260
+ if (successResults.length === 0) {
6261
+ return {
6262
+ providerId,
6263
+ avgScores: {},
6264
+ avgDetails: { costUsd: void 0, totalTokens: void 0 },
6265
+ latencyMs: void 0,
6266
+ allErrors: errorResults.length > 0,
6267
+ errorCount: errorResults.length
6268
+ };
6269
+ }
6270
+ return {
6271
+ providerId,
6272
+ avgScores: averageScores(successResults),
6273
+ avgDetails: averageDetails(successResults),
6274
+ latencyMs: average(successResults.map((r3) => r3.raw.latencyMs)),
6275
+ allErrors: false,
6276
+ errorCount: errorResults.length
6277
+ };
6278
+ }
6279
+ function averageScores(results) {
6280
+ const sums = {};
6281
+ const counts = {};
6282
+ for (const result of results) {
6283
+ for (const score of result.scores) {
6284
+ if (score.value < 0) continue;
6285
+ sums[score.name] = (sums[score.name] ?? 0) + score.value;
6286
+ counts[score.name] = (counts[score.name] ?? 0) + 1;
6287
+ }
6288
+ }
6289
+ const avgs = {};
6290
+ for (const name of Object.keys(sums)) {
6291
+ avgs[name] = sums[name] / counts[name];
6292
+ }
6293
+ return avgs;
6294
+ }
6295
+ function averageDetails(results) {
6296
+ let costSum = 0;
6297
+ let costCount = 0;
6298
+ let tokenSum = 0;
6299
+ let tokenCount = 0;
6300
+ for (const result of results) {
6301
+ const costScore = result.scores.find((s5) => s5.name === "cost");
6302
+ const details = costScore?.details;
6303
+ if (details?.estimatedUsd != null) {
6304
+ costSum += details.estimatedUsd;
6305
+ costCount++;
6306
+ }
6307
+ if (details?.totalTokens != null) {
6308
+ tokenSum += details.totalTokens;
6309
+ tokenCount++;
6310
+ }
6311
+ }
6312
+ return {
6313
+ costUsd: costCount > 0 ? costSum / costCount : void 0,
6314
+ totalTokens: tokenCount > 0 ? Math.round(tokenSum / tokenCount) : void 0
6315
+ };
6316
+ }
6317
+ function average(nums) {
6318
+ if (nums.length === 0) return void 0;
6319
+ return nums.reduce((a7, b3) => a7 + b3, 0) / nums.length;
6320
+ }
6321
+ function computeColumnStats(providerData, scorerNames) {
6322
+ const stats = /* @__PURE__ */ new Map();
6323
+ const valid = providerData.filter((p5) => !p5.allErrors);
6324
+ if (scorerNames.includes("latency")) {
6325
+ const values = /* @__PURE__ */ new Map();
6326
+ for (const p5 of providerData) {
6327
+ values.set(p5.providerId, p5.allErrors ? void 0 : p5.latencyMs);
6328
+ }
6329
+ const nums = valid.map((p5) => p5.latencyMs).filter((v4) => v4 !== void 0);
6330
+ stats.set("latency", {
6331
+ values,
6332
+ best: nums.length > 0 ? Math.min(...nums) : void 0,
6333
+ worst: nums.length > 0 ? Math.max(...nums) : void 0
6334
+ });
6335
+ }
6336
+ if (scorerNames.includes("cost")) {
6337
+ const costValues = /* @__PURE__ */ new Map();
6338
+ const tokenValues = /* @__PURE__ */ new Map();
6339
+ for (const p5 of providerData) {
6340
+ costValues.set(p5.providerId, p5.allErrors ? void 0 : p5.avgDetails.costUsd);
6341
+ tokenValues.set(p5.providerId, p5.allErrors ? void 0 : p5.avgDetails.totalTokens);
6342
+ }
6343
+ const costNums = valid.map((p5) => p5.avgDetails.costUsd).filter((v4) => v4 !== void 0);
6344
+ const tokenNums = valid.map((p5) => p5.avgDetails.totalTokens).filter((v4) => v4 !== void 0);
6345
+ stats.set("cost", {
6346
+ values: costValues,
6347
+ best: costNums.length > 0 ? Math.min(...costNums) : void 0,
6348
+ worst: costNums.length > 0 ? Math.max(...costNums) : void 0
6349
+ });
6350
+ stats.set("tokens", {
6351
+ values: tokenValues,
6352
+ best: tokenNums.length > 0 ? Math.min(...tokenNums) : void 0,
6353
+ worst: tokenNums.length > 0 ? Math.max(...tokenNums) : void 0
6354
+ });
6355
+ }
6356
+ for (const name of scorerNames) {
6357
+ if (name === "latency" || name === "cost") continue;
6358
+ const values = /* @__PURE__ */ new Map();
6359
+ for (const p5 of providerData) {
6360
+ values.set(p5.providerId, p5.allErrors ? void 0 : p5.avgScores[name]);
6361
+ }
6362
+ const nums = valid.map((p5) => p5.avgScores[name]).filter((v4) => v4 !== void 0);
6363
+ stats.set(name, {
6364
+ values,
6365
+ best: nums.length > 0 ? Math.max(...nums) : void 0,
6366
+ worst: nums.length > 0 ? Math.min(...nums) : void 0
6367
+ });
6368
+ }
6369
+ return stats;
6370
+ }
6371
+ function computeMedals(columnStats, providerIds) {
6372
+ const medals = /* @__PURE__ */ new Map();
6373
+ if (providerIds.length < 2) {
6374
+ for (const id of providerIds) medals.set(id, "none");
6375
+ return medals;
6376
+ }
6377
+ const wins = /* @__PURE__ */ new Map();
6378
+ for (const id of providerIds) wins.set(id, 0);
6379
+ for (const [, colStats] of columnStats) {
6380
+ if (colStats.best === void 0) continue;
6381
+ const bestProviders = [...colStats.values.entries()].filter(([, v4]) => v4 !== void 0 && v4 === colStats.best);
6382
+ if (bestProviders.length === 1) {
6383
+ wins.set(bestProviders[0][0], (wins.get(bestProviders[0][0]) ?? 0) + 1);
6384
+ }
6385
+ }
6386
+ const totalWins = [...wins.values()].reduce((a7, b3) => a7 + b3, 0);
6387
+ if (totalWins === 0) {
6388
+ for (const id of providerIds) medals.set(id, "none");
6389
+ return medals;
6390
+ }
6391
+ const sorted = [...wins.entries()].sort(
6392
+ (a7, b3) => b3[1] - a7[1] || a7[0].localeCompare(b3[0])
6393
+ );
6394
+ const medalList = ["gold", "silver", "bronze"];
6395
+ let rank = 0;
6396
+ for (let i7 = 0; i7 < sorted.length; i7++) {
6397
+ if (i7 > 0 && sorted[i7][1] < sorted[i7 - 1][1]) {
6398
+ rank = i7;
6399
+ }
6400
+ const hasWins = sorted[i7][1] > 0;
6401
+ medals.set(sorted[i7][0], hasWins && rank < medalList.length ? medalList[rank] : "none");
6402
+ }
6403
+ return medals;
6404
+ }
6405
+ function providerLabel(providerId) {
6406
+ const prefix = providerId.split("/")[0];
6407
+ switch (prefix) {
6408
+ case "azure":
6409
+ return "(OpenAI via Azure)";
6410
+ case "openai":
6411
+ return "(OpenAI)";
6412
+ case "anthropic":
6413
+ return "(Anthropic)";
6414
+ case "google":
6415
+ return "(Google)";
6416
+ case "mistral":
6417
+ return "(Mistral)";
6418
+ case "meta":
6419
+ return "(Meta)";
6420
+ case "deepseek":
6421
+ return "(DeepSeek)";
6422
+ case "cohere":
6423
+ return "(Cohere)";
6424
+ case "qwen":
6425
+ return "(Qwen)";
6426
+ case "xai":
6427
+ return "(xAI)";
6428
+ case "minimax":
6429
+ return "(MiniMax)";
6430
+ case "moonshot":
6431
+ return "(Moonshot / Kimi)";
6432
+ case "perplexity":
6433
+ return "(Perplexity)";
6434
+ case "amazon":
6435
+ return "(Amazon)";
6436
+ case "nvidia":
6437
+ return "(NVIDIA)";
6438
+ case "microsoft":
6439
+ return "(Microsoft)";
6440
+ case "ai21":
6441
+ return "(AI21 Labs)";
6442
+ case "bytedance":
6443
+ return "(ByteDance)";
6444
+ case "together":
6445
+ return "(Together AI)";
6446
+ case "fireworks":
6447
+ return "(Fireworks AI)";
6448
+ case "groq":
6449
+ return "(Groq)";
6450
+ case "cerebras":
6451
+ return "(Cerebras)";
6452
+ default:
6453
+ return `(${prefix})`;
6454
+ }
6455
+ }
6456
+ function apiKeyHint(providerId, error) {
6457
+ const lower = error.toLowerCase();
6458
+ const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
6459
+ if (!isAuthError) return void 0;
6460
+ const prefix = providerId.split("/")[0];
6461
+ switch (prefix) {
6462
+ case "openai":
6463
+ return "Set: export OPENAI_API_KEY=sk-...";
6464
+ case "azure":
6465
+ return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
6466
+ case "anthropic":
6467
+ return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
6468
+ case "google":
6469
+ return "Set: export GOOGLE_API_KEY=...";
6470
+ default:
6471
+ return `Check the API key for ${providerId}`;
6472
+ }
6473
+ }
6474
+ function rankProviders(successByProvider, providers, scorerName) {
6475
+ const ranked = providers.map((id) => {
6476
+ const runs = successByProvider.get(id) ?? [];
6477
+ const scores = runs.flatMap((r3) => r3.scores.filter((s5) => s5.name === scorerName && s5.value >= 0)).map((s5) => s5.value);
6478
+ const avg = scores.length > 0 ? scores.reduce((a7, b3) => a7 + b3, 0) / scores.length : void 0;
6479
+ return { id, avg };
6480
+ }).filter((p5) => p5.avg !== void 0).sort((a7, b3) => b3.avg - a7.avg);
6481
+ return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
6482
+ }
6483
+ function scorerLabel(name) {
6484
+ switch (name) {
6485
+ case "correctness":
6486
+ return "Match";
6487
+ case "schema-correctness":
6488
+ return "Schema";
6489
+ case "fuzzy-similarity":
6490
+ return "Fuzzy";
6491
+ case "llm-judge-correctness":
6492
+ return "Judge";
6493
+ case "tool-usage":
6494
+ return "Tool";
6495
+ default:
6496
+ return name;
6497
+ }
6498
+ }
6499
+ function medalEmoji(medal) {
6500
+ switch (medal) {
6501
+ case "gold":
6502
+ return "\u{1F947}";
6503
+ case "silver":
6504
+ return "\u{1F948}";
6505
+ case "bronze":
6506
+ return "\u{1F949}";
6507
+ case "none":
6508
+ return "";
6509
+ }
6510
+ }
6511
+
6512
+ // src/reporter/console.ts
6513
+ var reset = "\x1B[0m";
6514
+ var boldCode = "\x1B[1m";
6515
+ var dimCode = "\x1B[2m";
6516
+ var green = "\x1B[32m";
6517
+ var red = "\x1B[31m";
6518
+ var yellow = "\x1B[33m";
6519
+ var cyan = "\x1B[36m";
6520
+ var brightGreen = "\x1B[92m";
6521
+ var brightWhite = "\x1B[97m";
6522
+ function bold(s5) {
6523
+ return `${boldCode}${s5}${reset}`;
6524
+ }
6525
+ function dim(s5) {
6526
+ return `${dimCode}${s5}${reset}`;
6527
+ }
6528
+ function stripAnsi(s5) {
6529
+ return s5.replace(/\x1b\[[0-9;]*m/g, "");
6530
+ }
6531
+ function displayWidth(s5) {
6532
+ const stripped = stripAnsi(s5);
6533
+ let width = 0;
6534
+ for (const ch of stripped) {
6535
+ const code = ch.codePointAt(0) ?? 0;
6536
+ if (code >= 126976) width += 2;
6537
+ else if (code >= 9728 && code <= 10175) width += 2;
6538
+ else width += 1;
6539
+ }
6540
+ return width;
6541
+ }
6542
+ function padCell(str, targetWidth, align) {
6543
+ const dw = displayWidth(str);
6544
+ const padding = Math.max(0, targetWidth - dw);
6545
+ if (align === "right") return " ".repeat(padding) + str;
6546
+ return str + " ".repeat(padding);
6547
+ }
6548
+ function sparkBar(ratio, width = 8) {
6549
+ const clamped = Math.max(0, Math.min(1, ratio));
6550
+ const fillLen = Math.round(clamped * width);
6551
+ const fill = "\u2593".repeat(fillLen);
6552
+ const track = "\u2591".repeat(width - fillLen);
6553
+ return { fill, track };
6554
+ }
6555
+ function drawTableLine(widths, position) {
6556
+ const totalInner = widths.reduce((sum, w4) => sum + w4 + 2, 0) + widths.length - 1;
6557
+ if (position === "bottom") {
6558
+ return dim(`\u2514${"\u2500".repeat(totalInner)}\u2518`);
6559
+ }
6560
+ if (position === "merge") {
6561
+ return dim(`\u251C${"\u2500".repeat(totalInner)}\u2524`);
6562
+ }
6563
+ const segments = widths.map((w4) => "\u2500".repeat(w4 + 2));
6564
+ if (position === "top") {
6565
+ return dim(`\u250C${segments.join("\u252C")}\u2510`);
6566
+ }
6567
+ return dim(`\u251C${segments.join("\u253C")}\u2524`);
6568
+ }
6569
+ function drawTableRow(cells, widths, aligns) {
6570
+ const parts = cells.map(
6571
+ (cell, i7) => " " + padCell(cell, widths[i7], aligns[i7]) + " "
6572
+ );
6573
+ return dim("\u2502") + parts.join(dim("\u2502")) + dim("\u2502");
6574
+ }
6575
+ function drawSpanRow(content, widths) {
6576
+ const totalInner = widths.reduce((sum, w4) => sum + w4 + 2, 0) + widths.length - 1;
6577
+ const dw = displayWidth(content);
6578
+ const padding = Math.max(0, totalInner - dw - 1);
6579
+ return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
6580
+ }
6581
+ function colorByRank(text, value, colStats, providerCount) {
6582
+ if (value === void 0) return dim("\u2014");
6583
+ if (providerCount < 2) return text;
6584
+ if (colStats.best === void 0 || colStats.worst === void 0) return text;
6585
+ if (colStats.best === colStats.worst) return text;
6586
+ if (value === colStats.best) return `${brightGreen}${boldCode}${text}${reset}`;
6587
+ if (value === colStats.worst) return `${red}${text}${reset}`;
6588
+ return `${yellow}${text}${reset}`;
6589
+ }
6590
+ function consoleReporter(results, options) {
6591
+ const showSparklines = options?.sparklines ?? true;
6592
+ if (results.length === 0) {
6593
+ console.log("\nNo results to display.\n");
6594
+ return;
6595
+ }
6596
+ const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
6597
+ const hasCost = scorerNames.includes("cost");
6598
+ const multi = providers.length >= 2;
6599
+ const runsPerCell = maxRun;
6600
+ const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
6601
+ console.log("");
6602
+ console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
6603
+ console.log(` ${dim("\u2501".repeat(72))}`);
6604
+ console.log("");
6605
+ for (const task of tasks) {
6606
+ console.log(` ${bold(`Task: ${task}`)}`);
6607
+ console.log("");
6608
+ const providerData = providers.map(
6609
+ (providerId) => aggregateProviderTask(providerId, grouped, task)
6610
+ );
6611
+ const columnStats = computeColumnStats(providerData, scorerNames);
6612
+ const medals = computeMedals(columnStats, providers);
6613
+ const maxProviderLen = Math.max(...providers.map((id) => id.length));
6614
+ const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
6615
+ const cols = [
6616
+ { label: "Provider", width: providerWidth, align: "left" }
6617
+ ];
6618
+ for (const name of scorerNames) {
6619
+ if (name === "latency") {
6620
+ cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
6621
+ } else if (name === "cost") {
6622
+ cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
6623
+ cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
6624
+ } else {
6625
+ cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
6626
+ }
6627
+ }
6628
+ if (hasErrors) {
6629
+ cols.push({ label: "Status", width: 8, align: "left" });
6630
+ }
6631
+ const widths = cols.map((c3) => c3.width);
6632
+ const aligns = cols.map((c3) => c3.align);
6633
+ console.log(` ${drawTableLine(widths, "top")}`);
6634
+ const headerCells = cols.map((c3) => bold(c3.label));
6635
+ console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
6636
+ console.log(` ${drawTableLine(widths, "header")}`);
6637
+ for (const pd of providerData) {
6638
+ const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
6639
+ const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
6640
+ const cells = [providerCell];
6641
+ if (pd.allErrors) {
6642
+ for (const col of cols.slice(1)) {
6643
+ if (col.label === "Status") {
6644
+ cells.push(`${red}FAIL${reset}`);
6645
+ } else {
6646
+ cells.push(dim("\u2014"));
6647
+ }
6648
+ }
6649
+ } else {
6650
+ for (const col of cols.slice(1)) {
6651
+ if (col.label === "Status") {
6652
+ cells.push(
6653
+ pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
6654
+ );
6655
+ continue;
6656
+ }
6657
+ const statsKey = col.statsKey;
6658
+ const colStats = columnStats.get(statsKey);
6659
+ if (statsKey === "latency") {
6660
+ const ms = pd.latencyMs;
6661
+ if (ms === void 0) {
6662
+ cells.push(dim("\u2014"));
6663
+ } else {
6664
+ const text = `${Math.round(ms)}ms`;
6665
+ cells.push(colStats ? colorByRank(text, ms, colStats, providers.length) : text);
6666
+ }
6667
+ } else if (statsKey === "cost") {
6668
+ const cost = pd.avgDetails.costUsd;
6669
+ if (cost === void 0) {
6670
+ cells.push(dim("\u2014"));
6671
+ } else {
6672
+ const text = formatCost(cost);
6673
+ cells.push(colStats ? colorByRank(text, cost, colStats, providers.length) : text);
6674
+ }
6675
+ } else if (statsKey === "tokens") {
6676
+ const tokens = pd.avgDetails.totalTokens;
6677
+ if (tokens === void 0) {
6678
+ cells.push(dim("\u2014"));
6679
+ } else {
6680
+ const text = `${tokens}`;
6681
+ cells.push(colStats ? colorByRank(text, tokens, colStats, providers.length) : text);
6682
+ }
6683
+ } else {
6684
+ const val = pd.avgScores[statsKey];
6685
+ if (val === void 0) {
6686
+ cells.push(dim("\u2014"));
6687
+ } else {
6688
+ const pctStr = `${Math.round(val * 100)}%`.padStart(4);
6689
+ let coloredPct;
6690
+ if (multi && colStats) {
6691
+ coloredPct = colorByRank(pctStr, val, colStats, providers.length);
6692
+ } else {
6693
+ if (val >= 0.8) coloredPct = `${green}${pctStr}${reset}`;
6694
+ else if (val >= 0.5) coloredPct = `${yellow}${pctStr}${reset}`;
6695
+ else coloredPct = `${red}${pctStr}${reset}`;
6696
+ }
6697
+ if (showSparklines) {
6698
+ const { fill, track } = sparkBar(val);
6699
+ const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
6700
+ cells.push(`${coloredPct} ${barColor}${fill}${reset}${dim(track)}`);
6701
+ } else {
6702
+ cells.push(coloredPct);
6703
+ }
6704
+ }
6705
+ }
6706
+ }
6707
+ }
6708
+ console.log(` ${drawTableRow(cells, widths, aligns)}`);
6709
+ }
6710
+ if (multi && providerData.some((p5) => !p5.allErrors)) {
6711
+ const winnerId = [...medals.entries()].find(([, m8]) => m8 === "gold")?.[0];
6712
+ if (winnerId) {
6713
+ console.log(` ${drawTableLine(widths, "merge")}`);
6714
+ const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
6715
+ console.log(` ${drawSpanRow(winnerText, widths)}`);
6716
+ }
6717
+ }
6718
+ console.log(` ${drawTableLine(widths, "bottom")}`);
6719
+ console.log("");
6720
+ }
6721
+ printSummary(results, providers, byProvider);
6722
+ const errorResults = results.filter((r3) => r3.error);
6723
+ if (errorResults.length > 0) {
6724
+ console.log(` ${bold("Errors")}`);
6725
+ console.log(` ${dim("\u2501".repeat(72))}`);
6726
+ const seen = /* @__PURE__ */ new Set();
6727
+ for (const r3 of errorResults) {
6728
+ const key = `${r3.providerId}::${r3.error}`;
6729
+ if (seen.has(key)) continue;
6730
+ seen.add(key);
6731
+ const count = errorResults.filter((e5) => e5.providerId === r3.providerId && e5.error === r3.error).length;
6732
+ const suffix = count > 1 ? ` (\xD7${count})` : "";
6733
+ console.log(` ${red}\u2716${reset} ${r3.providerId}: ${r3.error}${suffix}`);
6734
+ const hint = apiKeyHint(r3.providerId, r3.error ?? "");
6735
+ if (hint) console.log(` ${dim(hint)}`);
6736
+ }
6737
+ console.log("");
6738
+ }
6739
+ if (hasCost) {
6740
+ console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
6741
+ console.log("");
6742
+ }
6743
+ }
6744
/**
 * Prints the cross-task "Summary" section to the console.
 *
 * Only runs without an `error` are considered. One line is printed for each
 * category that has data (correctness, latency, cost); when more than one
 * provider is present an "Overall" line names the provider(s) that won the
 * most categories. Nothing is printed when every run failed.
 *
 * @param {Array<object>} results - All benchmark results.
 * @param {string[]} providers - Provider ids, in display order.
 * @param {Map<string, Array<object>>} byProvider - Results grouped by provider id.
 * @returns {void}
 */
function printSummary(results, providers, byProvider) {
  const okRuns = results.filter((entry) => !entry.error);
  if (okRuns.length === 0) {
    return;
  }

  const okRunsByProvider = /* @__PURE__ */ new Map();
  providers.forEach((providerId) => {
    const runs = byProvider.get(providerId) ?? [];
    okRunsByProvider.set(providerId, runs.filter((entry) => !entry.error));
  });

  console.log(` ${bold("Summary")}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");

  const single = providers.length === 1;
  const ascendingByAvg = (left, right) => left.avg - right.avg;

  // Prints one category line; the wording depends on whether there is a
  // single provider (plain average) or several (a category winner).
  const announce = (singleLabel, multiLabel, providerId, valueText) => {
    const marker = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
    if (single) {
      console.log(` ${marker} ${singleLabel} ${brightGreen}${boldCode}${valueText}${reset}`);
    } else {
      console.log(` ${marker} ${multiLabel} ${bold(providerId)} ${dim(providerLabel(providerId))} ${brightGreen}${boldCode}${valueText}${reset}`);
    }
  };

  // Prefer the LLM-judge scorer when at least one successful run has a
  // non-negative value for it; otherwise use the plain correctness scorer.
  const hasJudgeScore = okRuns.some(
    (entry) => entry.scores.some((score) => score.name === "llm-judge-correctness" && score.value >= 0)
  );
  const correctnessKey = hasJudgeScore ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(okRunsByProvider, providers, correctnessKey);
  if (byCorrectness) {
    announce("Avg correctness:", "Most correct:", byCorrectness.id, `${Math.round(byCorrectness.avg * 100)}%`);
  }

  const latencyRanking = providers.map((id) => {
    const runs = okRunsByProvider.get(id) ?? [];
    const avg = average(runs.map((entry) => entry.raw.latencyMs));
    // Providers without an average sort last via Infinity.
    return { id, avg: avg ?? Infinity };
  });
  const byLatency = latencyRanking.sort(ascendingByAvg)[0];
  const latencyKnown = Boolean(byLatency) && byLatency.avg !== Infinity;
  if (latencyKnown) {
    announce("Avg latency:", "Fastest:", byLatency.id, `${Math.round(byLatency.avg)}ms`);
  }

  const costRanking = [];
  for (const id of providers) {
    const costs = [];
    for (const entry of okRunsByProvider.get(id) ?? []) {
      const costScore = entry.scores.find((score) => score.name === "cost");
      // Only non-negative cost values take part in the average.
      if (costScore && costScore.value >= 0) {
        costs.push(costScore.value);
      }
    }
    if (costs.length > 0) {
      costRanking.push({ id, avg: costs.reduce((sum, cost) => sum + cost, 0) / costs.length });
    }
  }
  const byCost = costRanking.sort(ascendingByAvg)[0];
  const costKnown = byCost !== void 0;
  if (costKnown) {
    announce("Avg cost:", "Cheapest:", byCost.id, formatCost(byCost.avg));
  }

  if (!single) {
    const wins = /* @__PURE__ */ new Map();
    for (const id of providers) {
      wins.set(id, 0);
    }
    const credit = (id) => wins.set(id, (wins.get(id) ?? 0) + 1);
    if (byCorrectness) credit(byCorrectness.id);
    if (latencyKnown) credit(byLatency.id);
    if (costKnown) credit(byCost.id);

    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const leaders = [...wins.entries()].filter(([, count]) => count === maxWins);
      console.log("");
      if (leaders.length === 1) {
        const [winnerId, winCount] = leaders[0];
        console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
      } else {
        const names = leaders.map(([id]) => bold(id)).join(dim(", "));
        console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
      }
    }
  }
  console.log("");
}
6819
+
6820
+ // src/reporter/json.ts
6821
/**
 * Serializes benchmark results as pretty-printed JSON (2-space indent).
 *
 * The document has three top-level keys, in this order: `timestamp` (ISO
 * string of the moment the report is generated), `summary` (from
 * `buildSummary`) and the unmodified `results`.
 *
 * @param {Array<object>} results - Benchmark results to serialize.
 * @returns {string} The JSON document.
 */
function jsonReporter(results) {
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const summary = buildSummary(results);
  const payload = { timestamp, summary, results };
  return JSON.stringify(payload, null, 2);
}
6832
/**
 * Builds the summary header of the JSON report.
 *
 * @param {Array<{taskName: string, providerId: string}>} results - Benchmark results.
 * @returns {{totalBenchmarks: number, tasks: number, providers: number, providerIds: string[], taskNames: string[]}}
 *   Counts plus the distinct provider ids and task names, each in order of
 *   first appearance.
 */
function buildSummary(results) {
  const distinctTasks = /* @__PURE__ */ new Set();
  const distinctProviders = /* @__PURE__ */ new Set();
  for (const entry of results) {
    distinctTasks.add(entry.taskName);
    distinctProviders.add(entry.providerId);
  }
  const taskNames = Array.from(distinctTasks);
  const providerIds = Array.from(distinctProviders);
  return {
    totalBenchmarks: results.length,
    tasks: taskNames.length,
    providers: providerIds.length,
    providerIds,
    taskNames
  };
}
6843
+
6844
+ // src/reporter/markdown.ts
6845
// HTML comment emitted as the first line of every Markdown CI report.
var COMMENT_MARKER = "<!-- duelist-ci-report -->";

/**
 * Renders a CI report as a Markdown comment body.
 *
 * Sections are emitted in a fixed order and only when they have content:
 * comparison table, cost summary, flaky results, failure reasons. The output
 * always starts with `COMMENT_MARKER` and ends with the attribution footer.
 *
 * @param {object} report - CI report (`failed`, `comparisons`, `cost`,
 *   `flakyResults`, `failureReasons`).
 * @param {*} _current - Unused.
 * @returns {string} Markdown text, lines joined with "\n".
 */
function markdownReporter(report, _current) {
  const verdict = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${verdict}`, ""];

  if (report.comparisons.length > 0) {
    out.push(markdownComparisonTable(report.comparisons), "");
  }

  // The cost section is shown when money was spent or a budget is configured.
  if (report.cost.totalUsd > 0 || report.cost.budget !== void 0) {
    out.push(markdownCostSummary(report.cost), "");
  }

  if (report.flakyResults.length > 0) {
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      ""
    );
    report.flakyResults.forEach((flaky) => {
      out.push(`- **${flaky.providerId}** \xD7 ${flaky.taskName} \u2192 ${flaky.scorerName} (CV = ${flaky.current.cv.toFixed(2)})`);
    });
    out.push("");
  }

  if (report.failureReasons.length > 0) {
    out.push("### Failure Reasons", "");
    report.failureReasons.forEach((reason) => {
      out.push(`- ${reason}`);
    });
    out.push("");
  }

  out.push("---", "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
  return out.join("\n");
}
6881
/**
 * Renders baseline-vs-current comparisons as a Markdown table.
 *
 * Provider, task and scorer names are free-form text, so they are sanitized
 * before interpolation: a literal `|` is escaped as `\|` and line breaks are
 * replaced by a space. Without this a single odd name would shift or split
 * the columns of its row.
 *
 * @param {Array<object>} comparisons - Comparison records (`providerId`,
 *   `taskName`, `scorerName`, `baseline`, `current`, `delta`, plus the flags
 *   read by `statusIndicator`).
 * @returns {string} Markdown table, rows joined with "\n".
 */
function markdownComparisonTable(comparisons) {
  // Keeps free-form text inside a single table cell.
  const cell = (text) => String(text).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const lines = [];
  lines.push("| Provider | Task | Scorer | Baseline | Current | Delta | Status |");
  lines.push("|----------|------|--------|----------|---------|-------|--------|");
  for (const c3 of comparisons) {
    // A missing baseline or delta is rendered as an em dash.
    const baselineStr = c3.baseline ? formatStats(c3.baseline) : "\u2014";
    const currentStr = formatStats(c3.current);
    const deltaStr = c3.delta !== null ? formatDelta(c3.delta, 3) : "\u2014";
    const status = statusIndicator(c3);
    lines.push(`| ${cell(c3.providerId)} | ${cell(c3.taskName)} | ${cell(c3.scorerName)} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
  }
  return lines.join("\n");
}
6894
/**
 * Renders the "Cost Summary" section of the Markdown CI report.
 *
 * Always prints the total. Prints a budget line when `cost.budget` is set
 * (the usage percentage is shown as "∞" for a non-positive budget), and a
 * per-provider table when more than one provider is present. Provider ids in
 * that table are sanitized (`|` becomes `\|`, line breaks become a space) so
 * an unusual id cannot break the table layout.
 *
 * @param {{totalUsd: number, budget?: number, overBudget?: boolean, perProvider: Map<string, number>}} cost
 *   Cost figures in USD.
 * @returns {string} Markdown text, lines joined with "\n".
 */
function markdownCostSummary(cost) {
  // Keeps free-form text inside a single table cell.
  const cell = (text) => String(text).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const lines = [];
  lines.push("### \u{1F4B0} Cost Summary");
  lines.push("");
  lines.push(`**Total:** $${cost.totalUsd.toFixed(4)}`);
  if (cost.budget !== void 0) {
    // Avoid dividing by a zero or negative budget.
    const pct = cost.budget > 0 ? (cost.totalUsd / cost.budget * 100).toFixed(0) : "\u221E";
    const indicator = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    lines.push(`**Budget:** $${cost.budget.toFixed(2)} (${pct}% used) ${indicator}`);
  }
  if (cost.perProvider.size > 1) {
    lines.push("");
    lines.push("| Provider | Cost |");
    lines.push("|----------|------|");
    for (const [provider, usd] of cost.perProvider) {
      lines.push(`| ${cell(provider)} | $${usd.toFixed(4)} |`);
    }
  }
  return lines.join("\n");
}
6914
/**
 * Formats a statistics record for a Markdown table cell.
 *
 * @param {{n: number, mean: number, ci95Lower?: number, ci95Upper?: number}} stats
 *   Aggregated sample statistics.
 * @returns {string} The mean with three decimals; when there is more than one
 *   sample, followed by "±" and half the width of the 95% interval.
 */
function formatStats(stats) {
  const meanText = stats.mean.toFixed(3);
  const hasSpread = stats.n > 1;
  if (!hasSpread) {
    return meanText;
  }
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${meanText} \xB1 ${halfWidth.toFixed(3)}`;
}
6921
/**
 * Maps a comparison record to its status label for the Markdown table.
 *
 * Precedence: regressed, then improved, then "new" (no baseline), otherwise
 * unchanged.
 *
 * @param {{regressed?: boolean, improved?: boolean, baseline: object|null}} c3
 *   Comparison record.
 * @returns {string} Emoji-prefixed status text.
 */
function statusIndicator(c3) {
  const { regressed, improved, baseline } = c3;
  return regressed ? "\u{1F534} regressed"
    : improved ? "\u{1F7E2} improved"
    : baseline === null ? "\u{1F195} new"
    : "\u26AA unchanged";
}
6927
+
6928
+ // src/reporter/html.ts
6929
/**
 * Escapes text for safe interpolation into HTML content and quoted attributes.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with their entity forms in a single
 * pass. Non-string input is coerced with `String()` and `null`/`undefined`
 * become an empty string, so a numeric id or a missing message cannot crash
 * report generation with a TypeError.
 *
 * @param {*} s5 - Value to escape.
 * @returns {string} HTML-safe text.
 */
function esc(s5) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  return String(s5 ?? "").replace(/[&<>"']/g, (ch) => entities[ch]);
}
6932
/**
 * Renders benchmark results as a complete HTML document.
 *
 * For every task it aggregates per-provider data, column statistics and
 * medals; across all tasks it determines the best provider for correctness,
 * latency and cost using successful runs only, and — with two or more
 * providers — an overall winner when exactly one provider leads in category
 * wins. An empty result list yields the placeholder from `emptyReport`.
 *
 * @param {Array<object>} results - Benchmark results.
 * @returns {string} HTML document text.
 */
function htmlReporter(results) {
  if (results.length === 0) {
    return emptyReport();
  }

  const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";
  const plural = (count) => (count !== 1 ? "s" : "");
  const ascendingByAvg = (left, right) => left.avg - right.avg;

  const taskSections = [];
  for (const task of tasks) {
    const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    // A per-task winner is only named when providers are being compared.
    let winnerId;
    if (multi) {
      const goldEntry = [...medals.entries()].find(([, medal]) => medal === "gold");
      winnerId = goldEntry?.[0];
    }
    taskSections.push({ task, providerData, columnStats, medals, winnerId });
  }

  const okRuns = results.filter((entry) => !entry.error);
  const okRunsByProvider = /* @__PURE__ */ new Map();
  providers.forEach((id) => {
    okRunsByProvider.set(id, (byProvider.get(id) ?? []).filter((entry) => !entry.error));
  });

  // Prefer the LLM-judge scorer when any successful run has a usable value for it.
  const hasJudgeScore = okRuns.some(
    (entry) => entry.scores.some((score) => score.name === "llm-judge-correctness" && score.value >= 0)
  );
  const correctnessKey = hasJudgeScore ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(okRunsByProvider, providers, correctnessKey);

  const latencyRanking = providers.map((id) => {
    const runs = okRunsByProvider.get(id) ?? [];
    const avg = average(runs.map((entry) => entry.raw.latencyMs));
    return { id, avg: avg ?? Infinity };
  });
  const byLatency = latencyRanking.sort(ascendingByAvg)[0];

  const costRanking = [];
  for (const id of providers) {
    const costs = [];
    for (const entry of okRunsByProvider.get(id) ?? []) {
      const costScore = entry.scores.find((score) => score.name === "cost");
      if (costScore && costScore.value >= 0) {
        costs.push(costScore.value);
      }
    }
    if (costs.length > 0) {
      costRanking.push({ id, avg: costs.reduce((sum, cost) => sum + cost, 0) / costs.length });
    }
  }
  const byCost = costRanking.sort(ascendingByAvg)[0];

  let overallWinner;
  if (multi) {
    const wins = /* @__PURE__ */ new Map();
    for (const id of providers) {
      wins.set(id, 0);
    }
    const credit = (id) => wins.set(id, (wins.get(id) ?? 0) + 1);
    if (byCorrectness) credit(byCorrectness.id);
    if (byLatency && byLatency.avg !== Infinity) credit(byLatency.id);
    if (byCost !== void 0) credit(byCost.id);
    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const leaders = [...wins.entries()].filter(([, count]) => count === maxWins);
      // A tie leaves the overall winner undefined.
      if (leaders.length === 1) {
        overallWinner = leaders[0][0];
      }
    }
  }

  const deduped = dedupeErrors(results.filter((entry) => entry.error));

  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
<meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${plural(providers.length)}, ${tasks.length} task${plural(tasks.length)}">
<meta property="og:title" content="Agent Duelist Report">
<meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
<meta property="og:type" content="website">
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">

${renderHeader(runsLabel, providers.length, tasks.length)}

${tasks.length > 1 ? renderTabs(tasks) : ""}

<main>
${taskSections.map((section, position) => {
    const { task, providerData, columnStats, medals, winnerId } = section;
    return renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, hasCost, multi, position);
  }).join("\n")}
</main>

${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}

${deduped.length > 0 ? renderErrors(deduped) : ""}

${renderFooter()}

</div>
${renderScript(tasks.length)}
</body>
</html>`;
}
7030
/**
 * Builds the placeholder HTML document used when there are no results.
 *
 * It reuses the report stylesheet, header (with "0 runs" and zero counts)
 * and footer around a single "No results to display." message.
 *
 * @returns {string} HTML document text.
 */
function emptyReport() {
  const documentLines = [
    "<!DOCTYPE html>",
    '<html lang="en">',
    "<head>",
    '<meta charset="UTF-8">',
    '<meta name="viewport" content="width=device-width, initial-scale=1">',
    "<title>Agent Duelist Report</title>",
    renderStyle(),
    "</head>",
    "<body>",
    '<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>',
    '<div class="report">',
    renderHeader("0 runs", 0, 0),
    '<main><p class="empty-msg">No results to display.</p></main>',
    renderFooter(),
    "</div>",
    "</body>",
    "</html>"
  ];
  return documentLines.join("\n");
}
7049
/**
 * Collapses repeated errors into one entry per (provider, message) pair.
 *
 * @param {Array<{providerId: string, error?: string}>} errorResults - Failed results.
 * @returns {Array<{providerId: string, error: string, count: number, hint: *}>}
 *   One entry per distinct pair in order of first occurrence; `count` is the
 *   number of occurrences and `hint` is whatever `apiKeyHint` returned for
 *   the first one. A missing message is reported as "Unknown error".
 */
function dedupeErrors(errorResults) {
  const buckets = /* @__PURE__ */ new Map();
  errorResults.forEach(({ providerId, error }) => {
    const bucketKey = `${providerId}::${error}`;
    const bucket = buckets.get(bucketKey);
    if (bucket) {
      bucket.count += 1;
      return;
    }
    buckets.set(bucketKey, {
      providerId,
      error: error ?? "Unknown error",
      count: 1,
      hint: apiKeyHint(providerId, error ?? "")
    });
  });
  return Array.from(buckets.values());
}
7067
/**
 * Returns the inline stylesheet embedded in every HTML report.
 *
 * The result is a complete `<style>` element: theme variables on `:root`,
 * the animated background mesh, and the layout rules for the header, task
 * tabs, results table, winner banner, summary cards, error list, footer,
 * empty state and the narrow-screen breakpoint.
 *
 * @returns {string} The `<style>…</style>` markup.
 */
function renderStyle() {
  const stylesheet = `<style>
:root {
--bg: #0f172a;
--bg-deep: #020617;
--panel: rgba(15, 23, 42, 0.85);
--accent: #f59e0b;
--accent-soft: rgba(245, 158, 11, 0.15);
--text: #e2e8f0;
--muted: #94a3b8;
--border: rgba(148, 163, 184, 0.15);
--green: #22c55e;
--red: #ef4444;
--yellow: #eab308;
--radius: 12px;
--mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
--sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
html, body {
font-family: var(--sans);
background: var(--bg);
color: var(--text);
min-height: 100vh;
}
body { padding: 24px; display: flex; justify-content: center; }

/* Animated gradient mesh */
.bg-mesh {
position: fixed; inset: 0; z-index: 0;
overflow: hidden; pointer-events: none;
}
.bg-mesh::before, .bg-mesh::after {
content: ""; position: absolute; border-radius: 50%;
filter: blur(120px); opacity: 0.4;
}
.bg-mesh::before {
width: 600px; height: 600px;
background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
top: -10%; left: -5%;
animation: meshDrift1 18s ease-in-out infinite alternate;
}
.bg-mesh::after {
width: 500px; height: 500px;
background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
bottom: -10%; right: -5%;
animation: meshDrift2 22s ease-in-out infinite alternate;
}
.bg-mesh-extra {
position: absolute; width: 400px; height: 400px;
border-radius: 50%; filter: blur(100px); opacity: 0.3;
background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
top: 50%; left: 60%;
animation: meshDrift3 15s ease-in-out infinite alternate;
}
@keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
@keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
@keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }

/* Report container */
.report {
position: relative; z-index: 1;
width: 100%; max-width: 960px;
}

/* Header */
.report-header {
display: flex; justify-content: space-between; align-items: center;
padding: 20px 0; margin-bottom: 8px;
}
.report-brand {
display: flex; align-items: center; gap: 10px;
text-decoration: none; color: var(--muted);
font-weight: 600; font-size: 14px;
letter-spacing: 0.04em; text-transform: uppercase;
}
.report-brand:hover { color: var(--text); }
.brand-icon {
width: 32px; height: 32px; border-radius: 8px;
background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
border: 1px solid rgba(245,158,11,0.3);
display: flex; align-items: center; justify-content: center;
font-size: 16px;
}
.report-meta {
font-size: 12px; color: var(--muted);
text-align: right; line-height: 1.6;
}

/* Task tabs */
.task-tabs {
display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
}
.task-tab {
padding: 6px 16px; border-radius: 999px;
border: 1px solid var(--border);
background: transparent; color: var(--muted);
font-size: 13px; font-weight: 500; cursor: pointer;
transition: all 150ms ease;
}
.task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
.task-tab.active {
background: var(--accent-soft);
border-color: rgba(245,158,11,0.4);
color: var(--accent);
}

/* Task sections */
.task-section { display: none; }
.task-section.active { display: block; }
.task-name {
font-size: 18px; font-weight: 600;
margin-bottom: 12px; letter-spacing: -0.01em;
}

/* Results table */
.results-table {
width: 100%; border-collapse: collapse;
font-size: 13px; margin-bottom: 16px;
border-radius: var(--radius); overflow: hidden;
border: 1px solid var(--border);
}
.results-table th, .results-table td {
padding: 10px 14px;
text-align: left;
border-bottom: 1px solid var(--border);
}
.results-table th {
background: rgba(0,0,0,0.3);
font-size: 11px; font-weight: 600;
text-transform: uppercase; letter-spacing: 0.05em;
color: var(--muted); cursor: pointer;
user-select: none; white-space: nowrap;
}
.results-table th:hover { color: var(--text); }
.results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
.results-table tbody tr {
background: var(--panel);
transition: background 120ms ease;
}
.results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
.results-table tbody tr:last-child td { border-bottom: none; }

/* Score cell with progress bar */
.score-cell { position: relative; min-width: 90px; }
.score-bar {
position: absolute; left: 0; bottom: 0;
height: 3px; border-radius: 2px;
transition: width 300ms ease;
}
.score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }

/* Color ranking */
.rank-best { color: var(--green); font-weight: 600; }
.rank-worst { color: var(--red); }
.rank-mid { color: var(--yellow); }
.rank-neutral { color: var(--text); }
.rank-error { color: var(--muted); }

/* Winner banner */
.task-winner {
display: flex; align-items: center; gap: 10px;
padding: 12px 18px; margin-bottom: 20px;
border-radius: var(--radius);
background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
border: 1px solid rgba(34,197,94,0.2);
font-size: 14px; font-weight: 500;
}
.task-winner .trophy { font-size: 20px; }
.task-winner .winner-name { color: var(--green); font-weight: 600; }
.task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }

/* Summary cards */
.summary-section { margin-top: 32px; }
.summary-title {
font-size: 16px; font-weight: 600;
margin-bottom: 12px; color: var(--text);
}
.summary-cards {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 12px;
}
.summary-card {
padding: 16px; border-radius: var(--radius);
border: 1px solid var(--border);
background: var(--panel);
}
.summary-card .card-label {
font-size: 11px; font-weight: 600;
text-transform: uppercase; letter-spacing: 0.05em;
color: var(--muted); margin-bottom: 6px;
}
.summary-card .card-value {
font-size: 20px; font-weight: 700;
color: var(--green); font-family: var(--mono);
}
.summary-card .card-provider {
font-size: 12px; color: var(--muted); margin-top: 4px;
}

/* Errors */
.errors-section { margin-top: 24px; }
.errors-title {
font-size: 16px; font-weight: 600;
margin-bottom: 8px; color: var(--red);
cursor: pointer;
}
.errors-list {
border-radius: var(--radius);
border: 1px solid rgba(239,68,68,0.2);
background: rgba(239,68,68,0.04);
overflow: hidden;
}
.error-item {
padding: 10px 16px;
border-bottom: 1px solid rgba(239,68,68,0.1);
font-size: 13px;
}
.error-item:last-child { border-bottom: none; }
.error-provider { font-weight: 600; color: var(--text); }
.error-msg { color: var(--muted); margin-left: 8px; }
.error-count { color: var(--muted); font-size: 11px; }
.error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }

/* Footer */
.report-footer {
margin-top: 40px; padding: 20px 0;
border-top: 1px solid var(--border);
display: flex; justify-content: space-between; align-items: center;
flex-wrap: wrap; gap: 12px;
}
.footer-brand {
font-size: 13px; color: var(--muted);
}
.footer-brand a {
color: var(--accent); text-decoration: none; font-weight: 500;
}
.footer-brand a:hover { text-decoration: underline; }
.footer-cta {
display: inline-flex; align-items: center; gap: 6px;
padding: 6px 14px; border-radius: 8px;
background: var(--accent-soft);
border: 1px solid rgba(245,158,11,0.3);
color: var(--accent); font-size: 12px; font-weight: 500;
text-decoration: none;
transition: transform 120ms ease, box-shadow 120ms ease;
}
.footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }

/* Empty state */
.empty-msg {
text-align: center; color: var(--muted);
padding: 60px 20px; font-size: 16px;
}

/* Responsive */
@media (max-width: 640px) {
body { padding: 12px; }
.report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
.report-meta { text-align: left; }
.summary-cards { grid-template-columns: 1fr; }
.results-table { font-size: 12px; }
.results-table th, .results-table td { padding: 8px 10px; }
.report-footer { flex-direction: column; align-items: flex-start; }
}
</style>`;
  return stylesheet;
}
7335
/**
 * Renders the report header: the brand link plus a meta block showing the
 * provider count, task count, runs label and the generation time in UTC
 * (second precision).
 *
 * @param {string} runsLabel - Human-readable runs description (HTML-escaped here).
 * @param {number} providerCount - Number of providers in the report.
 * @param {number} taskCount - Number of tasks in the report.
 * @returns {string} `<header>` markup.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  const isoStamp = (/* @__PURE__ */ new Date()).toISOString();
  // "2024-01-02T03:04:05.678Z" -> "2024-01-02 03:04:05 UTC"
  const now = `${isoStamp.replace("T", " ").slice(0, 19)} UTC`;
  const providerWord = providerCount !== 1 ? "providers" : "provider";
  const taskWord = taskCount !== 1 ? "tasks" : "task";
  return `<header class="report-header">
<a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">
<div class="brand-icon">&#x2B21;</div>
<span>Agent Duelist</span>
</a>
<div class="report-meta">
${providerCount} ${providerWord} &middot;
${taskCount} ${taskWord} &middot;
${esc(runsLabel)}<br>
${esc(now)}
</div>
</header>`;
}
7350
/**
 * Renders the task switcher: one button per task, with the first one marked
 * active. Each button carries its task index in `data-task`.
 *
 * @param {string[]} tasks - Task names (HTML-escaped here).
 * @returns {string} `<nav>` markup.
 */
function renderTabs(tasks) {
  const buttonMarkup = [];
  for (const [position, taskName] of tasks.entries()) {
    const activeClass = position === 0 ? " active" : "";
    buttonMarkup.push(`<button class="task-tab${activeClass}" data-task="${position}">${esc(taskName)}</button>`);
  }
  const buttons = buttonMarkup.join("\n ");
  return `<nav class="task-tabs">
${buttons}
</nav>`;
}
7358
/**
 * Renders one task's section: heading, results table and, when a winner id
 * is given, the winner banner.
 *
 * Columns are derived from the scorer names: "latency" becomes a Latency
 * column, "cost" becomes Cost plus Tokens, and any other scorer becomes a
 * score column labelled via `scorerLabel`. Providers whose runs all failed
 * get a dash in every data column.
 *
 * @param {string} task - Task name.
 * @param {Array<object>} providerData - Per-provider aggregates for this task.
 * @param {Map<string, object>} columnStats - Column statistics keyed by column key.
 * @param {Map<string, string>} medals - Medal per provider id.
 * @param {string|undefined} winnerId - Provider shown in the winner banner, if any.
 * @param {string[]} scorerNames - Scorer names present in the run.
 * @param {boolean} _hasCost - Unused.
 * @param {boolean} multi - Whether several providers are being compared.
 * @param {number} index - Section index; section 0 is initially active.
 * @returns {string} `<section>` markup.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  const scorerColumns = scorerNames.flatMap((name) => {
    switch (name) {
      case "latency":
        return [{ label: "Latency", key: "latency", isScore: false }];
      case "cost":
        return [
          { label: "Cost", key: "cost", isScore: false },
          { label: "Tokens", key: "tokens", isScore: false }
        ];
      default:
        return [{ label: scorerLabel(name), key: name, isScore: true }];
    }
  });
  const cols = [{ label: "Provider", key: "provider", isScore: false }, ...scorerColumns];

  let ths = "";
  for (const column of cols) {
    ths += `<th data-col="${esc(column.key)}">${esc(column.label)}<span class="sort-arrow"></span></th>`;
  }

  const rowMarkup = [];
  for (const pd of providerData) {
    const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
    const medalHtml = medal ? `${medal} ` : "";
    let cellsHtml = `<td>${medalHtml}${esc(pd.providerId)}</td>`;
    if (pd.allErrors) {
      // Every data column of a fully failed provider shows a dash.
      cellsHtml += `<td class="rank-error">&mdash;</td>`.repeat(cols.length - 1);
    } else {
      for (const column of scorerColumns) {
        cellsHtml += renderDataCell(column.key, column.isScore, pd, columnStats, multi);
      }
    }
    rowMarkup.push(`<tr>${cellsHtml}</tr>`);
  }
  const rows = rowMarkup.join("\n");

  let winnerHtml = "";
  if (winnerId) {
    winnerHtml = `<div class="task-winner">
<span class="trophy">&#x1F3C6;</span>
<span>Winner: <span class="winner-name">${esc(winnerId)}</span>
<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>
</div>`;
  }

  const activeClass = index === 0 ? " active" : "";
  return `<section class="task-section${activeClass}" data-task-idx="${index}">
<h2 class="task-name">${esc(task)}</h2>
<table class="results-table">
<thead><tr>${ths}</tr></thead>
<tbody>${rows}</tbody>
</table>
${winnerHtml}
</section>`;
}
7405
/**
 * Renders a single data cell (`<td>`) of the results table.
 *
 * "latency", "cost" and "tokens" are plain numeric cells: they are colored
 * by rank only when several providers are compared and column statistics
 * exist. Any other key is a score cell showing a percentage and a progress
 * bar; without ranking it falls back to fixed thresholds (>= 0.8 best,
 * >= 0.5 mid, otherwise worst). A missing value renders as a dash.
 *
 * @param {string} key - Column key.
 * @param {boolean} _isScore - Unused.
 * @param {object} pd - Per-provider aggregate (`latencyMs`, `avgDetails`, `avgScores`).
 * @param {Map<string, object>} columnStats - Column statistics keyed by column key.
 * @param {boolean} multi - Whether several providers are being compared.
 * @returns {string} `<td>` markup; every numeric cell exposes its raw value in `data-sort-val`.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const missingCell = `<td class="rank-error">&mdash;</td>`;
  const colStats = columnStats.get(key);
  const canRank = Boolean(multi && colStats);
  const plainCell = (value, text) => {
    const cssClass = canRank ? rankClass_(value, colStats) : "rank-neutral";
    return `<td class="${cssClass}" data-sort-val="${value}">${text}</td>`;
  };

  switch (key) {
    case "latency": {
      const ms = pd.latencyMs;
      return ms === void 0 ? missingCell : plainCell(ms, `${Math.round(ms)}ms`);
    }
    case "cost": {
      const cost = pd.avgDetails.costUsd;
      return cost === void 0 ? missingCell : plainCell(cost, esc(formatCost(cost)));
    }
    case "tokens": {
      const tokens = pd.avgDetails.totalTokens;
      return tokens === void 0 ? missingCell : plainCell(tokens, tokens);
    }
    default:
      break;
  }

  const val = pd.avgScores[key];
  if (val === void 0) {
    return missingCell;
  }
  const pct = Math.round(val * 100);

  // Fixed quality tiers; the bar color always follows them, the text color
  // only when rank-based coloring is unavailable.
  let tierClass;
  let barColor;
  if (val >= 0.8) {
    tierClass = "rank-best";
    barColor = "var(--green)";
  } else if (val >= 0.5) {
    tierClass = "rank-mid";
    barColor = "var(--yellow)";
  } else {
    tierClass = "rank-worst";
    barColor = "var(--red)";
  }
  const rankCls = canRank ? rankClass_(val, colStats) : tierClass;

  return `<td class="score-cell ${rankCls}" data-sort-val="${val}">
<span class="score-val">${pct}%</span>
<div class="score-bar" style="width:${pct}%;background:${barColor}"></div>
</td>`;
}
7440
/**
 * Map a value to a CSS rank class relative to a column's best and worst values.
 *
 * @param {number} value - The cell value.
 * @param {{best?: number, worst?: number}} colStats - Column extremes.
 * @returns {string} "rank-best", "rank-worst", "rank-mid", or "rank-neutral" when
 *   the column has no usable spread (an extreme is missing, or best equals worst).
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  const noSpread = best === undefined || worst === undefined || best === worst;
  if (noSpread) return "rank-neutral";
  switch (value) {
    case best:
      return "rank-best";
    case worst:
      return "rank-worst";
    default:
      return "rank-mid";
  }
}
7447
/**
 * Build the "Summary" section: up to four cards (correctness, latency, cost, overall winner).
 *
 * @param {{id: string, avg: number}|null|undefined} byCorrectness - Leader by correctness.
 * @param {{id: string, avg: number}|null|undefined} byLatency - Leader by latency; skipped when `avg` is Infinity.
 * @param {{id: string, avg?: number}|null|undefined} byCost - Leader by cost; skipped when `avg` is undefined.
 * @param {string|null|undefined} overallWinner - Provider id of the overall winner.
 * @param {boolean} multi - Truthy when comparing providers; switches labels and shows provider names.
 * @returns {string} Section HTML, or "" when no card applies.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  const providerLine = (id) => `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  const card = (label, value, providerHtml) => `<div class="summary-card">
      <div class="card-label">${label}</div>
      <div class="card-value">${value}</div>
      ${providerHtml}
    </div>`;
  // In single-provider mode the provider line is omitted entirely.
  const providerIfMulti = (id) => (multi ? providerLine(id) : "");

  const cards = [];
  if (byCorrectness) {
    const pct = `${Math.round(byCorrectness.avg * 100)}%`;
    cards.push(card(multi ? "Most Correct" : "Avg Correctness", pct, providerIfMulti(byCorrectness.id)));
  }
  if (byLatency && byLatency.avg !== Infinity) {
    const ms = `${Math.round(byLatency.avg)}ms`;
    cards.push(card(multi ? "Fastest" : "Avg Latency", ms, providerIfMulti(byLatency.id)));
  }
  if (byCost?.avg !== undefined) {
    const cost = esc(formatCost(byCost.avg));
    cards.push(card(multi ? "Cheapest" : "Avg Cost", cost, providerIfMulti(byCost.id)));
  }
  if (overallWinner) {
    // The winner card always names the provider, regardless of `multi`.
    cards.push(card("Overall Winner", "&#x1F3C6;", providerLine(overallWinner)));
  }

  if (cards.length === 0) return "";
  return `<section class="summary-section">
    <h2 class="summary-title">Summary</h2>
    <div class="summary-cards">
    ${cards.join("\n ")}
    </div>
    </section>`;
}
7491
/**
 * Render the collapsible "Errors" section of the HTML report.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   Error entries; `count` above 1 adds a "(×N)" suffix and `hint` adds a hint line.
 * @returns {string} Section HTML. Clicking the title toggles the list's visibility.
 */
function renderErrors(errors) {
  const items = errors.map((entry) => {
    const suffix = entry.count > 1 ? ` <span class="error-count">(&times;${entry.count})</span>` : "";
    const hint = entry.hint ? `<div class="error-hint">${esc(entry.hint)}</div>` : "";
    return `<div class="error-item">
      <span class="error-provider">${esc(entry.providerId)}:</span>
      <span class="error-msg">${esc(entry.error)}</span>${suffix}
      ${hint}
    </div>`;
  }).join("\n");
  // The toggle must alternate between 'none' and 'block'; the previous handler set
  // 'block' in both branches, so the list could never be collapsed.
  return `<section class="errors-section">
    <h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
    <div class="errors-list">
    ${items}
    </div>
    </section>`;
}
7508
/**
 * Render the static report footer: a "Powered by" link and a "Star on GitHub" call to action.
 *
 * @returns {string} Footer HTML.
 */
function renderFooter() {
  // Both links share the same destination and link attributes.
  const repoUrl = "https://github.com/DataGobes/agent-duelist";
  const linkAttrs = `href="${repoUrl}" target="_blank" rel="noopener"`;
  return `<footer class="report-footer">
    <div class="footer-brand">
    Powered by <a ${linkAttrs}>Agent Duelist</a>
    </div>
    <a class="footer-cta" ${linkAttrs}>
    &#x2B50; Star on GitHub
    </a>
    </footer>`;
}
7518
/**
 * Emit the inline `<script>` that powers the HTML report: task-tab switching
 * (only when there is more than one task) and click-to-sort table columns.
 *
 * @param {number} taskCount - Number of task sections in the report.
 * @returns {string} A complete `<script>…</script>` element.
 */
function renderScript(taskCount) {
  const tabSwitching = taskCount > 1 ? `
    var tabs = document.querySelectorAll('.task-tab');
    var sections = document.querySelectorAll('.task-section');
    tabs.forEach(function(tab) {
      tab.addEventListener('click', function() {
        var idx = parseInt(tab.getAttribute('data-task'), 10);
        tabs.forEach(function(t) { t.classList.remove('active'); });
        sections.forEach(function(s) { s.classList.remove('active'); });
        tab.classList.add('active');
        sections[idx].classList.add('active');
      });
    });` : "";
  return `<script>
  (function() {
    /* Tab switching */
    ${tabSwitching}

    /* Column sorting */
    document.querySelectorAll('.results-table th').forEach(function(th) {
      var table = th.closest('table');
      var asc = true;
      th.addEventListener('click', function() {
        /* Use the header's position inside its own row. The NodeList index spans
           every table in the document and is out of range for all but the first. */
        var colIdx = th.cellIndex;
        var tbody = table.querySelector('tbody');
        var rows = Array.from(tbody.querySelectorAll('tr'));
        rows.sort(function(a, b) {
          var aCell = a.children[colIdx];
          var bCell = b.children[colIdx];
          var aVal = aCell.getAttribute('data-sort-val');
          var bVal = bCell.getAttribute('data-sort-val');
          if (aVal !== null && bVal !== null) {
            return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
          }
          var aText = aCell.textContent || '';
          var bText = bCell.textContent || '';
          return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
        });
        rows.forEach(function(row) { tbody.appendChild(row); });

        /* Update sort arrows */
        table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
        th.querySelector('.sort-arrow').textContent = asc ? ' \\u25B2' : ' \\u25BC';
        asc = !asc;
      });
    });
  })();
  </script>`;
}
7565
+
7566
+ // src/ci.ts
7567
+ import { readFileSync, writeFileSync, mkdirSync } from "fs";
7568
+ import { dirname } from "path";
7569
// Scorer names for which a smaller value is the better outcome. compareResults passes
// membership in this set to detectRegression/detectImprovement as `lowerIsBetter`.
var LOWER_IS_BETTER = /* @__PURE__ */ new Set(["cost"]);
// Coefficient-of-variation cutoff: a multi-run result whose cv exceeds this is marked flaky.
var FLAKY_CV_THRESHOLD = 0.3;
7571
// Two-sided 95% critical values of Student's t distribution, keyed by degrees of freedom.
// The 40/60/120 rows let the value decay smoothly toward the normal-approximation 1.96
// instead of dropping from 2.042 straight to 1.96 at df = 31.
var T_CRITICAL_95 = {
  1: 12.706,
  2: 4.303,
  3: 3.182,
  4: 2.776,
  5: 2.571,
  6: 2.447,
  7: 2.365,
  8: 2.306,
  9: 2.262,
  10: 2.228,
  15: 2.131,
  20: 2.086,
  25: 2.06,
  30: 2.042,
  40: 2.021,
  60: 2,
  120: 1.98
};
// Tabulated degrees of freedom in ascending order, used for interpolation.
var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95).map(Number).sort((a7, b3) => a7 - b3);
/**
 * Look up the two-sided 95% t critical value for `df` degrees of freedom.
 *
 * @param {number} df - Degrees of freedom (sample size minus one).
 * @returns {number} The tabulated value when `df` is in the table, a linear
 *   interpolation between the neighbouring rows when it falls between two of them,
 *   and 1.96 when `df` is non-positive, beyond the table, or not a number.
 */
function tCritical(df) {
  const NORMAL_APPROX = 1.96;
  if (df <= 0) return NORMAL_APPROX;
  if (T_CRITICAL_95[df] !== undefined) return T_CRITICAL_95[df];
  const keys = T_CRITICAL_KEYS;
  if (df > keys[keys.length - 1]) return NORMAL_APPROX;
  for (let i = 0; i < keys.length - 1; i++) {
    const low = keys[i];
    const high = keys[i + 1];
    if (df > low && df < high) {
      const ratio = (df - low) / (high - low);
      return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
    }
  }
  // Reached for inputs such as NaN or a fractional df below the first row.
  return NORMAL_APPROX;
}
7602
/**
 * Summarise a list of score samples.
 *
 * @param {number[]} samples - Score values for one provider/task/scorer group.
 * @returns {{mean: number, stddev: number, cv: number, n: number, ci95Lower: number, ci95Upper: number}}
 *   Mean, sample standard deviation (n - 1 denominator), coefficient of variation
 *   (0 when the mean is 0), sample count, and the bounds `mean ± tCritical(n - 1) * stddev / sqrt(n)`.
 *   An empty list yields all zeros; a single sample yields a zero-width interval at its value.
 */
function computeScorerStats(samples) {
  const count = samples.length;
  if (count === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  let total = 0;
  for (const sample of samples) total += sample;
  const mean = total / count;
  if (count === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  let squaredDeviations = 0;
  for (const sample of samples) squaredDeviations += (sample - mean) ** 2;
  const stddev = Math.sqrt(squaredDeviations / (count - 1));
  const cv = mean === 0 ? 0 : stddev / Math.abs(mean);
  const standardError = stddev / Math.sqrt(count);
  const margin = tCritical(count - 1) * standardError;
  return {
    mean,
    stddev,
    cv,
    n: count,
    ci95Lower: mean - margin,
    ci95Upper: mean + margin
  };
}
7625
/**
 * Build the composite map key identifying one provider/task/scorer group.
 *
 * @param {string} providerId
 * @param {string} taskName
 * @param {string} scorerName
 * @returns {string} The three parts joined by "::".
 */
function groupKey(providerId, taskName, scorerName) {
  const separator = "::";
  return `${providerId}${separator}${taskName}${separator}${scorerName}`;
}
7628
/**
 * Group score values by provider/task/scorer and compute statistics per group.
 *
 * Results with an `error` are ignored, as are scores with a negative value.
 *
 * @param {Array<{providerId: string, taskName: string, error?: unknown, scores: Array<{name: string, value: number}>}>} results
 * @returns {Map<string, object>} Map from `groupKey(...)` to that group's `computeScorerStats` output.
 */
function computeStats(results) {
  const samplesByKey = /* @__PURE__ */ new Map();
  for (const result of results) {
    if (result.error) continue;
    for (const score of result.scores) {
      if (score.value < 0) continue;
      const key = groupKey(result.providerId, result.taskName, score.name);
      const bucket = samplesByKey.get(key);
      if (bucket) {
        bucket.push(score.value);
      } else {
        samplesByKey.set(key, [score.value]);
      }
    }
  }
  const stats = /* @__PURE__ */ new Map();
  samplesByKey.forEach((samples, key) => {
    stats.set(key, computeScorerStats(samples));
  });
  return stats;
}
7645
/**
 * Total the estimated USD cost of a run, overall and per provider.
 *
 * A result contributes only when it has no error and carries a "cost" score with a
 * non-negative value and a positive `details.estimatedUsd`.
 *
 * @param {Array<object>} results - Benchmark results.
 * @param {number|undefined} budget - Optional spending limit in USD.
 * @returns {{totalUsd: number, perProvider: Map<string, number>, budget: number|undefined, overBudget: boolean}}
 *   `overBudget` is true only when a budget is given and the total strictly exceeds it.
 */
function computeCostSummary(results, budget) {
  const perProvider = /* @__PURE__ */ new Map();
  let totalUsd = 0;

  for (const result of results) {
    if (result.error) continue;
    const costScore = result.scores.find((score) => score.name === "cost");
    if (!costScore || costScore.value < 0) continue;
    const usd = costScore.details?.estimatedUsd ?? 0;
    if (usd <= 0) continue;
    totalUsd += usd;
    const spentSoFar = perProvider.get(result.providerId) ?? 0;
    perProvider.set(result.providerId, spentSoFar + usd);
  }

  const overBudget = budget !== undefined && totalUsd > budget;
  return { totalUsd, perProvider, budget, overBudget };
}
7665
/**
 * Compare the current run's statistics against a baseline and decide whether CI fails.
 *
 * @param {Map<string, object>|null} baselineStats - Stats keyed by `groupKey`, or null when there is no baseline.
 * @param {Map<string, object>} currentStats - Stats for the current run, keyed by `groupKey`.
 * @param {Map<string, number>} thresholds - Regression threshold per scorer name; scorers
 *   without an entry are never marked regressed or improved.
 * @param {number|undefined} budget - Optional total cost limit in USD.
 * @param {Array<object>} [currentResults] - Raw results used for the cost summary.
 * @returns {{comparisons: object[], cost: object, failed: boolean, flakyResults: object[], failureReasons: string[]}}
 *   `failed` is true when any scorer regressed or the budget was exceeded.
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  // Keys are "provider::task::scorer". Task names are free text and may themselves
  // contain "::", so split on the first and last separator rather than on every one.
  // Assumes provider ids and scorer names do not contain "::".
  const splitGroupKey = (key) => {
    const first = key.indexOf("::");
    const last = key.lastIndexOf("::");
    if (first === -1 || first === last) return key.split("::");
    return [key.slice(0, first), key.slice(first + 2, last), key.slice(last + 2)];
  };

  const comparisons = [];
  const failureReasons = [];
  for (const [key, current] of currentStats) {
    const [providerId, taskName, scorerName] = splitGroupKey(key);
    const baseline = baselineStats?.get(key) ?? null;
    let delta = null;
    let regressed = false;
    let improved = false;
    if (baseline) {
      delta = current.mean - baseline.mean;
      const threshold = thresholds.get(scorerName);
      if (threshold !== undefined) {
        const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
        regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
        improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
      }
    }
    // Variance is only meaningful with more than one run.
    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
    comparisons.push({
      providerId,
      taskName,
      scorerName,
      baseline,
      current,
      delta,
      regressed,
      improved,
      flaky
    });
  }

  const cost = computeCostSummary(currentResults ?? [], budget);
  for (const regression of comparisons.filter((c3) => c3.regressed)) {
    failureReasons.push(
      `${regression.providerId} \xD7 ${regression.taskName}: ${regression.scorerName} regressed by ${formatDelta(regression.delta)}`
    );
  }
  if (cost.overBudget) {
    failureReasons.push(
      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
    );
  }
  const flakyResults = comparisons.filter((c3) => c3.flaky);
  const failed = failureReasons.length > 0;
  return { comparisons, cost, failed, flakyResults, failureReasons };
}
7714
/**
 * Decide whether the current stats are a regression against the baseline.
 *
 * With a single run on both sides the raw mean difference is compared to the
 * threshold. Otherwise the 95% intervals are compared: a regression is reported only
 * when the intervals are separated, in the worse direction, by more than the threshold.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold - Minimum change that counts.
 * @param {boolean} lowerIsBetter - True for scorers where smaller values are better.
 * @returns {boolean}
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  if (baseline.n === 1 && current.n === 1) {
    const delta = current.mean - baseline.mean;
    return lowerIsBetter ? delta > threshold : delta < -threshold;
  }
  if (lowerIsBetter) {
    return current.ci95Lower - baseline.ci95Upper > threshold;
  }
  // Mirror of the lower-is-better branch and of detectImprovement: compare the near
  // edges of the two intervals. The previous form compared baseline.ci95Upper with
  // current.ci95Lower (the far edges), which flagged overlapping noisy intervals.
  return baseline.ci95Lower - current.ci95Upper > threshold && current.mean < baseline.mean;
}
7725
/**
 * Decide whether the current stats are an improvement over the baseline.
 *
 * With a single run on both sides the raw mean difference is compared to the
 * threshold. Otherwise the better side's interval must clear the other side's
 * interval by more than the threshold.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold - Minimum change that counts.
 * @param {boolean} lowerIsBetter - True for scorers where smaller values are better.
 * @returns {boolean}
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  const bothSingleRun = baseline.n === 1 && current.n === 1;
  if (bothSingleRun) {
    const change = current.mean - baseline.mean;
    if (lowerIsBetter) {
      return change < -threshold;
    }
    return change > threshold;
  }
  // For an improvement the baseline sits above the current run when lower is better,
  // and below it otherwise; the gap is measured between the facing interval edges.
  const [upperSide, lowerSide] = lowerIsBetter ? [baseline, current] : [current, baseline];
  return upperSide.ci95Lower - lowerSide.ci95Upper > threshold;
}
7736
/**
 * Load a previously saved baseline file.
 *
 * Accepts either `{ timestamp, results: [...] }` or a bare results array.
 *
 * @param {string} path - Path to the baseline JSON file.
 * @returns {{timestamp: string, results: Array<object>}|null} The baseline, or null when
 *   the file is missing, unreadable, not valid JSON, or does not contain a results array.
 *   Every case except a missing file also prints a warning, so a corrupt baseline is
 *   not mistaken for "no baseline yet".
 */
function loadBaseline(path) {
  let raw;
  try {
    raw = readFileSync(path, "utf-8");
  } catch (err) {
    // A missing file is the normal first-run case and stays silent.
    if (err?.code !== "ENOENT") {
      console.warn(`Could not read baseline ${path}: ${err instanceof Error ? err.message : String(err)}`);
    }
    return null;
  }

  let data;
  try {
    data = JSON.parse(raw);
  } catch (err) {
    console.warn(`Ignoring baseline ${path}: invalid JSON (${err instanceof Error ? err.message : String(err)})`);
    return null;
  }

  const results = data?.results ?? data;
  if (!Array.isArray(results)) {
    console.warn(`Ignoring baseline ${path}: expected a results array`);
    return null;
  }
  return {
    timestamp: data.timestamp ?? "unknown",
    results
  };
}
7750
/**
 * Write results to a baseline file, creating parent directories as needed.
 *
 * @param {string} path - Destination file path.
 * @param {Array<object>} results - Results to store.
 * @returns {void} The file holds `{ timestamp, results }` as 2-space-indented JSON,
 *   where `timestamp` is the current time in ISO 8601 form.
 */
function saveBaseline(path, results) {
  const parentDir = dirname(path);
  mkdirSync(parentDir, { recursive: true });
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const payload = { timestamp, results };
  writeFileSync(path, JSON.stringify(payload, null, 2));
}
7758
+
7759
+ // src/github.ts
7760
+ import { readFileSync as readFileSync2 } from "fs";
7761
/**
 * Work out which GitHub pull request, if any, the current process is running for.
 *
 * Reads GITHUB_TOKEN and GITHUB_REPOSITORY ("owner/repo") from the environment. The PR
 * number is taken from the JSON file at GITHUB_EVENT_PATH (`pull_request.number`, or
 * `issue.number` when the issue carries a `pull_request` field), falling back to
 * DUELIST_PR_NUMBER.
 *
 * @returns {{token: string, owner: string, repo: string, prNumber: number}|null}
 *   The context, or null when the token, repository or PR number cannot be determined.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath,
    DUELIST_PR_NUMBER: prOverride
  } = process.env;
  if (!token || !repository) return null;

  const [owner, repo] = repository.split("/");
  if (!owner || !repo) return null;

  const isObject = (value) => Boolean(value) && typeof value === "object";
  const prNumberFromEvent = () => {
    try {
      const event = JSON.parse(readFileSync2(eventPath, "utf-8"));
      const fromPullRequest = isObject(event.pull_request) ? event.pull_request.number : undefined;
      if (fromPullRequest) return fromPullRequest;
      if (isObject(event.issue) && event.issue.pull_request) return event.issue.number;
      return fromPullRequest;
    } catch {
      // Best effort: an unreadable or malformed event file just means no number from it.
      return undefined;
    }
  };

  let prNumber = eventPath ? prNumberFromEvent() : undefined;
  if (!prNumber && prOverride) {
    prNumber = parseInt(prOverride, 10);
  }
  return prNumber ? { token, owner, repo, prNumber } : null;
}
7791
// Base URL that the comment helpers below prefix to every GitHub REST API path.
var API_BASE = "https://api.github.com";
7792
/**
 * Build the HTTP headers for a GitHub REST API request.
 *
 * @param {string} token - Token sent as a Bearer credential.
 * @param {Record<string, string>} [extra] - Additional headers; these override the defaults on conflict.
 * @returns {Record<string, string>} Authorization, Accept and API-version headers merged with `extra`.
 */
function ghHeaders(token, extra) {
  const defaults = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };
  return Object.assign(defaults, extra);
}
7800
/**
 * Find the first comment on the pull request whose body contains `marker`.
 *
 * Pages through the issue-comments endpoint 50 at a time and stops at the first
 * match, the first short or empty page, or the first non-OK response.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx - GitHub context.
 * @param {string} marker - Text that identifies the comment.
 * @returns {Promise<number|null>} The matching comment's id, or null when none is found
 *   or a request fails.
 */
async function findExistingComment(ctx, marker) {
  const pageSize = 50;
  for (let page = 1; ; page++) {
    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${pageSize}&page=${page}`;
    const res = await fetch(url, { headers: ghHeaders(ctx.token) });
    if (!res.ok) return null;
    const comments = await res.json();
    const match = comments.find((comment) => comment.body?.includes(marker));
    if (match) return match.id;
    // A page shorter than the page size (including an empty one) is the last page.
    if (comments.length < pageSize) return null;
  }
}
7819
/**
 * Create the PR comment, or update it in place when one containing `marker` already exists.
 *
 * A non-OK response is reported with `console.warn` and does not throw.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx - GitHub context.
 * @param {string} body - Comment body to post.
 * @param {string} marker - Text used to recognise a previously posted comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const existingId = await findExistingComment(ctx, marker);
  const issuesUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues`;
  // Updating and creating differ only in endpoint, HTTP method and the word used in the warning.
  const request = existingId
    ? { url: `${issuesUrl}/comments/${existingId}`, method: "PATCH", verb: "update" }
    : { url: `${issuesUrl}/${ctx.prNumber}/comments`, method: "POST", verb: "create" };

  const res = await fetch(request.url, {
    method: request.method,
    headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
    body: JSON.stringify({ body })
  });
  if (res.ok) return;
  const text = await res.text();
  console.warn(`Failed to ${request.verb} PR comment: ${res.status} ${text}`);
}
7845
+
7846
+ // src/cli.ts
7847
// Directory of this module, derived from its file URL; later used to locate
// ../templates and ../package.json relative to it.
var __dirname2 = dirname2(fileURLToPath(import.meta.url));
// Root CLI program; the init, run and ci sub-commands are registered on it below.
var program = new Command();
program.name("duelist").description("Pit LLM providers against each other on agent tasks.").version(getVersion());
7850
// `duelist init`: write a starter arena.config.ts into the current directory.
// Refuses to overwrite an existing file unless --force is given.
program.command("init").description("Scaffold an arena.config.ts in the current directory").option("--force", "Overwrite existing config file").action((opts) => {
  const target = resolve("arena.config.ts");
  // Capture this before writing: once the file is written it always exists, so checking
  // afterwards would report "Overwrote" even for a newly created file.
  const alreadyExisted = existsSync(target);
  if (alreadyExisted && !opts.force) {
    console.error("arena.config.ts already exists. Use --force to overwrite.");
    process.exit(1);
  }
  // Prefer the template file shipped next to the bundle; fall back to the embedded default.
  const templatePath = join(__dirname2, "..", "templates", "arena.config.ts");
  let template;
  if (existsSync(templatePath)) {
    template = readFileSync3(templatePath, "utf-8");
  } else {
    template = DEFAULT_TEMPLATE;
  }
  writeFileSync2(target, template);
  console.log(alreadyExisted ? "Overwrote arena.config.ts" : "Created arena.config.ts");
  console.log("");
  console.log("Next steps:");
  console.log(" 1. export OPENAI_API_KEY=sk-...");
  console.log(" 2. npx duelist run");
});
6559
// `duelist run`: load the arena config, run every benchmark and report the results
// through the selected reporter (console, json or html).
program.command("run").description("Run benchmarks defined in your arena config").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--reporter <type>", "Output format: console, json, or html", "console").option("--output <path>", "Output file path (used with html reporter)", "duelist-report.html").option("-q, --quiet", "Suppress per-result progress (show only final report)").action(async (opts) => {
  // Reject an unknown reporter before the config is loaded or any benchmark runs.
  if (!["console", "json", "html"].includes(opts.reporter)) {
    console.error(`Unknown reporter "${opts.reporter}". Use "console", "json", or "html".`);
    process.exit(1);
  }
  const typedArena = await loadArenaConfig(opts.config);
  try {
    // Per-result progress lines are suppressed for the json reporter and under --quiet.
    const showProgress = opts.reporter !== "json" && !opts.quiet;
    const onResult = showProgress ? logResult : void 0;
    const results = await typedArena.run({ onResult });
    if (opts.reporter === "json") {
      console.log(jsonReporter(results));
    } else if (opts.reporter === "html") {
      const html = htmlReporter(results);
      const outPath = resolve(opts.output);
      // Create the output directory if needed before writing the report.
      mkdirSync2(dirname2(outPath), { recursive: true });
      writeFileSync2(outPath, html);
      console.log(`
HTML report written to ${outPath}`);
    } else {
      console.log("");
      consoleReporter(results, { sparklines: typedArena.config?.sparklines });
    }
    // Exit non-zero only when there were results and every one of them errored.
    const allFailed = results.length > 0 && results.every((r3) => r3.error);
    if (allFailed) process.exit(1);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`Benchmark failed: ${message}`);
    process.exit(1);
  }
});
7901
/**
 * Option parser for the repeatable `--threshold scorer=delta` flag.
 *
 * Prints an error and exits with status 1 when the value is not `scorer=<number>`.
 * An empty or whitespace-only delta (for example "correctness=") and a value with more
 * than one "=" are rejected: `Number("")` is 0, so without the explicit check an empty
 * delta would be accepted as a threshold of 0.
 *
 * @param {string} value - Raw option value, e.g. "correctness=0.1".
 * @param {Map<string, number>} previous - Accumulator of thresholds parsed so far.
 * @returns {Map<string, number>} `previous`, with the new scorer threshold set.
 */
function collectThreshold(value, previous) {
  // Split at the first "=" only, so any further "=" ends up in the delta and fails the number check.
  const separatorAt = value.indexOf("=");
  const scorer = separatorAt === -1 ? value : value.slice(0, separatorAt);
  const delta = separatorAt === -1 ? undefined : value.slice(separatorAt + 1);
  const invalid = !scorer || delta === undefined || delta.trim() === "" || Number.isNaN(Number(delta));
  if (invalid) {
    console.error(`Invalid threshold format: "${value}". Expected scorer=delta (e.g., correctness=0.1)`);
    process.exit(1);
  }
  previous.set(scorer, Number(delta));
  return previous;
}
7910
// `duelist ci`: run the benchmarks, compare them with a stored baseline, print the
// outcome, optionally post it as a PR comment and update the baseline, then exit
// with status 1 when a quality gate failed and 0 otherwise.
program.command("ci").description("Run benchmarks, compare against baseline, and enforce quality gates").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--baseline <path>", "Baseline JSON file", ".duelist/baseline.json").option("--budget <dollars>", "Max total cost in USD", parseFloat).option("--threshold <scorer=delta>", "Regression threshold (repeatable)", collectThreshold, /* @__PURE__ */ new Map()).option("--update-baseline", "Save results as new baseline after passing").option("--comment", "Post results as GitHub PR comment").option("-q, --quiet", "Suppress per-result progress").action(async (opts) => {
  // Normalise the parsed options; boolean flags default to false when absent.
  const ciOpts = {
    configPath: opts.config,
    baselinePath: resolve(opts.baseline),
    budget: opts.budget,
    thresholds: opts.threshold,
    updateBaseline: opts.updateBaseline ?? false,
    comment: opts.comment ?? false,
    quiet: opts.quiet ?? false
  };
  const typedArena = await loadArenaConfig(ciOpts.configPath);
  console.log("Running benchmarks...");
  const onResult = ciOpts.quiet ? void 0 : logResult;
  let results;
  try {
    results = await typedArena.run({ onResult });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`Benchmark failed: ${message}`);
    process.exit(1);
  }
  // loadBaseline returns null when no usable baseline exists; the comparison then has
  // nothing to compare against.
  const baseline = loadBaseline(ciOpts.baselinePath);
  const baselineStats = baseline ? computeStats(baseline.results) : null;
  if (baseline) {
    console.log(`Loaded baseline from ${ciOpts.baselinePath} (${baseline.timestamp})`);
  } else {
    console.log("No baseline found \u2014 this run establishes the first baseline.");
  }
  const currentStats = computeStats(results);
  const report = compareResults(baselineStats, currentStats, ciOpts.thresholds, ciOpts.budget, results);
  console.log("");
  consoleReporter(results, { sparklines: typedArena.config?.sparklines ?? true });
  if (report.flakyResults.length > 0) {
    console.log(`\u26A0 ${report.flakyResults.length} flaky result(s) detected (high variance)`);
  }
  if (report.cost.overBudget) {
    console.log(`\u{1F534} Budget exceeded: $${report.cost.totalUsd.toFixed(4)} > $${report.cost.budget.toFixed(2)}`);
  }
  for (const reason of report.failureReasons) {
    console.log(`\u{1F534} ${reason}`);
  }
  if (!report.failed) {
    console.log("\u{1F7E2} CI passed");
  }
  if (ciOpts.comment) {
    // Commenting needs a GitHub PR context; without one the step is skipped with a warning.
    const ghCtx = detectGitHubContext();
    if (ghCtx) {
      const markdown = markdownReporter(report, results);
      try {
        await upsertPrComment(ghCtx, markdown, COMMENT_MARKER);
        console.log("Posted results to PR comment.");
      } catch (err) {
        // A failure to post is reported but does not change the exit status.
        console.warn(`Failed to post PR comment: ${err instanceof Error ? err.message : err}`);
      }
    } else {
      console.warn("--comment: not in a GitHub Actions PR context, skipping.");
    }
  }
  // The baseline is replaced only after a passing run.
  if (ciOpts.updateBaseline && !report.failed) {
    saveBaseline(ciOpts.baselinePath, results);
    console.log(`Baseline saved to ${ciOpts.baselinePath}`);
  } else if (ciOpts.updateBaseline && report.failed) {
    console.log("Baseline not updated (CI failed).");
  }
  process.exit(report.failed ? 1 : 0);
});
7976
+ program.parse();
7977
+ async function loadArenaConfig(configOpt) {
7978
+ const configPath = resolve(configOpt);
6561
7979
  if (!existsSync(configPath)) {
6562
7980
  console.error(`Config not found: ${configPath}`);
6563
7981
  console.error("");
6564
7982
  console.error("Create one with: npx duelist init");
6565
7983
  process.exit(1);
6566
7984
  }
6567
- if (!["console", "json"].includes(opts.reporter)) {
6568
- console.error(`Unknown reporter "${opts.reporter}". Use "console" or "json".`);
6569
- process.exit(1);
6570
- }
6571
7985
  let mod;
6572
7986
  try {
6573
7987
  if (configPath.endsWith(".ts")) {
@@ -6591,35 +8005,21 @@ program.command("run").description("Run benchmarks defined in your arena config"
6591
8005
  console.error(`Loaded from: ${configPath}`);
6592
8006
  process.exit(1);
6593
8007
  }
6594
- try {
6595
- const typedArena = arena;
6596
- const showProgress = opts.reporter === "console" && !opts.quiet;
6597
- const onResult = showProgress ? (result) => {
6598
- if (result.error) {
6599
- console.log(` ${result.providerId} \xD7 ${result.taskName}: ERROR ${result.error}`);
6600
- } else {
6601
- const scores = result.scores.map((s5) => `${s5.name}=${formatScoreForLog(s5)}`).join(" ");
6602
- console.log(` ${result.providerId} \xD7 ${result.taskName}: ${scores}`);
6603
- }
6604
- } : void 0;
6605
- const results = await typedArena.run({ onResult });
6606
- const { consoleReporter: consoleReporter2 } = await Promise.resolve().then(() => (init_console(), console_exports));
6607
- const { jsonReporter: jsonReporter2 } = await Promise.resolve().then(() => (init_json(), json_exports));
6608
- if (opts.reporter === "json") {
6609
- console.log(jsonReporter2(results));
6610
- } else {
6611
- console.log("");
6612
- consoleReporter2(results);
6613
- }
6614
- const allFailed = results.length > 0 && results.every((r3) => r3.error);
6615
- if (allFailed) process.exit(1);
6616
- } catch (err) {
6617
- const message = err instanceof Error ? err.message : String(err);
6618
- console.error(`Benchmark failed: ${message}`);
6619
- process.exit(1);
8008
+ const maybeConfig = arena.config;
8009
+ if (maybeConfig === void 0 || maybeConfig === null || typeof maybeConfig !== "object") {
8010
+ ;
8011
+ arena.config = {};
6620
8012
  }
6621
- });
6622
- program.parse();
8013
+ return arena;
8014
+ }
8015
/**
 * Print a one-line progress entry for a single benchmark result.
 *
 * @param {{providerId: string, taskName: string, error?: unknown, scores: Array<object>}} result
 *   Failed results print "ERROR <error>"; otherwise each score prints as `name=<formatted value>`.
 * @returns {void}
 */
function logResult(result) {
  const prefix = ` ${result.providerId} \xD7 ${result.taskName}: `;
  if (result.error) {
    console.log(`${prefix}ERROR ${result.error}`);
    return;
  }
  const renderedScores = result.scores.map((score) => `${score.name}=${formatScoreForLog(score)}`);
  console.log(`${prefix}${renderedScores.join(" ")}`);
}
6623
8023
  async function importTypeScript(filePath) {
6624
8024
  try {
6625
8025
  await Promise.resolve().then(() => (init_api(), api_exports));
@@ -6642,17 +8042,16 @@ function formatScoreForLog(s5) {
6642
8042
  return `${Math.round(details.ms)}ms`;
6643
8043
  }
6644
8044
  if (s5.name === "cost" && details?.estimatedUsd != null) {
6645
- const usd = details.estimatedUsd;
6646
- if (usd === 0) return "$0.00";
6647
- if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
6648
- const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
6649
- return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
8045
+ return formatCost(details.estimatedUsd);
8046
+ }
8047
+ if (s5.value < 0 && details?.reason) {
8048
+ return `SKIP (${details.reason})`;
6650
8049
  }
6651
8050
  return String(s5.value);
6652
8051
  }
6653
8052
  function getVersion() {
6654
8053
  try {
6655
- const pkg = readFileSync(join(__dirname2, "..", "package.json"), "utf-8");
8054
+ const pkg = readFileSync3(join(__dirname2, "..", "package.json"), "utf-8");
6656
8055
  return JSON.parse(pkg).version ?? "0.0.0";
6657
8056
  } catch {
6658
8057
  return "0.0.0";
@@ -6674,12 +8073,12 @@ import { z } from 'zod'
6674
8073
 
6675
8074
  export default defineArena({
6676
8075
  providers: [
6677
- openai('gpt-4o-mini'),
8076
+ openai('gpt-5-mini'),
6678
8077
  // Add more providers to compare:
6679
- // openai('gpt-4o'),
6680
- // azureOpenai('gpt-4o-mini'),
6681
- // anthropic('claude-sonnet-4-20250514'),
6682
- // gemini('gemini-2.5-flash'),
8078
+ // openai('gpt-5.2'),
8079
+ // azureOpenai('gpt-5-mini'),
8080
+ // anthropic('claude-sonnet-4.6'),
8081
+ // gemini('gemini-3-flash-preview'),
6683
8082
  ],
6684
8083
 
6685
8084
  tasks: [