agent-duelist 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -59
- package/dist/cli.js +1793 -394
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1774 -396
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +73 -8
- package/dist/index.d.ts +73 -8
- package/dist/index.js +1765 -395
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/templates/arena.config.ts +5 -5
package/dist/cli.js
CHANGED
|
@@ -38,343 +38,6 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
38
38
|
mod
|
|
39
39
|
));
|
|
40
40
|
|
|
41
|
-
// src/reporter/console.ts
|
|
42
|
-
var console_exports = {};
|
|
43
|
-
__export(console_exports, {
|
|
44
|
-
consoleReporter: () => consoleReporter
|
|
45
|
-
});
|
|
46
|
-
function bold(s5) {
|
|
47
|
-
return `${boldCode}${s5}${reset}`;
|
|
48
|
-
}
|
|
49
|
-
function dim(s5) {
|
|
50
|
-
return `${dimCode}${s5}${reset}`;
|
|
51
|
-
}
|
|
52
|
-
function colorScore(value) {
|
|
53
|
-
const pct = Math.round(value * 100);
|
|
54
|
-
const str = `${pct}%`;
|
|
55
|
-
if (value >= 0.8) return `${green}${str}${reset}`;
|
|
56
|
-
if (value >= 0.5) return `${yellow}${str}${reset}`;
|
|
57
|
-
return `${red}${str}${reset}`;
|
|
58
|
-
}
|
|
59
|
-
function consoleReporter(results) {
|
|
60
|
-
if (results.length === 0) {
|
|
61
|
-
console.log("\nNo results to display.\n");
|
|
62
|
-
return;
|
|
63
|
-
}
|
|
64
|
-
const tasks = [...new Set(results.map((r3) => r3.taskName))];
|
|
65
|
-
const providers = [...new Set(results.map((r3) => r3.providerId))];
|
|
66
|
-
const scorerNames = [...new Set(results.flatMap((r3) => r3.scores.map((s5) => s5.name)))];
|
|
67
|
-
const hasCost = scorerNames.includes("cost");
|
|
68
|
-
const hasErrors = results.some((r3) => r3.error);
|
|
69
|
-
const runsPerCell = Math.max(...results.map((r3) => r3.run));
|
|
70
|
-
const runLabel = runsPerCell > 1 ? ` (${runsPerCell} runs each)` : "";
|
|
71
|
-
console.log("");
|
|
72
|
-
console.log(` ${bold(`\u2B21 Agent Duelist Results${runLabel}`)}`);
|
|
73
|
-
console.log(` ${dim("\u2500".repeat(70))}`);
|
|
74
|
-
console.log("");
|
|
75
|
-
for (const task of tasks) {
|
|
76
|
-
console.log(` ${bold(`Task: ${task}`)}`);
|
|
77
|
-
const cols = [{ label: "Provider", width: 22, align: "left" }];
|
|
78
|
-
for (const name of scorerNames) {
|
|
79
|
-
if (name === "latency") cols.push({ label: "Latency", width: 10, align: "right" });
|
|
80
|
-
else if (name === "cost") {
|
|
81
|
-
cols.push({ label: "Cost", width: 12, align: "right" });
|
|
82
|
-
cols.push({ label: "Tokens", width: 9, align: "right" });
|
|
83
|
-
} else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
|
|
84
|
-
else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
|
|
85
|
-
else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
|
|
86
|
-
else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
|
|
87
|
-
else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
|
|
88
|
-
else cols.push({ label: name, width: 10, align: "right" });
|
|
89
|
-
}
|
|
90
|
-
if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
|
|
91
|
-
const totalWidth = cols.reduce((sum, c3) => sum + c3.width + 2, 0);
|
|
92
|
-
console.log(` ${dim(cols.map((c3) => pad(c3.label, c3.width + 2, c3.align)).join(""))}`);
|
|
93
|
-
console.log(` ${dim("\u2500".repeat(totalWidth))}`);
|
|
94
|
-
for (const provider of providers) {
|
|
95
|
-
const taskResults = results.filter(
|
|
96
|
-
(r3) => r3.taskName === task && r3.providerId === provider
|
|
97
|
-
);
|
|
98
|
-
const errorResults2 = taskResults.filter((r3) => r3.error);
|
|
99
|
-
const successResults = taskResults.filter((r3) => !r3.error);
|
|
100
|
-
if (successResults.length === 0 && errorResults2.length > 0) {
|
|
101
|
-
const cells2 = [pad(provider, 24, "left")];
|
|
102
|
-
for (const name of scorerNames) {
|
|
103
|
-
if (name === "cost") {
|
|
104
|
-
cells2.push(pad("\u2014", 14, "right"));
|
|
105
|
-
cells2.push(pad("\u2014", 11, "right"));
|
|
106
|
-
} else cells2.push(pad("\u2014", cols.find((c3) => c3.label !== "Provider").width + 2, "right"));
|
|
107
|
-
}
|
|
108
|
-
if (hasErrors) cells2.push(` ${red}FAIL${reset}`);
|
|
109
|
-
console.log(` ${cells2.join("")}`);
|
|
110
|
-
continue;
|
|
111
|
-
}
|
|
112
|
-
const avgScores = averageScores(successResults);
|
|
113
|
-
const avgDetails = averageDetails(successResults);
|
|
114
|
-
const latencyMs = average(successResults.map((r3) => r3.raw.latencyMs));
|
|
115
|
-
const cells = [pad(provider, 24, "left")];
|
|
116
|
-
for (const name of scorerNames) {
|
|
117
|
-
if (name === "latency") {
|
|
118
|
-
cells.push(pad(latencyMs !== void 0 ? `${Math.round(latencyMs)}ms` : "\u2014", 12, "right"));
|
|
119
|
-
} else if (name === "cost") {
|
|
120
|
-
cells.push(pad(formatCost(avgDetails.costUsd), 14, "right"));
|
|
121
|
-
cells.push(pad(avgDetails.totalTokens !== void 0 ? `${avgDetails.totalTokens}` : "\u2014", 11, "right"));
|
|
122
|
-
} else {
|
|
123
|
-
const val = avgScores[name];
|
|
124
|
-
if (val === void 0) cells.push(pad("\u2014", 10, "right"));
|
|
125
|
-
else cells.push(pad(colorScore(val), 10 + colorLen(colorScore(val)), "right"));
|
|
126
|
-
}
|
|
127
|
-
}
|
|
128
|
-
if (hasErrors) {
|
|
129
|
-
const failCount = errorResults2.length;
|
|
130
|
-
cells.push(failCount > 0 ? ` ${yellow}${failCount} err${reset}` : ` ${green}OK${reset}`);
|
|
131
|
-
}
|
|
132
|
-
console.log(` ${cells.join("")}`);
|
|
133
|
-
}
|
|
134
|
-
console.log("");
|
|
135
|
-
}
|
|
136
|
-
printSummary(results, providers);
|
|
137
|
-
const errorResults = results.filter((r3) => r3.error);
|
|
138
|
-
if (errorResults.length > 0) {
|
|
139
|
-
console.log(` ${bold("Errors")}`);
|
|
140
|
-
console.log(` ${dim("\u2500".repeat(70))}`);
|
|
141
|
-
const seen = /* @__PURE__ */ new Set();
|
|
142
|
-
for (const r3 of errorResults) {
|
|
143
|
-
const key = `${r3.providerId}::${r3.error}`;
|
|
144
|
-
if (seen.has(key)) continue;
|
|
145
|
-
seen.add(key);
|
|
146
|
-
const count = errorResults.filter((e5) => e5.providerId === r3.providerId && e5.error === r3.error).length;
|
|
147
|
-
const suffix = count > 1 ? ` (\xD7${count})` : "";
|
|
148
|
-
console.log(` ${red}\u2717${reset} ${r3.providerId}: ${r3.error}${suffix}`);
|
|
149
|
-
const hint = apiKeyHint(r3.providerId, r3.error ?? "");
|
|
150
|
-
if (hint) console.log(` ${dim(hint)}`);
|
|
151
|
-
}
|
|
152
|
-
console.log("");
|
|
153
|
-
}
|
|
154
|
-
if (hasCost) {
|
|
155
|
-
console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
|
|
156
|
-
console.log("");
|
|
157
|
-
}
|
|
158
|
-
}
|
|
159
|
-
function printSummary(results, providers) {
|
|
160
|
-
const successResults = results.filter((r3) => !r3.error);
|
|
161
|
-
if (successResults.length === 0) return;
|
|
162
|
-
console.log(` ${dim("\u2500".repeat(70))}`);
|
|
163
|
-
console.log(` ${bold("Summary")}`);
|
|
164
|
-
console.log("");
|
|
165
|
-
const single = providers.length === 1;
|
|
166
|
-
const correctnessKey = successResults.some((r3) => r3.scores.some((s5) => s5.name === "llm-judge-correctness" && s5.value >= 0)) ? "llm-judge-correctness" : "correctness";
|
|
167
|
-
const byCorrectness = rankProviders(successResults, providers, correctnessKey);
|
|
168
|
-
if (byCorrectness) {
|
|
169
|
-
const label = single ? "Avg correctness" : `Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))}`;
|
|
170
|
-
console.log(` ${cyan}\u25C6${reset} ${label} (avg ${colorScore(byCorrectness.avg)})`);
|
|
171
|
-
}
|
|
172
|
-
const byLatency = providers.map((id) => {
|
|
173
|
-
const runs = successResults.filter((r3) => r3.providerId === id);
|
|
174
|
-
const avg = average(runs.map((r3) => r3.raw.latencyMs));
|
|
175
|
-
return { id, avg: avg ?? Infinity };
|
|
176
|
-
}).sort((a7, b3) => a7.avg - b3.avg)[0];
|
|
177
|
-
if (byLatency && byLatency.avg !== Infinity) {
|
|
178
|
-
const label = single ? "Avg latency" : `Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}`;
|
|
179
|
-
console.log(` ${cyan}\u25C6${reset} ${label} (avg ${Math.round(byLatency.avg)}ms)`);
|
|
180
|
-
}
|
|
181
|
-
const byCost = providers.map((id) => {
|
|
182
|
-
const runs = successResults.filter((r3) => r3.providerId === id);
|
|
183
|
-
const costs = runs.map((r3) => {
|
|
184
|
-
const s5 = r3.scores.find((s6) => s6.name === "cost");
|
|
185
|
-
return s5 && s5.value >= 0 ? s5.value : void 0;
|
|
186
|
-
}).filter((c3) => c3 !== void 0);
|
|
187
|
-
const avg = costs.length > 0 ? costs.reduce((a7, b3) => a7 + b3, 0) / costs.length : void 0;
|
|
188
|
-
return { id, avg };
|
|
189
|
-
}).filter((p5) => p5.avg !== void 0).sort((a7, b3) => a7.avg - b3.avg)[0];
|
|
190
|
-
if (byCost?.avg !== void 0) {
|
|
191
|
-
const label = single ? "Avg cost" : `Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}`;
|
|
192
|
-
console.log(` ${cyan}\u25C6${reset} ${label} (avg ${formatCost(byCost.avg)})`);
|
|
193
|
-
}
|
|
194
|
-
console.log("");
|
|
195
|
-
}
|
|
196
|
-
function rankProviders(results, providers, scorerName) {
|
|
197
|
-
const ranked = providers.map((id) => {
|
|
198
|
-
const runs = results.filter((r3) => r3.providerId === id);
|
|
199
|
-
const scores = runs.flatMap((r3) => r3.scores.filter((s5) => s5.name === scorerName && s5.value >= 0)).map((s5) => s5.value);
|
|
200
|
-
const avg = scores.length > 0 ? scores.reduce((a7, b3) => a7 + b3, 0) / scores.length : void 0;
|
|
201
|
-
return { id, avg };
|
|
202
|
-
}).filter((p5) => p5.avg !== void 0).sort((a7, b3) => b3.avg - a7.avg);
|
|
203
|
-
return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
|
|
204
|
-
}
|
|
205
|
-
function averageScores(results) {
|
|
206
|
-
const sums = {};
|
|
207
|
-
const counts = {};
|
|
208
|
-
for (const result of results) {
|
|
209
|
-
for (const score of result.scores) {
|
|
210
|
-
if (score.value < 0) continue;
|
|
211
|
-
sums[score.name] = (sums[score.name] ?? 0) + score.value;
|
|
212
|
-
counts[score.name] = (counts[score.name] ?? 0) + 1;
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
const avgs = {};
|
|
216
|
-
for (const name of Object.keys(sums)) {
|
|
217
|
-
avgs[name] = sums[name] / counts[name];
|
|
218
|
-
}
|
|
219
|
-
return avgs;
|
|
220
|
-
}
|
|
221
|
-
function averageDetails(results) {
|
|
222
|
-
let costSum = 0;
|
|
223
|
-
let costCount = 0;
|
|
224
|
-
let tokenSum = 0;
|
|
225
|
-
let tokenCount = 0;
|
|
226
|
-
for (const result of results) {
|
|
227
|
-
const costScore = result.scores.find((s5) => s5.name === "cost");
|
|
228
|
-
const details = costScore?.details;
|
|
229
|
-
if (details?.estimatedUsd != null) {
|
|
230
|
-
costSum += details.estimatedUsd;
|
|
231
|
-
costCount++;
|
|
232
|
-
}
|
|
233
|
-
if (details?.totalTokens != null) {
|
|
234
|
-
tokenSum += details.totalTokens;
|
|
235
|
-
tokenCount++;
|
|
236
|
-
}
|
|
237
|
-
}
|
|
238
|
-
return {
|
|
239
|
-
costUsd: costCount > 0 ? costSum / costCount : void 0,
|
|
240
|
-
totalTokens: tokenCount > 0 ? Math.round(tokenSum / tokenCount) : void 0
|
|
241
|
-
};
|
|
242
|
-
}
|
|
243
|
-
function average(nums) {
|
|
244
|
-
if (nums.length === 0) return void 0;
|
|
245
|
-
return nums.reduce((a7, b3) => a7 + b3, 0) / nums.length;
|
|
246
|
-
}
|
|
247
|
-
function formatCost(usd) {
|
|
248
|
-
if (usd === void 0) return "\u2014";
|
|
249
|
-
if (usd === 0) return "$0.00";
|
|
250
|
-
if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
|
|
251
|
-
const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
|
|
252
|
-
return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
|
|
253
|
-
}
|
|
254
|
-
function pad(str, width, align) {
|
|
255
|
-
if (align === "right") return str.padStart(width);
|
|
256
|
-
return str.padEnd(width);
|
|
257
|
-
}
|
|
258
|
-
function colorLen(str) {
|
|
259
|
-
const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
|
|
260
|
-
return str.length - stripped.length;
|
|
261
|
-
}
|
|
262
|
-
function apiKeyHint(providerId, error) {
|
|
263
|
-
const lower = error.toLowerCase();
|
|
264
|
-
const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
|
|
265
|
-
if (!isAuthError) return void 0;
|
|
266
|
-
const prefix = providerId.split("/")[0];
|
|
267
|
-
switch (prefix) {
|
|
268
|
-
case "openai":
|
|
269
|
-
return "Set: export OPENAI_API_KEY=sk-...";
|
|
270
|
-
case "azure":
|
|
271
|
-
return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
|
|
272
|
-
case "anthropic":
|
|
273
|
-
return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
|
|
274
|
-
case "google":
|
|
275
|
-
return "Set: export GOOGLE_API_KEY=...";
|
|
276
|
-
default:
|
|
277
|
-
return `Check the API key for ${providerId}`;
|
|
278
|
-
}
|
|
279
|
-
}
|
|
280
|
-
function providerLabel(providerId) {
|
|
281
|
-
const prefix = providerId.split("/")[0];
|
|
282
|
-
switch (prefix) {
|
|
283
|
-
case "azure":
|
|
284
|
-
return "(OpenAI via Azure)";
|
|
285
|
-
case "openai":
|
|
286
|
-
return "(OpenAI)";
|
|
287
|
-
case "anthropic":
|
|
288
|
-
return "(Anthropic)";
|
|
289
|
-
case "google":
|
|
290
|
-
return "(Google)";
|
|
291
|
-
case "mistral":
|
|
292
|
-
return "(Mistral)";
|
|
293
|
-
case "meta":
|
|
294
|
-
return "(Meta)";
|
|
295
|
-
case "deepseek":
|
|
296
|
-
return "(DeepSeek)";
|
|
297
|
-
case "cohere":
|
|
298
|
-
return "(Cohere)";
|
|
299
|
-
case "qwen":
|
|
300
|
-
return "(Qwen)";
|
|
301
|
-
case "xai":
|
|
302
|
-
return "(xAI)";
|
|
303
|
-
case "minimax":
|
|
304
|
-
return "(MiniMax)";
|
|
305
|
-
case "moonshot":
|
|
306
|
-
return "(Moonshot / Kimi)";
|
|
307
|
-
case "perplexity":
|
|
308
|
-
return "(Perplexity)";
|
|
309
|
-
case "amazon":
|
|
310
|
-
return "(Amazon)";
|
|
311
|
-
case "nvidia":
|
|
312
|
-
return "(NVIDIA)";
|
|
313
|
-
case "microsoft":
|
|
314
|
-
return "(Microsoft)";
|
|
315
|
-
case "ai21":
|
|
316
|
-
return "(AI21 Labs)";
|
|
317
|
-
case "bytedance":
|
|
318
|
-
return "(ByteDance)";
|
|
319
|
-
case "together":
|
|
320
|
-
return "(Together AI)";
|
|
321
|
-
case "fireworks":
|
|
322
|
-
return "(Fireworks AI)";
|
|
323
|
-
case "groq":
|
|
324
|
-
return "(Groq)";
|
|
325
|
-
case "cerebras":
|
|
326
|
-
return "(Cerebras)";
|
|
327
|
-
default:
|
|
328
|
-
return `(${prefix})`;
|
|
329
|
-
}
|
|
330
|
-
}
|
|
331
|
-
var reset, boldCode, dimCode, green, red, yellow, cyan;
|
|
332
|
-
var init_console = __esm({
|
|
333
|
-
"src/reporter/console.ts"() {
|
|
334
|
-
"use strict";
|
|
335
|
-
reset = "\x1B[0m";
|
|
336
|
-
boldCode = "\x1B[1m";
|
|
337
|
-
dimCode = "\x1B[2m";
|
|
338
|
-
green = "\x1B[32m";
|
|
339
|
-
red = "\x1B[31m";
|
|
340
|
-
yellow = "\x1B[33m";
|
|
341
|
-
cyan = "\x1B[36m";
|
|
342
|
-
}
|
|
343
|
-
});
|
|
344
|
-
|
|
345
|
-
// src/reporter/json.ts
|
|
346
|
-
var json_exports = {};
|
|
347
|
-
__export(json_exports, {
|
|
348
|
-
jsonReporter: () => jsonReporter
|
|
349
|
-
});
|
|
350
|
-
function jsonReporter(results) {
|
|
351
|
-
return JSON.stringify(
|
|
352
|
-
{
|
|
353
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
354
|
-
summary: buildSummary(results),
|
|
355
|
-
results
|
|
356
|
-
},
|
|
357
|
-
null,
|
|
358
|
-
2
|
|
359
|
-
);
|
|
360
|
-
}
|
|
361
|
-
function buildSummary(results) {
|
|
362
|
-
const tasks = [...new Set(results.map((r3) => r3.taskName))];
|
|
363
|
-
const providers = [...new Set(results.map((r3) => r3.providerId))];
|
|
364
|
-
return {
|
|
365
|
-
totalBenchmarks: results.length,
|
|
366
|
-
tasks: tasks.length,
|
|
367
|
-
providers: providers.length,
|
|
368
|
-
providerIds: providers,
|
|
369
|
-
taskNames: tasks
|
|
370
|
-
};
|
|
371
|
-
}
|
|
372
|
-
var init_json = __esm({
|
|
373
|
-
"src/reporter/json.ts"() {
|
|
374
|
-
"use strict";
|
|
375
|
-
}
|
|
376
|
-
});
|
|
377
|
-
|
|
378
41
|
// node_modules/tsx/dist/temporary-directory-CwHp0_NW.mjs
|
|
379
42
|
import r from "path";
|
|
380
43
|
import o from "os";
|
|
@@ -6530,44 +6193,1795 @@ var init_api = __esm({
|
|
|
6530
6193
|
// src/cli.ts
|
|
6531
6194
|
import "dotenv/config";
|
|
6532
6195
|
import { Command } from "commander";
|
|
6533
|
-
import { readFileSync, writeFileSync, existsSync } from "fs";
|
|
6534
|
-
import { resolve, join, dirname } from "path";
|
|
6196
|
+
import { readFileSync as readFileSync3, writeFileSync as writeFileSync2, mkdirSync as mkdirSync2, existsSync } from "fs";
|
|
6197
|
+
import { resolve, join, dirname as dirname2 } from "path";
|
|
6535
6198
|
import { pathToFileURL, fileURLToPath } from "url";
|
|
6536
|
-
|
|
6537
|
-
|
|
6538
|
-
|
|
6539
|
-
|
|
6540
|
-
|
|
6541
|
-
if (
|
|
6542
|
-
|
|
6543
|
-
|
|
6199
|
+
|
|
6200
|
+
// src/utils/format.ts
|
|
6201
|
+
var MAX_FRACTION_DIGITS = 100;
|
|
6202
|
+
function formatCost(usd) {
|
|
6203
|
+
if (usd === void 0) return "\u2014";
|
|
6204
|
+
if (usd === 0) return "$0.00";
|
|
6205
|
+
if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
|
|
6206
|
+
const digits = Math.min(
|
|
6207
|
+
MAX_FRACTION_DIGITS,
|
|
6208
|
+
Math.max(4, -Math.floor(Math.log10(Math.abs(usd))) + 1)
|
|
6209
|
+
);
|
|
6210
|
+
return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
|
|
6211
|
+
}
|
|
6212
|
+
function formatDelta(delta, precision = 4) {
|
|
6213
|
+
const sign = delta >= 0 ? "+" : "";
|
|
6214
|
+
return `${sign}${delta.toFixed(precision)}`;
|
|
6215
|
+
}
|
|
6216
|
+
|
|
6217
|
+
// src/reporter/shared.ts
|
|
6218
|
+
function groupResults(results) {
|
|
6219
|
+
const taskSet = /* @__PURE__ */ new Set();
|
|
6220
|
+
const providerSet = /* @__PURE__ */ new Set();
|
|
6221
|
+
const scorerSet = /* @__PURE__ */ new Set();
|
|
6222
|
+
const grouped = /* @__PURE__ */ new Map();
|
|
6223
|
+
const byProvider = /* @__PURE__ */ new Map();
|
|
6224
|
+
let hasErrors = false;
|
|
6225
|
+
let maxRun = 0;
|
|
6226
|
+
for (const r3 of results) {
|
|
6227
|
+
taskSet.add(r3.taskName);
|
|
6228
|
+
providerSet.add(r3.providerId);
|
|
6229
|
+
for (const s5 of r3.scores) scorerSet.add(s5.name);
|
|
6230
|
+
if (r3.error) hasErrors = true;
|
|
6231
|
+
if (r3.run > maxRun) maxRun = r3.run;
|
|
6232
|
+
const key = `${r3.taskName}::${r3.providerId}`;
|
|
6233
|
+
let group = grouped.get(key);
|
|
6234
|
+
if (!group) {
|
|
6235
|
+
group = [];
|
|
6236
|
+
grouped.set(key, group);
|
|
6237
|
+
}
|
|
6238
|
+
group.push(r3);
|
|
6239
|
+
let provGroup = byProvider.get(r3.providerId);
|
|
6240
|
+
if (!provGroup) {
|
|
6241
|
+
provGroup = [];
|
|
6242
|
+
byProvider.set(r3.providerId, provGroup);
|
|
6243
|
+
}
|
|
6244
|
+
provGroup.push(r3);
|
|
6245
|
+
}
|
|
6246
|
+
return {
|
|
6247
|
+
tasks: [...taskSet],
|
|
6248
|
+
providers: [...providerSet],
|
|
6249
|
+
scorerNames: [...scorerSet],
|
|
6250
|
+
grouped,
|
|
6251
|
+
byProvider,
|
|
6252
|
+
hasErrors,
|
|
6253
|
+
maxRun
|
|
6254
|
+
};
|
|
6255
|
+
}
|
|
6256
|
+
function aggregateProviderTask(providerId, grouped, task) {
|
|
6257
|
+
const taskResults = grouped.get(`${task}::${providerId}`) ?? [];
|
|
6258
|
+
const errorResults = taskResults.filter((r3) => r3.error);
|
|
6259
|
+
const successResults = taskResults.filter((r3) => !r3.error);
|
|
6260
|
+
if (successResults.length === 0) {
|
|
6261
|
+
return {
|
|
6262
|
+
providerId,
|
|
6263
|
+
avgScores: {},
|
|
6264
|
+
avgDetails: { costUsd: void 0, totalTokens: void 0 },
|
|
6265
|
+
latencyMs: void 0,
|
|
6266
|
+
allErrors: errorResults.length > 0,
|
|
6267
|
+
errorCount: errorResults.length
|
|
6268
|
+
};
|
|
6269
|
+
}
|
|
6270
|
+
return {
|
|
6271
|
+
providerId,
|
|
6272
|
+
avgScores: averageScores(successResults),
|
|
6273
|
+
avgDetails: averageDetails(successResults),
|
|
6274
|
+
latencyMs: average(successResults.map((r3) => r3.raw.latencyMs)),
|
|
6275
|
+
allErrors: false,
|
|
6276
|
+
errorCount: errorResults.length
|
|
6277
|
+
};
|
|
6278
|
+
}
|
|
6279
|
+
function averageScores(results) {
|
|
6280
|
+
const sums = {};
|
|
6281
|
+
const counts = {};
|
|
6282
|
+
for (const result of results) {
|
|
6283
|
+
for (const score of result.scores) {
|
|
6284
|
+
if (score.value < 0) continue;
|
|
6285
|
+
sums[score.name] = (sums[score.name] ?? 0) + score.value;
|
|
6286
|
+
counts[score.name] = (counts[score.name] ?? 0) + 1;
|
|
6287
|
+
}
|
|
6288
|
+
}
|
|
6289
|
+
const avgs = {};
|
|
6290
|
+
for (const name of Object.keys(sums)) {
|
|
6291
|
+
avgs[name] = sums[name] / counts[name];
|
|
6292
|
+
}
|
|
6293
|
+
return avgs;
|
|
6294
|
+
}
|
|
6295
|
+
function averageDetails(results) {
|
|
6296
|
+
let costSum = 0;
|
|
6297
|
+
let costCount = 0;
|
|
6298
|
+
let tokenSum = 0;
|
|
6299
|
+
let tokenCount = 0;
|
|
6300
|
+
for (const result of results) {
|
|
6301
|
+
const costScore = result.scores.find((s5) => s5.name === "cost");
|
|
6302
|
+
const details = costScore?.details;
|
|
6303
|
+
if (details?.estimatedUsd != null) {
|
|
6304
|
+
costSum += details.estimatedUsd;
|
|
6305
|
+
costCount++;
|
|
6306
|
+
}
|
|
6307
|
+
if (details?.totalTokens != null) {
|
|
6308
|
+
tokenSum += details.totalTokens;
|
|
6309
|
+
tokenCount++;
|
|
6310
|
+
}
|
|
6311
|
+
}
|
|
6312
|
+
return {
|
|
6313
|
+
costUsd: costCount > 0 ? costSum / costCount : void 0,
|
|
6314
|
+
totalTokens: tokenCount > 0 ? Math.round(tokenSum / tokenCount) : void 0
|
|
6315
|
+
};
|
|
6316
|
+
}
|
|
6317
|
+
function average(nums) {
|
|
6318
|
+
if (nums.length === 0) return void 0;
|
|
6319
|
+
return nums.reduce((a7, b3) => a7 + b3, 0) / nums.length;
|
|
6320
|
+
}
|
|
6321
|
+
function computeColumnStats(providerData, scorerNames) {
|
|
6322
|
+
const stats = /* @__PURE__ */ new Map();
|
|
6323
|
+
const valid = providerData.filter((p5) => !p5.allErrors);
|
|
6324
|
+
if (scorerNames.includes("latency")) {
|
|
6325
|
+
const values = /* @__PURE__ */ new Map();
|
|
6326
|
+
for (const p5 of providerData) {
|
|
6327
|
+
values.set(p5.providerId, p5.allErrors ? void 0 : p5.latencyMs);
|
|
6328
|
+
}
|
|
6329
|
+
const nums = valid.map((p5) => p5.latencyMs).filter((v4) => v4 !== void 0);
|
|
6330
|
+
stats.set("latency", {
|
|
6331
|
+
values,
|
|
6332
|
+
best: nums.length > 0 ? Math.min(...nums) : void 0,
|
|
6333
|
+
worst: nums.length > 0 ? Math.max(...nums) : void 0
|
|
6334
|
+
});
|
|
6335
|
+
}
|
|
6336
|
+
if (scorerNames.includes("cost")) {
|
|
6337
|
+
const costValues = /* @__PURE__ */ new Map();
|
|
6338
|
+
const tokenValues = /* @__PURE__ */ new Map();
|
|
6339
|
+
for (const p5 of providerData) {
|
|
6340
|
+
costValues.set(p5.providerId, p5.allErrors ? void 0 : p5.avgDetails.costUsd);
|
|
6341
|
+
tokenValues.set(p5.providerId, p5.allErrors ? void 0 : p5.avgDetails.totalTokens);
|
|
6342
|
+
}
|
|
6343
|
+
const costNums = valid.map((p5) => p5.avgDetails.costUsd).filter((v4) => v4 !== void 0);
|
|
6344
|
+
const tokenNums = valid.map((p5) => p5.avgDetails.totalTokens).filter((v4) => v4 !== void 0);
|
|
6345
|
+
stats.set("cost", {
|
|
6346
|
+
values: costValues,
|
|
6347
|
+
best: costNums.length > 0 ? Math.min(...costNums) : void 0,
|
|
6348
|
+
worst: costNums.length > 0 ? Math.max(...costNums) : void 0
|
|
6349
|
+
});
|
|
6350
|
+
stats.set("tokens", {
|
|
6351
|
+
values: tokenValues,
|
|
6352
|
+
best: tokenNums.length > 0 ? Math.min(...tokenNums) : void 0,
|
|
6353
|
+
worst: tokenNums.length > 0 ? Math.max(...tokenNums) : void 0
|
|
6354
|
+
});
|
|
6355
|
+
}
|
|
6356
|
+
for (const name of scorerNames) {
|
|
6357
|
+
if (name === "latency" || name === "cost") continue;
|
|
6358
|
+
const values = /* @__PURE__ */ new Map();
|
|
6359
|
+
for (const p5 of providerData) {
|
|
6360
|
+
values.set(p5.providerId, p5.allErrors ? void 0 : p5.avgScores[name]);
|
|
6361
|
+
}
|
|
6362
|
+
const nums = valid.map((p5) => p5.avgScores[name]).filter((v4) => v4 !== void 0);
|
|
6363
|
+
stats.set(name, {
|
|
6364
|
+
values,
|
|
6365
|
+
best: nums.length > 0 ? Math.max(...nums) : void 0,
|
|
6366
|
+
worst: nums.length > 0 ? Math.min(...nums) : void 0
|
|
6367
|
+
});
|
|
6368
|
+
}
|
|
6369
|
+
return stats;
|
|
6370
|
+
}
|
|
6371
|
+
function computeMedals(columnStats, providerIds) {
|
|
6372
|
+
const medals = /* @__PURE__ */ new Map();
|
|
6373
|
+
if (providerIds.length < 2) {
|
|
6374
|
+
for (const id of providerIds) medals.set(id, "none");
|
|
6375
|
+
return medals;
|
|
6376
|
+
}
|
|
6377
|
+
const wins = /* @__PURE__ */ new Map();
|
|
6378
|
+
for (const id of providerIds) wins.set(id, 0);
|
|
6379
|
+
for (const [, colStats] of columnStats) {
|
|
6380
|
+
if (colStats.best === void 0) continue;
|
|
6381
|
+
const bestProviders = [...colStats.values.entries()].filter(([, v4]) => v4 !== void 0 && v4 === colStats.best);
|
|
6382
|
+
if (bestProviders.length === 1) {
|
|
6383
|
+
wins.set(bestProviders[0][0], (wins.get(bestProviders[0][0]) ?? 0) + 1);
|
|
6384
|
+
}
|
|
6385
|
+
}
|
|
6386
|
+
const totalWins = [...wins.values()].reduce((a7, b3) => a7 + b3, 0);
|
|
6387
|
+
if (totalWins === 0) {
|
|
6388
|
+
for (const id of providerIds) medals.set(id, "none");
|
|
6389
|
+
return medals;
|
|
6390
|
+
}
|
|
6391
|
+
const sorted = [...wins.entries()].sort(
|
|
6392
|
+
(a7, b3) => b3[1] - a7[1] || a7[0].localeCompare(b3[0])
|
|
6393
|
+
);
|
|
6394
|
+
const medalList = ["gold", "silver", "bronze"];
|
|
6395
|
+
let rank = 0;
|
|
6396
|
+
for (let i7 = 0; i7 < sorted.length; i7++) {
|
|
6397
|
+
if (i7 > 0 && sorted[i7][1] < sorted[i7 - 1][1]) {
|
|
6398
|
+
rank = i7;
|
|
6399
|
+
}
|
|
6400
|
+
const hasWins = sorted[i7][1] > 0;
|
|
6401
|
+
medals.set(sorted[i7][0], hasWins && rank < medalList.length ? medalList[rank] : "none");
|
|
6402
|
+
}
|
|
6403
|
+
return medals;
|
|
6404
|
+
}
|
|
6405
|
+
function providerLabel(providerId) {
|
|
6406
|
+
const prefix = providerId.split("/")[0];
|
|
6407
|
+
switch (prefix) {
|
|
6408
|
+
case "azure":
|
|
6409
|
+
return "(OpenAI via Azure)";
|
|
6410
|
+
case "openai":
|
|
6411
|
+
return "(OpenAI)";
|
|
6412
|
+
case "anthropic":
|
|
6413
|
+
return "(Anthropic)";
|
|
6414
|
+
case "google":
|
|
6415
|
+
return "(Google)";
|
|
6416
|
+
case "mistral":
|
|
6417
|
+
return "(Mistral)";
|
|
6418
|
+
case "meta":
|
|
6419
|
+
return "(Meta)";
|
|
6420
|
+
case "deepseek":
|
|
6421
|
+
return "(DeepSeek)";
|
|
6422
|
+
case "cohere":
|
|
6423
|
+
return "(Cohere)";
|
|
6424
|
+
case "qwen":
|
|
6425
|
+
return "(Qwen)";
|
|
6426
|
+
case "xai":
|
|
6427
|
+
return "(xAI)";
|
|
6428
|
+
case "minimax":
|
|
6429
|
+
return "(MiniMax)";
|
|
6430
|
+
case "moonshot":
|
|
6431
|
+
return "(Moonshot / Kimi)";
|
|
6432
|
+
case "perplexity":
|
|
6433
|
+
return "(Perplexity)";
|
|
6434
|
+
case "amazon":
|
|
6435
|
+
return "(Amazon)";
|
|
6436
|
+
case "nvidia":
|
|
6437
|
+
return "(NVIDIA)";
|
|
6438
|
+
case "microsoft":
|
|
6439
|
+
return "(Microsoft)";
|
|
6440
|
+
case "ai21":
|
|
6441
|
+
return "(AI21 Labs)";
|
|
6442
|
+
case "bytedance":
|
|
6443
|
+
return "(ByteDance)";
|
|
6444
|
+
case "together":
|
|
6445
|
+
return "(Together AI)";
|
|
6446
|
+
case "fireworks":
|
|
6447
|
+
return "(Fireworks AI)";
|
|
6448
|
+
case "groq":
|
|
6449
|
+
return "(Groq)";
|
|
6450
|
+
case "cerebras":
|
|
6451
|
+
return "(Cerebras)";
|
|
6452
|
+
default:
|
|
6453
|
+
return `(${prefix})`;
|
|
6454
|
+
}
|
|
6455
|
+
}
|
|
6456
|
+
function apiKeyHint(providerId, error) {
|
|
6457
|
+
const lower = error.toLowerCase();
|
|
6458
|
+
const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
|
|
6459
|
+
if (!isAuthError) return void 0;
|
|
6460
|
+
const prefix = providerId.split("/")[0];
|
|
6461
|
+
switch (prefix) {
|
|
6462
|
+
case "openai":
|
|
6463
|
+
return "Set: export OPENAI_API_KEY=sk-...";
|
|
6464
|
+
case "azure":
|
|
6465
|
+
return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
|
|
6466
|
+
case "anthropic":
|
|
6467
|
+
return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
|
|
6468
|
+
case "google":
|
|
6469
|
+
return "Set: export GOOGLE_API_KEY=...";
|
|
6470
|
+
default:
|
|
6471
|
+
return `Check the API key for ${providerId}`;
|
|
6472
|
+
}
|
|
6473
|
+
}
|
|
6474
|
+
function rankProviders(successByProvider, providers, scorerName) {
|
|
6475
|
+
const ranked = providers.map((id) => {
|
|
6476
|
+
const runs = successByProvider.get(id) ?? [];
|
|
6477
|
+
const scores = runs.flatMap((r3) => r3.scores.filter((s5) => s5.name === scorerName && s5.value >= 0)).map((s5) => s5.value);
|
|
6478
|
+
const avg = scores.length > 0 ? scores.reduce((a7, b3) => a7 + b3, 0) / scores.length : void 0;
|
|
6479
|
+
return { id, avg };
|
|
6480
|
+
}).filter((p5) => p5.avg !== void 0).sort((a7, b3) => b3.avg - a7.avg);
|
|
6481
|
+
return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
|
|
6482
|
+
}
|
|
6483
|
+
function scorerLabel(name) {
|
|
6484
|
+
switch (name) {
|
|
6485
|
+
case "correctness":
|
|
6486
|
+
return "Match";
|
|
6487
|
+
case "schema-correctness":
|
|
6488
|
+
return "Schema";
|
|
6489
|
+
case "fuzzy-similarity":
|
|
6490
|
+
return "Fuzzy";
|
|
6491
|
+
case "llm-judge-correctness":
|
|
6492
|
+
return "Judge";
|
|
6493
|
+
case "tool-usage":
|
|
6494
|
+
return "Tool";
|
|
6495
|
+
default:
|
|
6496
|
+
return name;
|
|
6497
|
+
}
|
|
6498
|
+
}
|
|
6499
|
+
function medalEmoji(medal) {
|
|
6500
|
+
switch (medal) {
|
|
6501
|
+
case "gold":
|
|
6502
|
+
return "\u{1F947}";
|
|
6503
|
+
case "silver":
|
|
6504
|
+
return "\u{1F948}";
|
|
6505
|
+
case "bronze":
|
|
6506
|
+
return "\u{1F949}";
|
|
6507
|
+
case "none":
|
|
6508
|
+
return "";
|
|
6509
|
+
}
|
|
6510
|
+
}
|
|
6511
|
+
|
|
6512
|
+
// src/reporter/console.ts
|
|
6513
|
+
var reset = "\x1B[0m";
|
|
6514
|
+
var boldCode = "\x1B[1m";
|
|
6515
|
+
var dimCode = "\x1B[2m";
|
|
6516
|
+
var green = "\x1B[32m";
|
|
6517
|
+
var red = "\x1B[31m";
|
|
6518
|
+
var yellow = "\x1B[33m";
|
|
6519
|
+
var cyan = "\x1B[36m";
|
|
6520
|
+
var brightGreen = "\x1B[92m";
|
|
6521
|
+
var brightWhite = "\x1B[97m";
|
|
6522
|
+
function bold(s5) {
|
|
6523
|
+
return `${boldCode}${s5}${reset}`;
|
|
6524
|
+
}
|
|
6525
|
+
function dim(s5) {
|
|
6526
|
+
return `${dimCode}${s5}${reset}`;
|
|
6527
|
+
}
|
|
6528
|
+
function stripAnsi(s5) {
|
|
6529
|
+
return s5.replace(/\x1b\[[0-9;]*m/g, "");
|
|
6530
|
+
}
|
|
6531
|
+
function displayWidth(s5) {
|
|
6532
|
+
const stripped = stripAnsi(s5);
|
|
6533
|
+
let width = 0;
|
|
6534
|
+
for (const ch of stripped) {
|
|
6535
|
+
const code = ch.codePointAt(0) ?? 0;
|
|
6536
|
+
if (code >= 126976) width += 2;
|
|
6537
|
+
else if (code >= 9728 && code <= 10175) width += 2;
|
|
6538
|
+
else width += 1;
|
|
6539
|
+
}
|
|
6540
|
+
return width;
|
|
6541
|
+
}
|
|
6542
|
+
function padCell(str, targetWidth, align) {
|
|
6543
|
+
const dw = displayWidth(str);
|
|
6544
|
+
const padding = Math.max(0, targetWidth - dw);
|
|
6545
|
+
if (align === "right") return " ".repeat(padding) + str;
|
|
6546
|
+
return str + " ".repeat(padding);
|
|
6547
|
+
}
|
|
6548
|
+
function sparkBar(ratio, width = 8) {
|
|
6549
|
+
const clamped = Math.max(0, Math.min(1, ratio));
|
|
6550
|
+
const fillLen = Math.round(clamped * width);
|
|
6551
|
+
const fill = "\u2593".repeat(fillLen);
|
|
6552
|
+
const track = "\u2591".repeat(width - fillLen);
|
|
6553
|
+
return { fill, track };
|
|
6554
|
+
}
|
|
6555
|
+
/**
 * Draws a horizontal table border using box-drawing characters, dimmed.
 *
 * @param {number[]} widths - Content width of each column (each cell adds one space of padding per side).
 * @param {string} position - "top" (column tees pointing down), "bottom" or "merge"
 *   (a single unbroken span across all columns); any other value draws an
 *   inner separator with crossings between columns.
 * @returns {string} The styled border line.
 */
function drawTableLine(widths, position) {
  const horizontal = "\u2500";
  if (position === "bottom" || position === "merge") {
    // One unbroken run: every padded column plus the separators between them.
    let span = 0;
    for (const w4 of widths) span = span + w4 + 2;
    span = span + widths.length - 1;
    const run = horizontal.repeat(span);
    return position === "bottom" ? dim(`\u2514${run}\u2518`) : dim(`\u251C${run}\u2524`);
  }
  const segments = widths.map((w4) => horizontal.repeat(w4 + 2));
  return position === "top"
    ? dim(`\u250C${segments.join("\u252C")}\u2510`)
    : dim(`\u251C${segments.join("\u253C")}\u2524`);
}
|
|
6569
|
+
/**
 * Draws one table row: cells padded to their column width, separated and
 * enclosed by dimmed vertical bars.
 *
 * @param {string[]} cells - Cell contents, one per column.
 * @param {number[]} widths - Column widths, indexed like `cells`.
 * @param {string[]} aligns - Column alignments passed through to `padCell`.
 * @returns {string} The rendered row.
 */
function drawTableRow(cells, widths, aligns) {
  const bar = dim("\u2502");
  const body = cells
    .map((cell, col) => " " + padCell(cell, widths[col], aligns[col]) + " ")
    .join(bar);
  return bar + body + bar;
}
|
|
6575
|
+
/**
 * Draws a row whose single cell spans every column of the table.
 *
 * @param {string} content - Text to show (may contain ANSI styling).
 * @param {number[]} widths - Column widths of the table the row belongs to.
 * @returns {string} The row: a dimmed bar, one space, the content, padding up to
 *   the table's inner width, and a closing dimmed bar.
 */
function drawSpanRow(content, widths) {
  let innerWidth = 0;
  for (const w4 of widths) innerWidth = innerWidth + w4 + 2;
  innerWidth = innerWidth + widths.length - 1;
  // One column is already used by the leading space.
  const remaining = innerWidth - displayWidth(content) - 1;
  const filler = " ".repeat(Math.max(0, remaining));
  const edge = dim("\u2502");
  return edge + " " + content + filler + edge;
}
|
|
6581
|
+
/**
 * Colours a cell's text according to how its value ranks within its column.
 *
 * @param {string} text - Already formatted cell text.
 * @param {number|undefined} value - Raw value behind the text.
 * @param {{best?: number, worst?: number}} colStats - Best and worst values of the column.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} A dimmed em dash when there is no value; the plain text when
 *   ranking is meaningless (fewer than two providers, missing stats, or
 *   best equal to worst); otherwise bright green bold for the best value, red
 *   for the worst, and yellow for anything in between.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === undefined) return dim("\u2014");
  const unranked =
    providerCount < 2 ||
    colStats.best === undefined ||
    colStats.worst === undefined ||
    colStats.best === colStats.worst;
  if (unranked) return text;
  switch (value) {
    case colStats.best:
      return `${brightGreen}${boldCode}${text}${reset}`;
    case colStats.worst:
      return `${red}${text}${reset}`;
    default:
      return `${yellow}${text}${reset}`;
  }
}
|
|
6590
|
+
/**
 * Prints benchmark results to stdout: a banner, one table per task, the
 * overall summary, a de-duplicated error list and (when a cost scorer was
 * used) a note about where cost estimates come from.
 *
 * @param {Array<object>} results - Benchmark results; each entry is read for
 *   `providerId` and `error` here and is otherwise passed on to the grouping
 *   and summary helpers.
 * @param {{sparklines?: boolean}} [options] - `sparklines` (default true)
 *   toggles the inline bar next to percentage scores.
 * @returns {void}
 */
function consoleReporter(results, options) {
  const showSparklines = options?.sparklines ?? true;
  if (results.length === 0) {
    console.log("\nNo results to display.\n");
    return;
  }
  const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runLabel = maxRun > 1 ? ` ${dim(`(${maxRun} runs each)`)}` : "";

  // Rank-colours a formatted value when column statistics exist for it.
  const ranked = (text, value, colStats) =>
    colStats ? colorByRank(text, value, colStats, providers.length) : text;

  // Renders a percentage score, optionally followed by a spark bar.
  const scoreCell = (val, colStats) => {
    const pctStr = `${Math.round(val * 100)}%`.padStart(4);
    let coloredPct;
    if (multi && colStats) {
      coloredPct = colorByRank(pctStr, val, colStats, providers.length);
    } else if (val >= 0.8) {
      coloredPct = `${green}${pctStr}${reset}`;
    } else if (val >= 0.5) {
      coloredPct = `${yellow}${pctStr}${reset}`;
    } else {
      coloredPct = `${red}${pctStr}${reset}`;
    }
    if (!showSparklines) return coloredPct;
    const { fill, track } = sparkBar(val);
    const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
    return `${coloredPct} ${barColor}${fill}${reset}${dim(track)}`;
  };

  // Renders one metric cell of a provider row that has at least one successful run.
  const metricCell = (col, pd, columnStats) => {
    const statsKey = col.statsKey;
    const colStats = columnStats.get(statsKey);
    if (statsKey === "latency") {
      const ms = pd.latencyMs;
      return ms === void 0 ? dim("\u2014") : ranked(`${Math.round(ms)}ms`, ms, colStats);
    }
    if (statsKey === "cost") {
      const cost = pd.avgDetails.costUsd;
      return cost === void 0 ? dim("\u2014") : ranked(formatCost(cost), cost, colStats);
    }
    if (statsKey === "tokens") {
      const tokens = pd.avgDetails.totalTokens;
      return tokens === void 0 ? dim("\u2014") : ranked(`${tokens}`, tokens, colStats);
    }
    const val = pd.avgScores[statsKey];
    return val === void 0 ? dim("\u2014") : scoreCell(val, colStats);
  };

  console.log("");
  console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");

  for (const task of tasks) {
    console.log(` ${bold(`Task: ${task}`)}`);
    console.log("");
    const providerData = providers.map(
      (providerId) => aggregateProviderTask(providerId, grouped, task)
    );
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    const maxProviderLen = Math.max(...providers.map((id) => id.length));
    // Leave room for a medal prefix, but keep the column between 22 and 35 wide.
    const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
    const cols = [
      { label: "Provider", width: providerWidth, align: "left" }
    ];
    for (const name of scorerNames) {
      if (name === "latency") {
        cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
      } else if (name === "cost") {
        // The cost scorer contributes two columns: cost and token count.
        cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
        cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
      } else {
        cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
      }
    }
    if (hasErrors) {
      cols.push({ label: "Status", width: 8, align: "left" });
    }
    const widths = cols.map((c3) => c3.width);
    const aligns = cols.map((c3) => c3.align);
    console.log(` ${drawTableLine(widths, "top")}`);
    const headerCells = cols.map((c3) => bold(c3.label));
    console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
    console.log(` ${drawTableLine(widths, "header")}`);

    for (const pd of providerData) {
      const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
      const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
      const cells = [providerCell];
      for (const col of cols.slice(1)) {
        if (pd.allErrors) {
          // Every run failed: no metrics to show, only the failure status.
          cells.push(col.label === "Status" ? `${red}FAIL${reset}` : dim("\u2014"));
        } else if (col.label === "Status") {
          cells.push(
            pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
          );
        } else {
          cells.push(metricCell(col, pd, columnStats));
        }
      }
      console.log(` ${drawTableRow(cells, widths, aligns)}`);
    }

    if (multi && providerData.some((p5) => !p5.allErrors)) {
      const winnerId = [...medals.entries()].find(([, m8]) => m8 === "gold")?.[0];
      if (winnerId) {
        console.log(` ${drawTableLine(widths, "merge")}`);
        const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
        console.log(` ${drawSpanRow(winnerText, widths)}`);
      }
    }
    console.log(` ${drawTableLine(widths, "bottom")}`);
    console.log("");
  }

  printSummary(results, providers, byProvider);

  const errorResults = results.filter((r3) => r3.error);
  if (errorResults.length > 0) {
    console.log(` ${bold("Errors")}`);
    console.log(` ${dim("\u2501".repeat(72))}`);
    // Count each distinct (provider, error) pair in a single pass. The key is a
    // JSON-encoded tuple so a provider id or message containing "::" cannot
    // collide with a different pair. Map iteration keeps first-seen order.
    const uniqueErrors = /* @__PURE__ */ new Map();
    for (const r3 of errorResults) {
      const key = JSON.stringify([r3.providerId, r3.error]);
      const entry = uniqueErrors.get(key);
      if (entry) {
        entry.count += 1;
      } else {
        uniqueErrors.set(key, { providerId: r3.providerId, error: r3.error, count: 1 });
      }
    }
    for (const { providerId, error, count } of uniqueErrors.values()) {
      const suffix = count > 1 ? ` (\xD7${count})` : "";
      console.log(` ${red}\u2716${reset} ${providerId}: ${error}${suffix}`);
      const hint = apiKeyHint(providerId, error ?? "");
      if (hint) console.log(` ${dim(hint)}`);
    }
    console.log("");
  }

  if (hasCost) {
    console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
    console.log("");
  }
}
|
|
6744
|
+
/**
 * Prints the cross-task summary: best provider for correctness, latency and
 * cost, followed (when several providers compete) by an overall winner based
 * on how many of those three categories each provider won.
 *
 * Only successful results (no `error`) are considered; nothing is printed
 * when there are none.
 *
 * @param {Array<object>} results - All benchmark results.
 * @param {string[]} providers - Provider ids, in display order.
 * @param {Map<string, Array<object>>} byProvider - Results grouped by provider id.
 * @returns {void}
 */
function printSummary(results, providers, byProvider) {
  const succeeded = results.filter((r3) => !r3.error);
  if (succeeded.length === 0) return;

  const successByProvider = /* @__PURE__ */ new Map();
  for (const id of providers) {
    const own = byProvider.get(id) ?? [];
    successByProvider.set(id, own.filter((r3) => !r3.error));
  }

  console.log(` ${bold("Summary")}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");

  const single = providers.length === 1;
  // A lone provider gets a neutral marker instead of a gold medal.
  const marker = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";

  // Prints one category line in either the single-provider or the duel layout.
  const announce = (soloLabel, duelLabel, id, valueText) => {
    const highlighted = `${brightGreen}${boldCode}${valueText}${reset}`;
    if (single) {
      console.log(` ${marker} ${soloLabel}: ${highlighted}`);
    } else {
      console.log(` ${marker} ${duelLabel}: ${bold(id)} ${dim(providerLabel(id))} ${highlighted}`);
    }
  };

  // Prefer the LLM-judge correctness score whenever any run produced a valid one.
  const judged = succeeded.some(
    (r3) => r3.scores.some((s5) => s5.name === "llm-judge-correctness" && s5.value >= 0)
  );
  const correctnessKey = judged ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
  if (byCorrectness) {
    announce("Avg correctness", "Most correct", byCorrectness.id, `${Math.round(byCorrectness.avg * 100)}%`);
  }

  const latencyRanking = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const avg = average(runs.map((r3) => r3.raw.latencyMs));
    // Providers without a latency average sort last.
    return { id, avg: avg ?? Infinity };
  });
  latencyRanking.sort((a7, b3) => a7.avg - b3.avg);
  const byLatency = latencyRanking[0];
  const hasLatencyWinner = Boolean(byLatency) && byLatency.avg !== Infinity;
  if (hasLatencyWinner) {
    announce("Avg latency", "Fastest", byLatency.id, `${Math.round(byLatency.avg)}ms`);
  }

  const costRanking = providers
    .map((id) => {
      const runs = successByProvider.get(id) ?? [];
      const costs = [];
      for (const r3 of runs) {
        const costScore = r3.scores.find((s6) => s6.name === "cost");
        // Negative values are not counted as a cost.
        if (costScore && costScore.value >= 0) costs.push(costScore.value);
      }
      const avg = costs.length > 0 ? costs.reduce((a7, b3) => a7 + b3, 0) / costs.length : void 0;
      return { id, avg };
    })
    .filter((p5) => p5.avg !== void 0);
  costRanking.sort((a7, b3) => a7.avg - b3.avg);
  const byCost = costRanking[0];
  const hasCostWinner = byCost?.avg !== void 0;
  if (hasCostWinner) {
    announce("Avg cost", "Cheapest", byCost.id, formatCost(byCost.avg));
  }

  if (!single) {
    const wins = /* @__PURE__ */ new Map();
    for (const id of providers) wins.set(id, 0);
    const credit = (id) => wins.set(id, (wins.get(id) ?? 0) + 1);
    if (byCorrectness) credit(byCorrectness.id);
    if (hasLatencyWinner) credit(byLatency.id);
    if (hasCostWinner) credit(byCost.id);

    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const topProviders = [...wins.entries()].filter(([, w4]) => w4 === maxWins);
      console.log("");
      if (topProviders.length === 1) {
        const [winnerId, winCount] = topProviders[0];
        console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
      } else {
        const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
        console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
      }
    }
  }
  console.log("");
}
|
|
6819
|
+
|
|
6820
|
+
// src/reporter/json.ts
|
|
6821
|
+
/**
 * Serialises benchmark results as pretty-printed JSON (two-space indent).
 *
 * @param {Array<object>} results - Benchmark results to embed verbatim.
 * @returns {string} JSON with `timestamp` (ISO time of the call), `summary`
 *   (from `buildSummary`) and `results`.
 */
function jsonReporter(results) {
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const summary = buildSummary(results);
  const payload = { timestamp, summary, results };
  return JSON.stringify(payload, null, 2);
}
|
|
6832
|
+
/**
 * Summarises a result set by its distinct tasks and providers.
 *
 * @param {Array<{taskName: string, providerId: string}>} results - Benchmark results.
 * @returns {{totalBenchmarks: number, tasks: number, providers: number, providerIds: string[], taskNames: string[]}}
 *   Counts plus the distinct ids/names in first-seen order.
 */
function buildSummary(results) {
  const distinct = (pick) => Array.from(new Set(results.map(pick)));
  const taskNames = distinct((r3) => r3.taskName);
  const providerIds = distinct((r3) => r3.providerId);
  return {
    totalBenchmarks: results.length,
    tasks: taskNames.length,
    providers: providerIds.length,
    providerIds,
    taskNames
  };
}
|
|
6843
|
+
|
|
6844
|
+
// src/reporter/markdown.ts
|
|
6845
|
+
// Hidden HTML comment placed on the first line of every generated CI report.
var COMMENT_MARKER = "<!-- duelist-ci-report -->";

/**
 * Renders a CI report as Markdown.
 *
 * Sections appear in this order, each only when it has content: comparison
 * table, cost summary, flaky results, failure reasons; a footer always closes
 * the document.
 *
 * @param {object} report - CI report; reads `failed`, `comparisons`, `cost`
 *   (`totalUsd`, `budget`), `flakyResults` and `failureReasons`.
 * @param {*} _current - Unused.
 * @returns {string} The Markdown document, lines joined with "\n".
 */
function markdownReporter(report, _current) {
  const verdict = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${verdict}`, ""];

  if (report.comparisons.length > 0) {
    out.push(markdownComparisonTable(report.comparisons), "");
  }

  // Show costs when anything was spent or a budget was configured.
  if (report.cost.totalUsd > 0 || report.cost.budget !== void 0) {
    out.push(markdownCostSummary(report.cost), "");
  }

  if (report.flakyResults.length > 0) {
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      ""
    );
    report.flakyResults.forEach((f6) => {
      out.push(`- **${f6.providerId}** \xD7 ${f6.taskName} \u2192 ${f6.scorerName} (CV = ${f6.current.cv.toFixed(2)})`);
    });
    out.push("");
  }

  if (report.failureReasons.length > 0) {
    out.push("### Failure Reasons", "");
    report.failureReasons.forEach((reason) => {
      out.push(`- ${reason}`);
    });
    out.push("");
  }

  out.push("---", "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
  return out.join("\n");
}
|
|
6881
|
+
/**
 * Renders baseline-versus-current comparisons as a Markdown table.
 *
 * Provider, task and scorer names are free text, so they are escaped before
 * being placed in a cell: a literal "|" would otherwise end the cell and a
 * line break would end the row.
 *
 * @param {Array<object>} comparisons - One entry per provider/task/scorer;
 *   reads `providerId`, `taskName`, `scorerName`, `baseline` (falsy when there
 *   is none), `current` and `delta` (null when there is none).
 * @returns {string} Header, separator and one row per comparison, joined with "\n".
 */
function markdownComparisonTable(comparisons) {
  // Escape characters that would break out of a table cell.
  const cell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const lines = [];
  lines.push("| Provider | Task | Scorer | Baseline | Current | Delta | Status |");
  lines.push("|----------|------|--------|----------|---------|-------|--------|");
  for (const c3 of comparisons) {
    const baselineStr = c3.baseline ? formatStats(c3.baseline) : "\u2014";
    const currentStr = formatStats(c3.current);
    const deltaStr = c3.delta !== null ? formatDelta(c3.delta, 3) : "\u2014";
    const status = statusIndicator(c3);
    lines.push(`| ${cell(c3.providerId)} | ${cell(c3.taskName)} | ${cell(c3.scorerName)} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
  }
  return lines.join("\n");
}
|
|
6894
|
+
/**
 * Renders the cost section of the Markdown CI report.
 *
 * @param {object} cost - Reads `totalUsd`, optional `budget`, `overBudget`
 *   and `perProvider` (a Map of provider id to USD).
 * @returns {string} A heading and total; a budget line when a budget is set
 *   (usage shown as the infinity sign for a zero budget); and a per-provider
 *   table when more than one provider is listed.
 */
function markdownCostSummary(cost) {
  const out = ["### \u{1F4B0} Cost Summary", "", `**Total:** $${cost.totalUsd.toFixed(4)}`];

  if (cost.budget !== void 0) {
    // A zero (or negative) budget has no meaningful percentage.
    const used = cost.budget > 0 ? (cost.totalUsd / cost.budget * 100).toFixed(0) : "\u221E";
    const light = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    out.push(`**Budget:** $${cost.budget.toFixed(2)} (${used}% used) ${light}`);
  }

  if (cost.perProvider.size > 1) {
    out.push("", "| Provider | Cost |", "|----------|------|");
    cost.perProvider.forEach((usd, provider) => {
      out.push(`| ${provider} | $${usd.toFixed(4)} |`);
    });
  }

  return out.join("\n");
}
|
|
6914
|
+
/**
 * Formats a statistic as its mean to three decimals, followed by the
 * half-width of the 95% confidence interval when there is more than one sample.
 *
 * @param {{n: number, mean: number, ci95Lower?: number, ci95Upper?: number}} stats
 * @returns {string} "mean" or "mean ± margin".
 */
function formatStats(stats) {
  const hasInterval = stats.n > 1;
  if (!hasInterval) return stats.mean.toFixed(3);
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return [stats.mean.toFixed(3), halfWidth.toFixed(3)].join(" \xB1 ");
}
|
|
6921
|
+
/**
 * Picks the status label for a comparison row.
 *
 * Precedence: regressed, then improved, then new (baseline is null),
 * otherwise unchanged.
 *
 * @param {{regressed?: boolean, improved?: boolean, baseline: object|null}} c3
 * @returns {string} An emoji followed by the status word.
 */
function statusIndicator(c3) {
  return c3.regressed
    ? "\u{1F534} regressed"
    : c3.improved
      ? "\u{1F7E2} improved"
      : c3.baseline === null
        ? "\u{1F195} new"
        : "\u26AA unchanged";
}
|
|
6927
|
+
|
|
6928
|
+
// src/reporter/html.ts
|
|
6929
|
+
/**
 * Escapes a string for safe inclusion in HTML text and quoted attribute values.
 *
 * The five characters that are significant in HTML are replaced by character
 * references. "&" is handled in the same pass as the others, so references
 * produced by this function are never escaped a second time.
 *
 * @param {string} s5 - Untrusted text.
 * @returns {string} The text with & < > " ' replaced by entities.
 */
function esc(s5) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  return s5.replace(/[&<>"']/g, (ch) => entities[ch]);
}
|
|
6932
|
+
/**
 * Renders benchmark results as a self-contained HTML document.
 *
 * For every task it aggregates per-provider data, column statistics and
 * medals; across tasks it determines the most correct, fastest and cheapest
 * provider and, when several providers compete, a single overall winner
 * (left undefined on a tie). The pieces are then handed to the `render*`
 * helpers and assembled into the page.
 *
 * @param {Array<object>} results - Benchmark results.
 * @returns {string} The HTML document; the empty-state page when there are no results.
 */
function htmlReporter(results) {
  if (results.length === 0) {
    return emptyReport();
  }
  const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";
  const plural = (count) => (count !== 1 ? "s" : "");

  // Returns the provider id holding the gold medal, if any.
  const goldHolder = (medals) => {
    for (const [id, medal] of medals.entries()) {
      if (medal === "gold") return id;
    }
    return void 0;
  };

  const taskSections = tasks.map((task) => {
    const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    // A winner is only declared when there is someone to beat.
    const winnerId = multi ? goldHolder(medals) : void 0;
    return { task, providerData, columnStats, medals, winnerId };
  });

  const succeeded = results.filter((r3) => !r3.error);
  const successByProvider = /* @__PURE__ */ new Map();
  for (const id of providers) {
    const own = byProvider.get(id) ?? [];
    successByProvider.set(id, own.filter((r3) => !r3.error));
  }

  // Prefer the LLM-judge correctness score whenever any run produced a valid one.
  const judged = succeeded.some(
    (r3) => r3.scores.some((s5) => s5.name === "llm-judge-correctness" && s5.value >= 0)
  );
  const correctnessKey = judged ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);

  const latencyRanking = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const avg = average(runs.map((r3) => r3.raw.latencyMs));
    // Providers without a latency average sort last.
    return { id, avg: avg ?? Infinity };
  });
  latencyRanking.sort((a7, b3) => a7.avg - b3.avg);
  const byLatency = latencyRanking[0];

  const costRanking = providers
    .map((id) => {
      const runs = successByProvider.get(id) ?? [];
      const costs = [];
      for (const r3 of runs) {
        const costScore = r3.scores.find((s6) => s6.name === "cost");
        // Negative values are not counted as a cost.
        if (costScore && costScore.value >= 0) costs.push(costScore.value);
      }
      const avg = costs.length > 0 ? costs.reduce((a7, b3) => a7 + b3, 0) / costs.length : void 0;
      return { id, avg };
    })
    .filter((p5) => p5.avg !== void 0);
  costRanking.sort((a7, b3) => a7.avg - b3.avg);
  const byCost = costRanking[0];

  let overallWinner;
  if (multi) {
    const wins = /* @__PURE__ */ new Map();
    for (const id of providers) wins.set(id, 0);
    const credit = (id) => wins.set(id, (wins.get(id) ?? 0) + 1);
    if (byCorrectness) credit(byCorrectness.id);
    if (byLatency && byLatency.avg !== Infinity) credit(byLatency.id);
    if (byCost?.avg !== void 0) credit(byCost.id);
    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const tops = [...wins.entries()].filter(([, w4]) => w4 === maxWins);
      // Ties leave the overall winner undefined.
      if (tops.length === 1) overallWinner = tops[0][0];
    }
  }

  const deduped = dedupeErrors(results.filter((r3) => r3.error));

  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
<meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${plural(providers.length)}, ${tasks.length} task${plural(tasks.length)}">
<meta property="og:title" content="Agent Duelist Report">
<meta property="og:description" content="LLM provider benchmark: ${providers.map((id) => esc(id)).join(" vs ")}">
<meta property="og:type" content="website">
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">

${renderHeader(runsLabel, providers.length, tasks.length)}

${tasks.length > 1 ? renderTabs(tasks) : ""}

<main>
${taskSections.map((section, index) => renderTaskSection(
    section.task,
    section.providerData,
    section.columnStats,
    section.medals,
    section.winnerId,
    scorerNames,
    hasCost,
    multi,
    index
  )).join("\n")}
</main>

${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}

${deduped.length > 0 ? renderErrors(deduped) : ""}

${renderFooter()}

</div>
${renderScript(tasks.length)}
</body>
</html>`;
}
|
|
7030
|
+
/**
 * Builds the HTML page shown when there are no results: the regular styles,
 * header (reporting zero runs, providers and tasks) and footer around a single
 * "No results to display." message.
 *
 * @returns {string} The HTML document.
 */
function emptyReport() {
  const page = [
    "<!DOCTYPE html>",
    '<html lang="en">',
    "<head>",
    '<meta charset="UTF-8">',
    '<meta name="viewport" content="width=device-width, initial-scale=1">',
    "<title>Agent Duelist Report</title>",
    `${renderStyle()}`,
    "</head>",
    "<body>",
    '<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>',
    '<div class="report">',
    `${renderHeader("0 runs", 0, 0)}`,
    '<main><p class="empty-msg">No results to display.</p></main>',
    `${renderFooter()}`,
    "</div>",
    "</body>",
    "</html>"
  ];
  return page.join("\n");
}
|
|
7049
|
+
/**
 * Collapses repeated errors into one entry per provider/message pair.
 *
 * @param {Array<{providerId: string, error?: string}>} errorResults - Failed results.
 * @returns {Array<{providerId: string, error: string, count: number, hint: *}>}
 *   Entries in first-seen order; `count` is the number of occurrences and
 *   `hint` is whatever `apiKeyHint` returns for the first occurrence.
 */
function dedupeErrors(errorResults) {
  const buckets = /* @__PURE__ */ new Map();
  for (const r3 of errorResults) {
    const bucketKey = `${r3.providerId}::${r3.error}`;
    const bucket = buckets.get(bucketKey);
    if (bucket) {
      bucket.count++;
      continue;
    }
    buckets.set(bucketKey, {
      providerId: r3.providerId,
      error: r3.error ?? "Unknown error",
      count: 1,
      hint: apiKeyHint(r3.providerId, r3.error ?? "")
    });
  }
  return Array.from(buckets.values());
}
|
|
7067
|
+
function renderStyle() {
|
|
7068
|
+
return `<style>
|
|
7069
|
+
:root {
|
|
7070
|
+
--bg: #0f172a;
|
|
7071
|
+
--bg-deep: #020617;
|
|
7072
|
+
--panel: rgba(15, 23, 42, 0.85);
|
|
7073
|
+
--accent: #f59e0b;
|
|
7074
|
+
--accent-soft: rgba(245, 158, 11, 0.15);
|
|
7075
|
+
--text: #e2e8f0;
|
|
7076
|
+
--muted: #94a3b8;
|
|
7077
|
+
--border: rgba(148, 163, 184, 0.15);
|
|
7078
|
+
--green: #22c55e;
|
|
7079
|
+
--red: #ef4444;
|
|
7080
|
+
--yellow: #eab308;
|
|
7081
|
+
--radius: 12px;
|
|
7082
|
+
--mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
|
|
7083
|
+
--sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
|
7084
|
+
}
|
|
7085
|
+
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
|
7086
|
+
html, body {
|
|
7087
|
+
font-family: var(--sans);
|
|
7088
|
+
background: var(--bg);
|
|
7089
|
+
color: var(--text);
|
|
7090
|
+
min-height: 100vh;
|
|
7091
|
+
}
|
|
7092
|
+
body { padding: 24px; display: flex; justify-content: center; }
|
|
7093
|
+
|
|
7094
|
+
/* Animated gradient mesh */
|
|
7095
|
+
.bg-mesh {
|
|
7096
|
+
position: fixed; inset: 0; z-index: 0;
|
|
7097
|
+
overflow: hidden; pointer-events: none;
|
|
7098
|
+
}
|
|
7099
|
+
.bg-mesh::before, .bg-mesh::after {
|
|
7100
|
+
content: ""; position: absolute; border-radius: 50%;
|
|
7101
|
+
filter: blur(120px); opacity: 0.4;
|
|
7102
|
+
}
|
|
7103
|
+
.bg-mesh::before {
|
|
7104
|
+
width: 600px; height: 600px;
|
|
7105
|
+
background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
|
|
7106
|
+
top: -10%; left: -5%;
|
|
7107
|
+
animation: meshDrift1 18s ease-in-out infinite alternate;
|
|
7108
|
+
}
|
|
7109
|
+
.bg-mesh::after {
|
|
7110
|
+
width: 500px; height: 500px;
|
|
7111
|
+
background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
|
|
7112
|
+
bottom: -10%; right: -5%;
|
|
7113
|
+
animation: meshDrift2 22s ease-in-out infinite alternate;
|
|
7114
|
+
}
|
|
7115
|
+
.bg-mesh-extra {
|
|
7116
|
+
position: absolute; width: 400px; height: 400px;
|
|
7117
|
+
border-radius: 50%; filter: blur(100px); opacity: 0.3;
|
|
7118
|
+
background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
|
|
7119
|
+
top: 50%; left: 60%;
|
|
7120
|
+
animation: meshDrift3 15s ease-in-out infinite alternate;
|
|
7121
|
+
}
|
|
7122
|
+
@keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
|
|
7123
|
+
@keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
|
|
7124
|
+
@keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }
|
|
7125
|
+
|
|
7126
|
+
/* Report container */
|
|
7127
|
+
.report {
|
|
7128
|
+
position: relative; z-index: 1;
|
|
7129
|
+
width: 100%; max-width: 960px;
|
|
7130
|
+
}
|
|
7131
|
+
|
|
7132
|
+
/* Header */
|
|
7133
|
+
.report-header {
|
|
7134
|
+
display: flex; justify-content: space-between; align-items: center;
|
|
7135
|
+
padding: 20px 0; margin-bottom: 8px;
|
|
7136
|
+
}
|
|
7137
|
+
.report-brand {
|
|
7138
|
+
display: flex; align-items: center; gap: 10px;
|
|
7139
|
+
text-decoration: none; color: var(--muted);
|
|
7140
|
+
font-weight: 600; font-size: 14px;
|
|
7141
|
+
letter-spacing: 0.04em; text-transform: uppercase;
|
|
7142
|
+
}
|
|
7143
|
+
.report-brand:hover { color: var(--text); }
|
|
7144
|
+
.brand-icon {
|
|
7145
|
+
width: 32px; height: 32px; border-radius: 8px;
|
|
7146
|
+
background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
|
|
7147
|
+
border: 1px solid rgba(245,158,11,0.3);
|
|
7148
|
+
display: flex; align-items: center; justify-content: center;
|
|
7149
|
+
font-size: 16px;
|
|
7150
|
+
}
|
|
7151
|
+
.report-meta {
|
|
7152
|
+
font-size: 12px; color: var(--muted);
|
|
7153
|
+
text-align: right; line-height: 1.6;
|
|
7154
|
+
}
|
|
7155
|
+
|
|
7156
|
+
/* Task tabs */
|
|
7157
|
+
.task-tabs {
|
|
7158
|
+
display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
|
|
7159
|
+
}
|
|
7160
|
+
.task-tab {
|
|
7161
|
+
padding: 6px 16px; border-radius: 999px;
|
|
7162
|
+
border: 1px solid var(--border);
|
|
7163
|
+
background: transparent; color: var(--muted);
|
|
7164
|
+
font-size: 13px; font-weight: 500; cursor: pointer;
|
|
7165
|
+
transition: all 150ms ease;
|
|
7166
|
+
}
|
|
7167
|
+
.task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
|
|
7168
|
+
.task-tab.active {
|
|
7169
|
+
background: var(--accent-soft);
|
|
7170
|
+
border-color: rgba(245,158,11,0.4);
|
|
7171
|
+
color: var(--accent);
|
|
7172
|
+
}
|
|
7173
|
+
|
|
7174
|
+
/* Task sections */
|
|
7175
|
+
.task-section { display: none; }
|
|
7176
|
+
.task-section.active { display: block; }
|
|
7177
|
+
.task-name {
|
|
7178
|
+
font-size: 18px; font-weight: 600;
|
|
7179
|
+
margin-bottom: 12px; letter-spacing: -0.01em;
|
|
7180
|
+
}
|
|
7181
|
+
|
|
7182
|
+
/* Results table */
|
|
7183
|
+
.results-table {
|
|
7184
|
+
width: 100%; border-collapse: collapse;
|
|
7185
|
+
font-size: 13px; margin-bottom: 16px;
|
|
7186
|
+
border-radius: var(--radius); overflow: hidden;
|
|
7187
|
+
border: 1px solid var(--border);
|
|
7188
|
+
}
|
|
7189
|
+
.results-table th, .results-table td {
|
|
7190
|
+
padding: 10px 14px;
|
|
7191
|
+
text-align: left;
|
|
7192
|
+
border-bottom: 1px solid var(--border);
|
|
7193
|
+
}
|
|
7194
|
+
.results-table th {
|
|
7195
|
+
background: rgba(0,0,0,0.3);
|
|
7196
|
+
font-size: 11px; font-weight: 600;
|
|
7197
|
+
text-transform: uppercase; letter-spacing: 0.05em;
|
|
7198
|
+
color: var(--muted); cursor: pointer;
|
|
7199
|
+
user-select: none; white-space: nowrap;
|
|
7200
|
+
}
|
|
7201
|
+
.results-table th:hover { color: var(--text); }
|
|
7202
|
+
.results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
|
|
7203
|
+
.results-table tbody tr {
|
|
7204
|
+
background: var(--panel);
|
|
7205
|
+
transition: background 120ms ease;
|
|
7206
|
+
}
|
|
7207
|
+
.results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
|
|
7208
|
+
.results-table tbody tr:last-child td { border-bottom: none; }
|
|
7209
|
+
|
|
7210
|
+
/* Score cell with progress bar */
|
|
7211
|
+
.score-cell { position: relative; min-width: 90px; }
|
|
7212
|
+
.score-bar {
|
|
7213
|
+
position: absolute; left: 0; bottom: 0;
|
|
7214
|
+
height: 3px; border-radius: 2px;
|
|
7215
|
+
transition: width 300ms ease;
|
|
7216
|
+
}
|
|
7217
|
+
.score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }
|
|
7218
|
+
|
|
7219
|
+
/* Color ranking */
|
|
7220
|
+
.rank-best { color: var(--green); font-weight: 600; }
|
|
7221
|
+
.rank-worst { color: var(--red); }
|
|
7222
|
+
.rank-mid { color: var(--yellow); }
|
|
7223
|
+
.rank-neutral { color: var(--text); }
|
|
7224
|
+
.rank-error { color: var(--muted); }
|
|
7225
|
+
|
|
7226
|
+
/* Winner banner */
|
|
7227
|
+
.task-winner {
|
|
7228
|
+
display: flex; align-items: center; gap: 10px;
|
|
7229
|
+
padding: 12px 18px; margin-bottom: 20px;
|
|
7230
|
+
border-radius: var(--radius);
|
|
7231
|
+
background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
|
|
7232
|
+
border: 1px solid rgba(34,197,94,0.2);
|
|
7233
|
+
font-size: 14px; font-weight: 500;
|
|
7234
|
+
}
|
|
7235
|
+
.task-winner .trophy { font-size: 20px; }
|
|
7236
|
+
.task-winner .winner-name { color: var(--green); font-weight: 600; }
|
|
7237
|
+
.task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }
|
|
7238
|
+
|
|
7239
|
+
/* Summary cards */
|
|
7240
|
+
.summary-section { margin-top: 32px; }
|
|
7241
|
+
.summary-title {
|
|
7242
|
+
font-size: 16px; font-weight: 600;
|
|
7243
|
+
margin-bottom: 12px; color: var(--text);
|
|
7244
|
+
}
|
|
7245
|
+
.summary-cards {
|
|
7246
|
+
display: grid;
|
|
7247
|
+
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
|
|
7248
|
+
gap: 12px;
|
|
7249
|
+
}
|
|
7250
|
+
.summary-card {
|
|
7251
|
+
padding: 16px; border-radius: var(--radius);
|
|
7252
|
+
border: 1px solid var(--border);
|
|
7253
|
+
background: var(--panel);
|
|
7254
|
+
}
|
|
7255
|
+
.summary-card .card-label {
|
|
7256
|
+
font-size: 11px; font-weight: 600;
|
|
7257
|
+
text-transform: uppercase; letter-spacing: 0.05em;
|
|
7258
|
+
color: var(--muted); margin-bottom: 6px;
|
|
7259
|
+
}
|
|
7260
|
+
.summary-card .card-value {
|
|
7261
|
+
font-size: 20px; font-weight: 700;
|
|
7262
|
+
color: var(--green); font-family: var(--mono);
|
|
7263
|
+
}
|
|
7264
|
+
.summary-card .card-provider {
|
|
7265
|
+
font-size: 12px; color: var(--muted); margin-top: 4px;
|
|
7266
|
+
}
|
|
7267
|
+
|
|
7268
|
+
/* Errors */
|
|
7269
|
+
.errors-section { margin-top: 24px; }
|
|
7270
|
+
.errors-title {
|
|
7271
|
+
font-size: 16px; font-weight: 600;
|
|
7272
|
+
margin-bottom: 8px; color: var(--red);
|
|
7273
|
+
cursor: pointer;
|
|
7274
|
+
}
|
|
7275
|
+
.errors-list {
|
|
7276
|
+
border-radius: var(--radius);
|
|
7277
|
+
border: 1px solid rgba(239,68,68,0.2);
|
|
7278
|
+
background: rgba(239,68,68,0.04);
|
|
7279
|
+
overflow: hidden;
|
|
7280
|
+
}
|
|
7281
|
+
.error-item {
|
|
7282
|
+
padding: 10px 16px;
|
|
7283
|
+
border-bottom: 1px solid rgba(239,68,68,0.1);
|
|
7284
|
+
font-size: 13px;
|
|
7285
|
+
}
|
|
7286
|
+
.error-item:last-child { border-bottom: none; }
|
|
7287
|
+
.error-provider { font-weight: 600; color: var(--text); }
|
|
7288
|
+
.error-msg { color: var(--muted); margin-left: 8px; }
|
|
7289
|
+
.error-count { color: var(--muted); font-size: 11px; }
|
|
7290
|
+
.error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }
|
|
7291
|
+
|
|
7292
|
+
/* Footer */
|
|
7293
|
+
.report-footer {
|
|
7294
|
+
margin-top: 40px; padding: 20px 0;
|
|
7295
|
+
border-top: 1px solid var(--border);
|
|
7296
|
+
display: flex; justify-content: space-between; align-items: center;
|
|
7297
|
+
flex-wrap: wrap; gap: 12px;
|
|
7298
|
+
}
|
|
7299
|
+
.footer-brand {
|
|
7300
|
+
font-size: 13px; color: var(--muted);
|
|
7301
|
+
}
|
|
7302
|
+
.footer-brand a {
|
|
7303
|
+
color: var(--accent); text-decoration: none; font-weight: 500;
|
|
7304
|
+
}
|
|
7305
|
+
.footer-brand a:hover { text-decoration: underline; }
|
|
7306
|
+
.footer-cta {
|
|
7307
|
+
display: inline-flex; align-items: center; gap: 6px;
|
|
7308
|
+
padding: 6px 14px; border-radius: 8px;
|
|
7309
|
+
background: var(--accent-soft);
|
|
7310
|
+
border: 1px solid rgba(245,158,11,0.3);
|
|
7311
|
+
color: var(--accent); font-size: 12px; font-weight: 500;
|
|
7312
|
+
text-decoration: none;
|
|
7313
|
+
transition: transform 120ms ease, box-shadow 120ms ease;
|
|
7314
|
+
}
|
|
7315
|
+
.footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }
|
|
7316
|
+
|
|
7317
|
+
/* Empty state */
|
|
7318
|
+
.empty-msg {
|
|
7319
|
+
text-align: center; color: var(--muted);
|
|
7320
|
+
padding: 60px 20px; font-size: 16px;
|
|
7321
|
+
}
|
|
7322
|
+
|
|
7323
|
+
/* Responsive */
|
|
7324
|
+
@media (max-width: 640px) {
|
|
7325
|
+
body { padding: 12px; }
|
|
7326
|
+
.report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
|
|
7327
|
+
.report-meta { text-align: left; }
|
|
7328
|
+
.summary-cards { grid-template-columns: 1fr; }
|
|
7329
|
+
.results-table { font-size: 12px; }
|
|
7330
|
+
.results-table th, .results-table td { padding: 8px 10px; }
|
|
7331
|
+
.report-footer { flex-direction: column; align-items: flex-start; }
|
|
7332
|
+
}
|
|
7333
|
+
</style>`;
|
|
7334
|
+
}
|
|
7335
|
+
/**
 * Builds the report header: the brand link plus a meta line with the
 * provider count, task count, runs label and the generation time.
 *
 * @param {string} runsLabel - Description of the run count; HTML-escaped here.
 * @param {number} providerCount - Number of providers in the report.
 * @param {number} taskCount - Number of tasks in the report.
 * @returns {string} The `<header>` markup.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  // ISO timestamp trimmed to second precision, e.g. "2024-01-01 12:00:00 UTC".
  const isoStamp = new Date().toISOString();
  const generatedAt = isoStamp.replace("T", " ").slice(0, 19) + " UTC";
  const providerPlural = providerCount !== 1 ? "s" : "";
  const taskPlural = taskCount !== 1 ? "s" : "";
  return `<header class="report-header">
<a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">
<div class="brand-icon">⬡</div>
<span>Agent Duelist</span>
</a>
<div class="report-meta">
${providerCount} provider${providerPlural} ·
${taskCount} task${taskPlural} ·
${esc(runsLabel)}<br>
${esc(generatedAt)}
</div>
</header>`;
}
|
|
7350
|
+
/**
 * Renders one tab button per task; the first tab starts out active.
 *
 * @param {string[]} tasks - Task names in display order (HTML-escaped here).
 * @returns {string} The `<nav>` markup containing the tab buttons.
 */
function renderTabs(tasks) {
  const toButton = (taskName, position) => {
    const activeSuffix = position === 0 ? " active" : "";
    return `<button class="task-tab${activeSuffix}" data-task="${position}">${esc(taskName)}</button>`;
  };
  const buttonMarkup = tasks.map(toButton).join("\n ");
  return `<nav class="task-tabs">
${buttonMarkup}
</nav>`;
}
|
|
7358
|
+
/**
 * Renders one task's section: heading, results table and optional winner banner.
 *
 * @param {string} task - Task name shown as the section heading.
 * @param {Array<object>} providerData - One entry per provider; reads
 *   `providerId` and `allErrors` here and passes the entry to renderDataCell.
 * @param {Map<string, object>} columnStats - Per-column stats forwarded to renderDataCell.
 * @param {Map<string, string>} medals - Medal kind keyed by provider id.
 * @param {string|undefined|null} winnerId - Winning provider id; falsy hides the banner.
 * @param {string[]} scorerNames - Scorer names that determine the columns.
 * @param {boolean} _hasCost - Unused.
 * @param {boolean} multi - Forwarded to renderDataCell.
 * @param {number} index - Section index; section 0 is rendered active.
 * @returns {string} The `<section>` markup.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  const column = (label, key, isScore) => ({ label, key, isScore });

  // "latency" and "cost" get dedicated columns (cost also adds a tokens
  // column); every other scorer is rendered as a score column.
  const cols = [column("Provider", "provider", false)];
  for (const name of scorerNames) {
    switch (name) {
      case "latency":
        cols.push(column("Latency", "latency", false));
        break;
      case "cost":
        cols.push(column("Cost", "cost", false));
        cols.push(column("Tokens", "tokens", false));
        break;
      default:
        cols.push(column(scorerLabel(name), name, true));
    }
  }
  const dataCols = cols.slice(1);

  const headerCells = cols
    .map(({ key, label }) => `<th data-col="${esc(key)}">${esc(label)}<span class="sort-arrow"></span></th>`)
    .join("");

  const bodyRows = providerData
    .map((pd) => {
      const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
      const medalPrefix = medal ? `${medal} ` : "";
      const nameCell = `<td>${medalPrefix}${esc(pd.providerId)}</td>`;
      // A provider whose runs all failed gets a dash in every data column.
      const dataCells = pd.allErrors
        ? dataCols.map(() => `<td class="rank-error">—</td>`)
        : dataCols.map((col) => renderDataCell(col.key, col.isScore, pd, columnStats, multi));
      return `<tr>${[nameCell, ...dataCells].join("")}</tr>`;
    })
    .join("\n");

  let winnerBanner = "";
  if (winnerId) {
    winnerBanner = `<div class="task-winner">
<span class="trophy">🏆</span>
<span>Winner: <span class="winner-name">${esc(winnerId)}</span>
<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>
</div>`;
  }

  const activeSuffix = index === 0 ? " active" : "";
  return `<section class="task-section${activeSuffix}" data-task-idx="${index}">
<h2 class="task-name">${esc(task)}</h2>
<table class="results-table">
<thead><tr>${headerCells}</tr></thead>
<tbody>${bodyRows}</tbody>
</table>
${winnerBanner}
</section>`;
}
|
|
7405
|
+
/**
 * Renders a single `<td>` for one provider and one column.
 *
 * @param {string} key - Column key: "latency", "cost", "tokens" or a scorer name.
 * @param {boolean} _isScore - Unused.
 * @param {object} pd - Provider row; reads `latencyMs`, `avgDetails.costUsd`,
 *   `avgDetails.totalTokens` and `avgScores[key]`.
 * @param {Map<string, object>} columnStats - Per-column stats passed to rankClass_.
 * @param {boolean} multi - When true and stats exist for the column, the cell is
 *   ranked relative to the column; otherwise fixed classes are used.
 * @returns {string} The cell markup; a dash cell when the value is undefined.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const missingCell = `<td class="rank-error">—</td>`;
  const stats = columnStats.get(key);
  const relativeRank = (value) => (multi && stats ? rankClass_(value, stats) : "rank-neutral");

  switch (key) {
    case "latency": {
      const latency = pd.latencyMs;
      if (latency === undefined) return missingCell;
      return `<td class="${relativeRank(latency)}" data-sort-val="${latency}">${Math.round(latency)}ms</td>`;
    }
    case "cost": {
      const usd = pd.avgDetails.costUsd;
      if (usd === undefined) return missingCell;
      return `<td class="${relativeRank(usd)}" data-sort-val="${usd}">${esc(formatCost(usd))}</td>`;
    }
    case "tokens": {
      const tokenCount = pd.avgDetails.totalTokens;
      if (tokenCount === undefined) return missingCell;
      return `<td class="${relativeRank(tokenCount)}" data-sort-val="${tokenCount}">${tokenCount}</td>`;
    }
    default:
      break;
  }

  // Score column: percentage text plus a progress bar coloured by absolute tier.
  const score = pd.avgScores[key];
  if (score === undefined) return missingCell;
  const percent = Math.round(score * 100);
  const tier = score >= 0.8 ? "best" : score >= 0.5 ? "mid" : "worst";
  const tierColors = { best: "var(--green)", mid: "var(--yellow)", worst: "var(--red)" };
  const rankCls = multi && stats ? rankClass_(score, stats) : `rank-${tier}`;
  const barColor = tierColors[tier];
  return `<td class="score-cell ${rankCls}" data-sort-val="${score}">
<span class="score-val">${percent}%</span>
<div class="score-bar" style="width:${percent}%;background:${barColor}"></div>
</td>`;
}
|
|
7440
|
+
/**
 * Maps a value to a ranking CSS class relative to a column's best/worst values.
 *
 * @param {number} value - The cell value.
 * @param {{best?: number, worst?: number}} colStats - Column extremes.
 * @returns {string} "rank-neutral" when either extreme is missing or they are
 *   equal; otherwise "rank-best", "rank-worst" or "rank-mid".
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  const cannotRank = best === undefined || worst === undefined || best === worst;
  if (cannotRank) {
    return "rank-neutral";
  }
  if (value === best) {
    return "rank-best";
  }
  return value === worst ? "rank-worst" : "rank-mid";
}
|
|
7447
|
+
/**
 * Renders the summary cards (correctness, latency, cost, overall winner).
 *
 * @param {{id: string, avg: number}|null|undefined} byCorrectness - Best correctness entry.
 * @param {{id: string, avg: number}|null|undefined} byLatency - Best latency entry;
 *   skipped when its `avg` is Infinity.
 * @param {{id: string, avg?: number}|null|undefined} byCost - Best cost entry;
 *   skipped when `avg` is undefined.
 * @param {string|null|undefined} overallWinner - Overall winning provider id.
 * @param {boolean} multi - When true, cards use comparative labels and name the provider.
 * @returns {string} The `<section>` markup, or "" when no card applies.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  const providerLine = (id) => `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  const card = (label, value, providerHtml) => `<div class="summary-card">
<div class="card-label">${label}</div>
<div class="card-value">${value}</div>
${providerHtml}
</div>`;

  const cards = [];

  if (byCorrectness) {
    const percentText = `${Math.round(byCorrectness.avg * 100)}%`;
    const who = multi ? providerLine(byCorrectness.id) : "";
    cards.push(card(multi ? "Most Correct" : "Avg Correctness", percentText, who));
  }

  if (byLatency && byLatency.avg !== Infinity) {
    const latencyText = `${Math.round(byLatency.avg)}ms`;
    const who = multi ? providerLine(byLatency.id) : "";
    cards.push(card(multi ? "Fastest" : "Avg Latency", latencyText, who));
  }

  if (byCost?.avg !== undefined) {
    const costText = esc(formatCost(byCost.avg));
    const who = multi ? providerLine(byCost.id) : "";
    cards.push(card(multi ? "Cheapest" : "Avg Cost", costText, who));
  }

  if (overallWinner) {
    cards.push(card("Overall Winner", "🏆", providerLine(overallWinner)));
  }

  if (cards.length === 0) {
    return "";
  }
  return `<section class="summary-section">
<h2 class="summary-title">Summary</h2>
<div class="summary-cards">
${cards.join("\n ")}
</div>
</section>`;
}
|
|
7491
|
+
/**
 * Renders the collapsible "Errors" section of the report.
 *
 * Clicking the title toggles the visibility of the list that follows it.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   Error entries; provider id, message and hint are HTML-escaped here.
 * @returns {string} The `<section>` markup.
 */
function renderErrors(errors) {
  const items = errors
    .map((e5) => {
      // Only show the repeat counter when the same error occurred more than once.
      const suffix = e5.count > 1 ? ` <span class="error-count">(×${e5.count})</span>` : "";
      const hint = e5.hint ? `<div class="error-hint">${esc(e5.hint)}</div>` : "";
      return `<div class="error-item">
<span class="error-provider">${esc(e5.providerId)}:</span>
<span class="error-msg">${esc(e5.error)}</span>${suffix}
${hint}
</div>`;
    })
    .join("\n");
  // The handler must alternate between 'none' and 'block'; the previous
  // version assigned 'block' in both branches, so the list never collapsed.
  return `<section class="errors-section">
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
<div class="errors-list">
${items}
</div>
</section>`;
}
|
|
7508
|
+
/**
 * Renders the static report footer: a "Powered by" credit and a
 * call-to-action link, both pointing at the project's GitHub repository.
 *
 * @returns {string} The `<footer>` markup.
 */
function renderFooter() {
  const repoUrl = "https://github.com/DataGobes/agent-duelist";
  const linkAttrs = `href="${repoUrl}" target="_blank" rel="noopener"`;
  const lines = [
    '<footer class="report-footer">',
    '<div class="footer-brand">',
    `Powered by <a ${linkAttrs}>Agent Duelist</a>`,
    "</div>",
    `<a class="footer-cta" ${linkAttrs}>`,
    "⭐ Star on GitHub",
    "</a>",
    "</footer>",
  ];
  return lines.join("\n");
}
|
|
7518
|
+
/**
 * Produces the inline `<script>` that powers the HTML report: task tab
 * switching (only emitted when there is more than one task) and
 * click-to-sort on every results table header.
 *
 * @param {number} taskCount - Number of task sections in the report.
 * @returns {string} The `<script>` element markup.
 */
function renderScript(taskCount) {
  const tabSwitching = taskCount > 1 ? `
  var tabs = document.querySelectorAll('.task-tab');
  var sections = document.querySelectorAll('.task-section');
  tabs.forEach(function(tab) {
    tab.addEventListener('click', function() {
      var idx = parseInt(tab.getAttribute('data-task'), 10);
      tabs.forEach(function(t) { t.classList.remove('active'); });
      sections.forEach(function(s) { s.classList.remove('active'); });
      tab.classList.add('active');
      if (sections[idx]) sections[idx].classList.add('active');
    });
  });` : "";
  return `<script>
(function() {
  /* Tab switching */
  ${tabSwitching}

  /* Column sorting */
  document.querySelectorAll('.results-table th').forEach(function(th) {
    var table = th.closest('table');
    var asc = true;
    th.addEventListener('click', function() {
      /* Use the header's position inside its own row. The forEach index runs
         across every table in the document, so it is only a valid column
         index for the first table. */
      var colIdx = th.cellIndex;
      var tbody = table.querySelector('tbody');
      var rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort(function(a, b) {
        var aCell = a.children[colIdx];
        var bCell = b.children[colIdx];
        var aVal = aCell.getAttribute('data-sort-val');
        var bVal = bCell.getAttribute('data-sort-val');
        if (aVal !== null && bVal !== null) {
          return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
        }
        /* Cells without a numeric sort value always sink to the bottom so
           the comparator stays consistent in mixed columns. */
        if (aVal !== null) return -1;
        if (bVal !== null) return 1;
        var aText = aCell.textContent || '';
        var bText = bCell.textContent || '';
        return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
      });
      rows.forEach(function(row) { tbody.appendChild(row); });

      /* Update sort arrows */
      table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
      th.querySelector('.sort-arrow').textContent = asc ? ' \\u25B2' : ' \\u25BC';
      asc = !asc;
    });
  });
})();
</script>`;
}
|
|
7565
|
+
|
|
7566
|
+
// src/ci.ts
|
|
7567
|
+
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
7568
|
+
import { dirname } from "path";
|
|
7569
|
+
// Scorer names for which a smaller value is the better outcome; compareResults
// consults this set to pick the direction of regression/improvement checks.
var LOWER_IS_BETTER = /* @__PURE__ */ new Set(["cost"]);
|
|
7570
|
+
// A multi-sample result is marked flaky when its coefficient of variation
// (stddev / |mean|) is strictly greater than this value.
var FLAKY_CV_THRESHOLD = 0.3;
|
|
7571
|
+
// Critical t values keyed by degrees of freedom. The entries match the
// standard two-sided 95% Student's t table; the table is sparse above df = 10.
var T_CRITICAL_95 = {
  1: 12.706,
  2: 4.303,
  3: 3.182,
  4: 2.776,
  5: 2.571,
  6: 2.447,
  7: 2.365,
  8: 2.306,
  9: 2.262,
  10: 2.228,
  15: 2.131,
  20: 2.086,
  25: 2.06,
  30: 2.042
};

// The table's degrees-of-freedom keys as ascending numbers, used for interpolation.
var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95)
  .map(Number)
  .sort((left, right) => left - right);

/**
 * Looks up the critical t value for the given degrees of freedom.
 *
 * Exact table entries are returned as-is; values that fall strictly between
 * two table keys are linearly interpolated; everything else (df <= 0, df
 * beyond the table, df below the first key) falls back to 1.96.
 *
 * @param {number} df - Degrees of freedom.
 * @returns {number} The critical value.
 */
function tCritical(df) {
  const fallback = 1.96;
  if (df <= 0) {
    return fallback;
  }
  const tabulated = T_CRITICAL_95[df];
  if (tabulated !== undefined) {
    return tabulated;
  }
  // Find the pair of neighbouring keys that brackets df and interpolate.
  for (let upper = 1; upper < T_CRITICAL_KEYS.length; upper++) {
    const low = T_CRITICAL_KEYS[upper - 1];
    const high = T_CRITICAL_KEYS[upper];
    if (low < df && df < high) {
      const ratio = (df - low) / (high - low);
      return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
    }
  }
  return fallback;
}
|
|
7602
|
+
/**
 * Summarises a list of numeric samples.
 *
 * @param {number[]} samples - Sample values.
 * @returns {{mean: number, stddev: number, cv: number, n: number, ci95Lower: number, ci95Upper: number}}
 *   Mean, sample standard deviation (n - 1 denominator), coefficient of
 *   variation (0 when the mean is 0) and a confidence interval built from
 *   tCritical(n - 1). With zero samples everything is 0; with one sample the
 *   interval collapses onto the mean.
 */
function computeScorerStats(samples) {
  const count = samples.length;
  if (count === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  const total = samples.reduce((acc, sample) => acc + sample, 0);
  const mean = total / count;
  if (count === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  const squaredDeviations = samples.reduce((acc, sample) => acc + (sample - mean) ** 2, 0);
  const stddev = Math.sqrt(squaredDeviations / (count - 1));
  const standardError = stddev / Math.sqrt(count);
  const halfWidth = tCritical(count - 1) * standardError;

  return {
    mean,
    stddev,
    cv: mean !== 0 ? stddev / Math.abs(mean) : 0,
    n: count,
    ci95Lower: mean - halfWidth,
    ci95Upper: mean + halfWidth
  };
}
|
|
7625
|
+
/**
 * Builds the composite map key for one provider/task/scorer combination.
 *
 * @param {string} providerId - Provider identifier.
 * @param {string} taskName - Task name.
 * @param {string} scorerName - Scorer name.
 * @returns {string} The three parts joined with "::".
 */
function groupKey(providerId, taskName, scorerName) {
  const separator = "::";
  return "".concat(providerId, separator, taskName, separator, scorerName);
}
|
|
7628
|
+
/**
 * Groups score values by provider/task/scorer and computes stats per group.
 *
 * Results carrying an `error` and scores with a negative value are ignored.
 *
 * @param {Array<object>} results - Run results; reads `error`, `providerId`,
 *   `taskName` and `scores` (each with `name` and `value`).
 * @returns {Map<string, object>} Stats from computeScorerStats keyed by groupKey.
 */
function computeStats(results) {
  const samplesByKey = new Map();
  for (const result of results) {
    if (result.error) {
      continue;
    }
    for (const score of result.scores) {
      if (score.value < 0) {
        continue;
      }
      const key = groupKey(result.providerId, result.taskName, score.name);
      const bucket = samplesByKey.get(key);
      if (bucket) {
        bucket.push(score.value);
      } else {
        samplesByKey.set(key, [score.value]);
      }
    }
  }

  const stats = new Map();
  samplesByKey.forEach((samples, key) => {
    stats.set(key, computeScorerStats(samples));
  });
  return stats;
}
|
|
7645
|
+
/**
 * Totals the estimated USD cost of a run, overall and per provider.
 *
 * A result contributes only when it has no `error`, has a "cost" score with a
 * non-negative value, and that score's `details.estimatedUsd` is not <= 0.
 *
 * @param {Array<object>} results - Run results.
 * @param {number} [budget] - Optional spending limit in USD.
 * @returns {{totalUsd: number, perProvider: Map<string, number>, budget: (number|undefined), overBudget: boolean}}
 *   `overBudget` is true only when a budget is given and the total exceeds it.
 */
function computeCostSummary(results, budget) {
  const perProvider = new Map();
  let totalUsd = 0;

  for (const result of results) {
    if (result.error) {
      continue;
    }
    const costScore = result.scores.find((score) => score.name === "cost");
    const isUsable = Boolean(costScore) && !(costScore.value < 0);
    if (!isUsable) {
      continue;
    }
    const usd = costScore.details?.estimatedUsd ?? 0;
    if (usd <= 0) {
      continue;
    }
    totalUsd += usd;
    const spentSoFar = perProvider.get(result.providerId) ?? 0;
    perProvider.set(result.providerId, spentSoFar + usd);
  }

  const overBudget = budget !== undefined && totalUsd > budget;
  return { totalUsd, perProvider, budget, overBudget };
}
|
|
7665
|
+
/**
 * Compares the current run's stats against a baseline and decides whether
 * the run fails.
 *
 * @param {Map<string, object>|null|undefined} baselineStats - Baseline stats keyed by
 *   groupKey; may be absent.
 * @param {Map<string, object>} currentStats - Current stats keyed by groupKey.
 * @param {Map<string, number>} thresholds - Threshold per scorer name; scorers
 *   without one are never marked regressed or improved.
 * @param {number} [budget] - Optional cost budget in USD.
 * @param {Array<object>} [currentResults] - Raw results used for the cost summary.
 * @returns {{comparisons: object[], cost: object, failed: boolean, flakyResults: object[], failureReasons: string[]}}
 *   The run fails when any comparison regressed or the total cost is over budget.
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  const comparisons = [];
  for (const [key, current] of currentStats) {
    const [providerId, taskName, scorerName] = key.split("::");
    const baseline = baselineStats?.get(key) ?? null;

    // Thresholds are only consulted when there is a baseline to compare with.
    const threshold = baseline ? thresholds.get(scorerName) : undefined;
    const hasThreshold = threshold !== undefined;
    const lowerIsBetter = hasThreshold && LOWER_IS_BETTER.has(scorerName);

    comparisons.push({
      providerId,
      taskName,
      scorerName,
      baseline,
      current,
      delta: baseline ? current.mean - baseline.mean : null,
      regressed: hasThreshold ? detectRegression(baseline, current, threshold, lowerIsBetter) : false,
      improved: hasThreshold ? detectImprovement(baseline, current, threshold, lowerIsBetter) : false,
      flaky: current.n > 1 && current.cv > FLAKY_CV_THRESHOLD
    });
  }

  const cost = computeCostSummary(currentResults ?? [], budget);

  // Regression messages come first, followed by the budget message if any.
  const failureReasons = comparisons
    .filter((entry) => entry.regressed)
    .map(
      (entry) =>
        `${entry.providerId} \xD7 ${entry.taskName}: ${entry.scorerName} regressed by ${formatDelta(entry.delta)}`
    );
  if (cost.overBudget) {
    failureReasons.push(
      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
    );
  }

  return {
    comparisons,
    cost,
    failed: failureReasons.length > 0,
    flakyResults: comparisons.filter((entry) => entry.flaky),
    failureReasons
  };
}
|
|
7714
|
+
/**
 * Decides whether the current stats are a regression relative to the baseline.
 *
 * With a single sample on both sides the raw mean difference is compared with
 * the threshold. Otherwise the confidence intervals must be separated, in the
 * "worse" direction, by more than the threshold. This mirrors
 * detectImprovement with the roles of baseline and current swapped.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold - Minimum change that counts as a regression.
 * @param {boolean} lowerIsBetter - True for metrics where smaller is better.
 * @returns {boolean} True when the metric regressed.
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  if (baseline.n === 1 && current.n === 1) {
    const delta = current.mean - baseline.mean;
    return lowerIsBetter ? delta > threshold : delta < -threshold;
  }
  if (lowerIsBetter) {
    // Value went up: the whole current interval sits above the baseline interval.
    return current.ci95Lower - baseline.ci95Upper > threshold;
  }
  // Value went down: the whole current interval must sit below the baseline
  // interval. The previous check used baseline.ci95Upper - current.ci95Lower,
  // which grows with the interval widths and therefore flagged any small mean
  // drop on noisy, overlapping data.
  return baseline.ci95Lower - current.ci95Upper > threshold;
}
|
|
7725
|
+
/**
 * Decides whether the current stats are an improvement over the baseline.
 *
 * With a single sample on both sides the raw mean difference is compared with
 * the threshold; otherwise the confidence intervals must be separated, in the
 * "better" direction, by more than the threshold.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold - Minimum change that counts as an improvement.
 * @param {boolean} lowerIsBetter - True for metrics where smaller is better.
 * @returns {boolean} True when the metric improved.
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  const bothSingleSample = baseline.n === 1 && current.n === 1;
  if (bothSingleSample) {
    const change = current.mean - baseline.mean;
    return lowerIsBetter ? change < -threshold : change > threshold;
  }
  // Gap between the intervals, measured in the direction that counts as better.
  const separation = lowerIsBetter
    ? baseline.ci95Lower - current.ci95Upper
    : current.ci95Lower - baseline.ci95Upper;
  return separation > threshold;
}
|
|
7736
|
+
/**
 * Reads a baseline file from disk.
 *
 * Accepts either an object with a `results` array (and optional `timestamp`)
 * or a bare array of results.
 *
 * @param {string} path - Path of the JSON baseline file.
 * @returns {{timestamp: string, results: Array<object>}|null} The baseline, or
 *   null when the file cannot be read or parsed, or holds no results array.
 */
function loadBaseline(path) {
  try {
    const parsed = JSON.parse(readFileSync(path, "utf-8"));
    const results = parsed.results ?? parsed;
    return Array.isArray(results)
      ? { timestamp: parsed.timestamp ?? "unknown", results }
      : null;
  } catch {
    // Missing file, invalid JSON or an unexpected shape: report "no baseline".
    return null;
  }
}
|
|
7750
|
+
/**
 * Writes the results to disk as a baseline file, creating the parent
 * directory when needed.
 *
 * @param {string} path - Destination file path.
 * @param {Array<object>} results - Results to store.
 * @returns {void}
 */
function saveBaseline(path, results) {
  const parentDir = dirname(path);
  mkdirSync(parentDir, { recursive: true });
  // Stored as pretty-printed JSON alongside the time of writing.
  const serialized = JSON.stringify(
    { timestamp: new Date().toISOString(), results },
    null,
    2
  );
  writeFileSync(path, serialized);
}
|
|
7758
|
+
|
|
7759
|
+
// src/github.ts
|
|
7760
|
+
import { readFileSync as readFileSync2 } from "fs";
|
|
7761
|
+
/**
 * Collects what is needed to comment on a pull request from the environment.
 *
 * Reads GITHUB_TOKEN and GITHUB_REPOSITORY ("owner/repo"). The PR number is
 * taken from the event payload at GITHUB_EVENT_PATH (a `pull_request` event,
 * or an `issue` event whose issue is a pull request) and otherwise from
 * DUELIST_PR_NUMBER.
 *
 * @returns {{token: string, owner: string, repo: string, prNumber: number}|null}
 *   The context, or null when any required piece is missing.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath,
  } = process.env;
  if (!token || !repository) {
    return null;
  }

  const [owner, repo] = repository.split("/");
  if (!owner || !repo) {
    return null;
  }

  // Extracts the PR number from the event payload; an unreadable or malformed
  // payload simply yields no number so the env fallback below can apply.
  const prNumberFromEvent = () => {
    try {
      const event = JSON.parse(readFileSync2(eventPath, "utf-8"));
      let found;
      if (event.pull_request && typeof event.pull_request === "object") {
        found = event.pull_request.number;
      }
      if (!found && event.issue && typeof event.issue === "object" && event.issue.pull_request) {
        found = event.issue.number;
      }
      return found;
    } catch {
      return undefined;
    }
  };

  let prNumber = eventPath ? prNumberFromEvent() : undefined;
  if (!prNumber && process.env.DUELIST_PR_NUMBER) {
    prNumber = parseInt(process.env.DUELIST_PR_NUMBER, 10);
  }
  return prNumber ? { token, owner, repo, prNumber } : null;
}
|
|
7791
|
+
// Base URL that every GitHub REST request in this module is built on.
var API_BASE = "https://api.github.com";
|
|
7792
|
+
/**
 * Builds the HTTP headers for a GitHub REST API request.
 *
 * @param {string} token - Token sent as a Bearer credential.
 * @param {Record<string, string>} [extra] - Additional headers; they override
 *   the defaults on key collision.
 * @returns {Record<string, string>} The merged header object.
 */
function ghHeaders(token, extra) {
  const defaults = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };
  return { ...defaults, ...extra };
}
|
|
7800
|
+
/**
 * Searches a pull request's comments, page by page, for one whose body
 * contains the given marker.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx - GitHub context.
 * @param {string} marker - Text identifying the comment to find.
 * @returns {Promise<number|null>} The id of the first matching comment, or null
 *   when none matches or a listing request fails.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  for (let page = 1; ; page++) {
    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
    const res = await fetch(url, { headers: ghHeaders(ctx.token) });
    if (!res.ok) {
      return null;
    }
    const comments = await res.json();
    const match = comments.find((comment) => comment.body?.includes(marker));
    if (match) {
      return match.id;
    }
    // A page shorter than perPage (including an empty one) is the last page.
    if (comments.length < perPage) {
      return null;
    }
  }
}
|
|
7819
|
+
/**
 * Creates the PR comment, or updates it in place when a comment containing
 * the marker already exists. A failed request is reported with a warning.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx - GitHub context.
 * @param {string} body - Comment body to publish.
 * @param {string} marker - Text used to recognise a previously posted comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const existingId = await findExistingComment(ctx, marker);
  const issuesUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues`;

  // PATCH the existing comment when there is one, otherwise POST a new one.
  const request = existingId
    ? { url: `${issuesUrl}/comments/${existingId}`, method: "PATCH", verb: "update" }
    : { url: `${issuesUrl}/${ctx.prNumber}/comments`, method: "POST", verb: "create" };

  const res = await fetch(request.url, {
    method: request.method,
    headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
    body: JSON.stringify({ body })
  });
  if (res.ok) {
    return;
  }
  const text = await res.text();
  console.warn(`Failed to ${request.verb} PR comment: ${res.status} ${text}`);
}
|
|
7845
|
+
|
|
7846
|
+
// src/cli.ts
// Directory containing this module; other paths in the CLI are joined onto it.
var __dirname2 = dirname2(fileURLToPath(import.meta.url));
// Root command object that the subcommands are registered on.
// NOTE(review): splitting the fluent chain assumes each setter configures
// `program` in place (as Commander's do) — confirm against the imported Command.
var program = new Command();
program.name("duelist");
program.description("Pit LLM providers against each other on agent tasks.");
program.version(getVersion());
|
|
7850
|
+
// `init` command: writes arena.config.ts into the current working directory.
// Refuses to replace an existing file unless --force is given. The content comes
// from the template file next to this module when present, else DEFAULT_TEMPLATE.
program.command("init").description("Scaffold an arena.config.ts in the current directory").option("--force", "Overwrite existing config file").action((opts) => {
  const target = resolve("arena.config.ts");
  // Capture this before writing: once the file has been written it always
  // exists, so checking afterwards would report "Overwrote" for a fresh file.
  const alreadyExisted = existsSync(target);
  if (alreadyExisted && !opts.force) {
    console.error("arena.config.ts already exists. Use --force to overwrite.");
    process.exit(1);
  }
  const templatePath = join(__dirname2, "..", "templates", "arena.config.ts");
  const template = existsSync(templatePath) ? readFileSync3(templatePath, "utf-8") : DEFAULT_TEMPLATE;
  writeFileSync2(target, template);
  // Reaching this point with an existing file implies --force was given.
  console.log(alreadyExisted ? "Overwrote arena.config.ts" : "Created arena.config.ts");
  console.log("");
  console.log("Next steps:");
  console.log(" 1. export OPENAI_API_KEY=sk-...");
  console.log(" 2. npx duelist run");
});
|
|
6559
|
-
program.command("run").description("Run benchmarks defined in your arena config").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--reporter <type>", "Output format: console or
|
|
6560
|
-
|
|
7870
|
+
// `run` command: loads the arena config, runs it, and emits the results through
// the selected reporter. Exits with status 1 for an unknown reporter, when the
// run throws, or when there is at least one result and every result has an error.
program.command("run").description("Run benchmarks defined in your arena config").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--reporter <type>", "Output format: console, json, or html", "console").option("--output <path>", "Output file path (used with html reporter)", "duelist-report.html").option("-q, --quiet", "Suppress per-result progress (show only final report)").action(async (opts) => {
  const knownReporters = ["console", "json", "html"];
  if (!knownReporters.includes(opts.reporter)) {
    console.error(`Unknown reporter "${opts.reporter}". Use "console", "json", or "html".`);
    process.exit(1);
  }
  const typedArena = await loadArenaConfig(opts.config);
  try {
    // No per-result lines for JSON output (stdout carries the report) or with --quiet.
    const silent = opts.reporter === "json" || opts.quiet;
    const results = await typedArena.run({ onResult: silent ? void 0 : logResult });
    switch (opts.reporter) {
      case "json":
        console.log(jsonReporter(results));
        break;
      case "html": {
        const html = htmlReporter(results);
        const outPath = resolve(opts.output);
        // Create missing parent directories so a nested --output path works.
        mkdirSync2(dirname2(outPath), { recursive: true });
        writeFileSync2(outPath, html);
        console.log(`\nHTML report written to ${outPath}`);
        break;
      }
      default:
        console.log("");
        consoleReporter(results, { sparklines: typedArena.config?.sparklines });
    }
    const anySucceeded = results.some((entry) => !entry.error);
    if (results.length > 0 && !anySucceeded) process.exit(1);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`Benchmark failed: ${message}`);
    process.exit(1);
  }
});
|
|
7901
|
+
/**
 * Option accumulator for `--threshold <scorer=delta>`.
 *
 * Parses one `scorer=delta` argument and records it in `previous`. On
 * malformed input it prints an error and calls `process.exit(1)`.
 *
 * @param {string} value - Raw argument, e.g. `correctness=0.1`.
 * @param {Map<string, number>} previous - Map collected so far; mutated in place.
 * @returns {Map<string, number>} The same `previous` map.
 */
function collectThreshold(value, previous) {
  // Split on the first "=" only, so a stray second "=" ends up in the delta
  // text and fails numeric parsing instead of being silently dropped.
  const separator = value.indexOf("=");
  const scorer = separator === -1 ? "" : value.slice(0, separator);
  const deltaText = separator === -1 ? "" : value.slice(separator + 1);
  const delta = Number(deltaText);
  // Number("") and Number("  ") are 0, so blank deltas must be rejected explicitly.
  if (scorer === "" || deltaText.trim() === "" || Number.isNaN(delta)) {
    console.error(`Invalid threshold format: "${value}". Expected scorer=delta (e.g., correctness=0.1)`);
    process.exit(1);
  }
  previous.set(scorer, delta);
  return previous;
}
|
|
7910
|
+
// `ci` command: runs the arena, compares the results with a stored baseline,
// prints the report, optionally upserts a PR comment and optionally saves a new
// baseline. The process exits with 1 when the comparison report is marked
// failed (or the run throws), otherwise 0.
program.command("ci").description("Run benchmarks, compare against baseline, and enforce quality gates").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--baseline <path>", "Baseline JSON file", ".duelist/baseline.json").option("--budget <dollars>", "Max total cost in USD", parseFloat).option("--threshold <scorer=delta>", "Regression threshold (repeatable)", collectThreshold, /* @__PURE__ */ new Map()).option("--update-baseline", "Save results as new baseline after passing").option("--comment", "Post results as GitHub PR comment").option("-q, --quiet", "Suppress per-result progress").action(async (opts) => {
  // --budget is parsed with parseFloat, which yields NaN for non-numeric text.
  // Any numeric comparison against NaN is false, so a mistyped budget would
  // (presumably, compareResults is not shown here) never be reported as
  // exceeded. Reject it up front rather than silently disabling the gate.
  if (opts.budget !== void 0 && Number.isNaN(opts.budget)) {
    console.error("Invalid --budget value. Expected a number of USD (e.g., --budget 0.50)");
    process.exit(1);
  }
  const ciOpts = {
    configPath: opts.config,
    baselinePath: resolve(opts.baseline),
    budget: opts.budget,
    thresholds: opts.threshold,
    updateBaseline: opts.updateBaseline ?? false,
    comment: opts.comment ?? false,
    quiet: opts.quiet ?? false
  };
  const typedArena = await loadArenaConfig(ciOpts.configPath);
  console.log("Running benchmarks...");
  const onResult = ciOpts.quiet ? void 0 : logResult;
  let results;
  try {
    results = await typedArena.run({ onResult });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`Benchmark failed: ${message}`);
    process.exit(1);
  }
  // loadBaseline's result is treated as falsy when no baseline is available.
  const baseline = loadBaseline(ciOpts.baselinePath);
  const baselineStats = baseline ? computeStats(baseline.results) : null;
  if (baseline) {
    console.log(`Loaded baseline from ${ciOpts.baselinePath} (${baseline.timestamp})`);
  } else {
    console.log("No baseline found \u2014 this run establishes the first baseline.");
  }
  const currentStats = computeStats(results);
  const report = compareResults(baselineStats, currentStats, ciOpts.thresholds, ciOpts.budget, results);
  console.log("");
  consoleReporter(results, { sparklines: typedArena.config?.sparklines ?? true });
  if (report.flakyResults.length > 0) {
    console.log(`\u26A0 ${report.flakyResults.length} flaky result(s) detected (high variance)`);
  }
  if (report.cost.overBudget) {
    console.log(`\u{1F534} Budget exceeded: $${report.cost.totalUsd.toFixed(4)} > $${report.cost.budget.toFixed(2)}`);
  }
  for (const reason of report.failureReasons) {
    console.log(`\u{1F534} ${reason}`);
  }
  if (!report.failed) {
    console.log("\u{1F7E2} CI passed");
  }
  if (ciOpts.comment) {
    const ghCtx = detectGitHubContext();
    if (ghCtx) {
      const markdown = markdownReporter(report, results);
      // Commenting is best-effort: a failure here must not change the exit code.
      try {
        await upsertPrComment(ghCtx, markdown, COMMENT_MARKER);
        console.log("Posted results to PR comment.");
      } catch (err) {
        console.warn(`Failed to post PR comment: ${err instanceof Error ? err.message : err}`);
      }
    } else {
      console.warn("--comment: not in a GitHub Actions PR context, skipping.");
    }
  }
  // The baseline is only replaced by a run that passed the gates.
  if (ciOpts.updateBaseline && !report.failed) {
    saveBaseline(ciOpts.baselinePath, results);
    console.log(`Baseline saved to ${ciOpts.baselinePath}`);
  } else if (ciOpts.updateBaseline && report.failed) {
    console.log("Baseline not updated (CI failed).");
  }
  process.exit(report.failed ? 1 : 0);
});
|
|
7976
|
+
// The `run` and `ci` actions are async, so parse with parseAsync and handle a
// rejected action explicitly instead of leaving it as an unhandled rejection.
// NOTE(review): assumes `program` is a Commander Command, which provides
// parseAsync() — confirm against the import. Not awaited on purpose, so the
// rest of this module keeps evaluating exactly as it did with parse().
program.parseAsync().catch((err) => {
  console.error(err instanceof Error ? err.message : String(err));
  process.exit(1);
});
|
|
7977
|
+
async function loadArenaConfig(configOpt) {
|
|
7978
|
+
const configPath = resolve(configOpt);
|
|
6561
7979
|
if (!existsSync(configPath)) {
|
|
6562
7980
|
console.error(`Config not found: ${configPath}`);
|
|
6563
7981
|
console.error("");
|
|
6564
7982
|
console.error("Create one with: npx duelist init");
|
|
6565
7983
|
process.exit(1);
|
|
6566
7984
|
}
|
|
6567
|
-
if (!["console", "json"].includes(opts.reporter)) {
|
|
6568
|
-
console.error(`Unknown reporter "${opts.reporter}". Use "console" or "json".`);
|
|
6569
|
-
process.exit(1);
|
|
6570
|
-
}
|
|
6571
7985
|
let mod;
|
|
6572
7986
|
try {
|
|
6573
7987
|
if (configPath.endsWith(".ts")) {
|
|
@@ -6591,35 +8005,21 @@ program.command("run").description("Run benchmarks defined in your arena config"
|
|
|
6591
8005
|
console.error(`Loaded from: ${configPath}`);
|
|
6592
8006
|
process.exit(1);
|
|
6593
8007
|
}
|
|
6594
|
-
|
|
6595
|
-
|
|
6596
|
-
|
|
6597
|
-
|
|
6598
|
-
if (result.error) {
|
|
6599
|
-
console.log(` ${result.providerId} \xD7 ${result.taskName}: ERROR ${result.error}`);
|
|
6600
|
-
} else {
|
|
6601
|
-
const scores = result.scores.map((s5) => `${s5.name}=${formatScoreForLog(s5)}`).join(" ");
|
|
6602
|
-
console.log(` ${result.providerId} \xD7 ${result.taskName}: ${scores}`);
|
|
6603
|
-
}
|
|
6604
|
-
} : void 0;
|
|
6605
|
-
const results = await typedArena.run({ onResult });
|
|
6606
|
-
const { consoleReporter: consoleReporter2 } = await Promise.resolve().then(() => (init_console(), console_exports));
|
|
6607
|
-
const { jsonReporter: jsonReporter2 } = await Promise.resolve().then(() => (init_json(), json_exports));
|
|
6608
|
-
if (opts.reporter === "json") {
|
|
6609
|
-
console.log(jsonReporter2(results));
|
|
6610
|
-
} else {
|
|
6611
|
-
console.log("");
|
|
6612
|
-
consoleReporter2(results);
|
|
6613
|
-
}
|
|
6614
|
-
const allFailed = results.length > 0 && results.every((r3) => r3.error);
|
|
6615
|
-
if (allFailed) process.exit(1);
|
|
6616
|
-
} catch (err) {
|
|
6617
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
6618
|
-
console.error(`Benchmark failed: ${message}`);
|
|
6619
|
-
process.exit(1);
|
|
8008
|
+
const maybeConfig = arena.config;
|
|
8009
|
+
if (maybeConfig === void 0 || maybeConfig === null || typeof maybeConfig !== "object") {
|
|
8010
|
+
;
|
|
8011
|
+
arena.config = {};
|
|
6620
8012
|
}
|
|
6621
|
-
|
|
6622
|
-
|
|
8013
|
+
return arena;
|
|
8014
|
+
}
|
|
8015
|
+
/**
 * Print a one-line progress entry for a single result.
 *
 * A result with a truthy `error` is printed as `provider × task: ERROR <error>`;
 * otherwise each score is printed as `name=<formatScoreForLog(score)>`, joined
 * by single spaces.
 *
 * @param {object} result - Reads `providerId`, `taskName`, `error` and `scores`.
 * @returns {void}
 */
function logResult(result) {
  if (result.error) {
    console.log(` ${result.providerId} \xD7 ${result.taskName}: ERROR ${result.error}`);
    return;
  }
  const describeScore = (score) => `${score.name}=${formatScoreForLog(score)}`;
  const summary = result.scores.map(describeScore).join(" ");
  console.log(` ${result.providerId} \xD7 ${result.taskName}: ${summary}`);
}
|
|
6623
8023
|
async function importTypeScript(filePath) {
|
|
6624
8024
|
try {
|
|
6625
8025
|
await Promise.resolve().then(() => (init_api(), api_exports));
|
|
@@ -6642,17 +8042,16 @@ function formatScoreForLog(s5) {
|
|
|
6642
8042
|
return `${Math.round(details.ms)}ms`;
|
|
6643
8043
|
}
|
|
6644
8044
|
if (s5.name === "cost" && details?.estimatedUsd != null) {
|
|
6645
|
-
|
|
6646
|
-
|
|
6647
|
-
|
|
6648
|
-
|
|
6649
|
-
return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
|
|
8045
|
+
return formatCost(details.estimatedUsd);
|
|
8046
|
+
}
|
|
8047
|
+
if (s5.value < 0 && details?.reason) {
|
|
8048
|
+
return `SKIP (${details.reason})`;
|
|
6650
8049
|
}
|
|
6651
8050
|
return String(s5.value);
|
|
6652
8051
|
}
|
|
6653
8052
|
function getVersion() {
|
|
6654
8053
|
try {
|
|
6655
|
-
const pkg =
|
|
8054
|
+
const pkg = readFileSync3(join(__dirname2, "..", "package.json"), "utf-8");
|
|
6656
8055
|
return JSON.parse(pkg).version ?? "0.0.0";
|
|
6657
8056
|
} catch {
|
|
6658
8057
|
return "0.0.0";
|
|
@@ -6674,12 +8073,12 @@ import { z } from 'zod'
|
|
|
6674
8073
|
|
|
6675
8074
|
export default defineArena({
|
|
6676
8075
|
providers: [
|
|
6677
|
-
openai('gpt-
|
|
8076
|
+
openai('gpt-5-mini'),
|
|
6678
8077
|
// Add more providers to compare:
|
|
6679
|
-
// openai('gpt-
|
|
6680
|
-
// azureOpenai('gpt-
|
|
6681
|
-
// anthropic('claude-sonnet-4
|
|
6682
|
-
// gemini('gemini-
|
|
8078
|
+
// openai('gpt-5.2'),
|
|
8079
|
+
// azureOpenai('gpt-5-mini'),
|
|
8080
|
+
// anthropic('claude-sonnet-4.6'),
|
|
8081
|
+
// gemini('gemini-3-flash-preview'),
|
|
6683
8082
|
],
|
|
6684
8083
|
|
|
6685
8084
|
tasks: [
|