agent-duelist 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +150 -58
- package/dist/cli.js +870 -123
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +897 -227
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +67 -3
- package/dist/index.d.ts +67 -3
- package/dist/index.js +887 -224
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/templates/arena.config.ts +5 -5
package/dist/cli.js
CHANGED
|
@@ -49,14 +49,154 @@ function bold(s5) {
|
|
|
49
49
|
/**
 * Wrap a value between the module-level `dimCode` and `reset` escape codes.
 *
 * @param {*} s5 - Value to wrap; it is string-converted during concatenation.
 * @returns {string} `dimCode`, the value, then `reset`, joined in that order.
 */
function dim(s5) {
  // String.prototype.concat string-converts each argument the same way a
  // template literal does.
  return "".concat(dimCode, s5, reset);
}
|
|
52
|
-
function
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
52
|
+
/**
 * Remove ANSI SGR escape sequences (ESC `[` digits/semicolons `m`) from a string.
 *
 * @param {string} s5 - Text that may contain colour/style escape codes.
 * @returns {string} The text with every matching sequence removed.
 */
function stripAnsi(s5) {
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  return s5.replace(sgrSequence, "");
}
|
|
55
|
+
/**
 * Estimate how many terminal columns a string occupies.
 *
 * ANSI escape codes are stripped first. Each remaining code point counts as
 * two columns when it is at or above U+1F000 or inside U+2600..U+27BF, and as
 * one column otherwise.
 *
 * @param {string} s5 - Text, possibly containing ANSI escape codes.
 * @returns {number} Estimated column count.
 */
function displayWidth(s5) {
  let columns = 0;
  // for...of walks code points, so astral characters are visited once.
  for (const glyph of stripAnsi(s5)) {
    const cp = glyph.codePointAt(0) ?? 0;
    const isWide = cp >= 0x1f000 || (cp >= 0x2600 && cp <= 0x27bf);
    columns += isWide ? 2 : 1;
  }
  return columns;
}
|
|
66
|
+
/**
 * Pad a cell with spaces until its display width reaches `targetWidth`.
 *
 * @param {string} str - Cell content (may contain ANSI codes).
 * @param {number} targetWidth - Desired display width in columns.
 * @param {string} align - `"right"` pads on the left; anything else pads on the right.
 * @returns {string} The padded cell; content wider than the target is returned unpadded.
 */
function padCell(str, targetWidth, align) {
  const missing = targetWidth - displayWidth(str);
  const filler = " ".repeat(missing > 0 ? missing : 0);
  return align === "right" ? filler + str : str + filler;
}
|
|
72
|
+
/**
 * Build the two halves of a fixed-width spark bar.
 *
 * @param {number} ratio - Fill fraction; clamped into [0, 1].
 * @param {number} [width=8] - Total bar width in characters.
 * @returns {{fill: string, track: string}} `fill` is the filled portion
 *   (U+2593) and `track` the remainder (U+2591); together they span `width`.
 */
function sparkBar(ratio, width = 8) {
  const clamped = Math.max(0, Math.min(1, ratio));
  // A NaN/undefined ratio propagates NaN through the clamp; without this guard
  // both repeat() calls would receive NaN and the bar would collapse to zero
  // width. Treat "no usable ratio" as an empty bar instead.
  const fillLen = Number.isNaN(clamped) ? 0 : Math.round(clamped * width);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(width - fillLen);
  return { fill, track };
}
|
|
79
|
+
/**
 * Render one horizontal border line of a box-drawn table, wrapped in `dim`.
 *
 * @param {number[]} widths - Content width of each column (each gets +2 padding).
 * @param {string} position - `"top"` draws column junctions pointing down,
 *   `"bottom"` and `"merge"` draw an unbroken rule spanning all columns, and
 *   any other value draws a separator with crossing junctions.
 * @returns {string} The dimmed border line.
 */
function drawTableLine(widths, position) {
  const horizontal = "\u2500";

  // Width between the outer corners: padded columns plus one separator
  // character between each pair of neighbours.
  let innerWidth = 0;
  widths.forEach((w) => {
    innerWidth = innerWidth + w + 2;
  });
  innerWidth = innerWidth + widths.length - 1;

  switch (position) {
    case "bottom":
      return dim(`\u2514${horizontal.repeat(innerWidth)}\u2518`);
    case "merge":
      return dim(`\u251C${horizontal.repeat(innerWidth)}\u2524`);
    default:
      break;
  }

  const runs = widths.map((w) => horizontal.repeat(w + 2));
  const [left, junction, right] =
    position === "top"
      ? ["\u250C", "\u252C", "\u2510"]
      : ["\u251C", "\u253C", "\u2524"];
  return dim(`${left}${runs.join(junction)}${right}`);
}
|
|
93
|
+
/**
 * Render one table row: each cell padded to its column width, surrounded by
 * a single space on both sides, with dimmed vertical bars between and around.
 *
 * @param {string[]} cells - Cell contents, one per column.
 * @param {number[]} widths - Column widths, index-aligned with `cells`.
 * @param {string[]} aligns - Column alignments, index-aligned with `cells`.
 * @returns {string} The rendered row.
 */
function drawTableRow(cells, widths, aligns) {
  const bar = dim("\u2502");
  const rendered = cells.map((cell, col) => {
    const padded = padCell(cell, widths[col], aligns[col]);
    return " " + padded + " ";
  });
  return bar + rendered.join(bar) + bar;
}
|
|
99
|
+
/**
 * Render a row whose single cell spans every column of the table.
 *
 * @param {string} content - Text to show (may contain ANSI codes).
 * @param {number[]} widths - Column widths of the surrounding table.
 * @returns {string} Dimmed bar, one space, the content, right padding, dimmed bar.
 */
function drawSpanRow(content, widths) {
  const edge = dim("\u2502");

  let innerWidth = 0;
  widths.forEach((w) => {
    innerWidth = innerWidth + w + 2;
  });
  innerWidth = innerWidth + widths.length - 1;

  // The extra -1 accounts for the single leading space before the content.
  const slack = innerWidth - displayWidth(content) - 1;
  const filler = " ".repeat(Math.max(0, slack));
  return edge + " " + content + filler + edge;
}
|
|
105
|
+
/**
 * Collect per-column values and best/worst extremes across providers.
 *
 * Keys produced: `"latency"` when requested; `"cost"` and `"tokens"` when
 * `"cost"` is requested; and one key per remaining scorer name. For latency,
 * cost and tokens the smallest value is "best"; for every other scorer the
 * largest value is "best". Providers flagged `allErrors` contribute
 * `undefined` values and are ignored when computing extremes.
 *
 * @param {Array<object>} providerData - Entries exposing `providerId`,
 *   `allErrors`, `latencyMs`, `avgDetails.costUsd`, `avgDetails.totalTokens`
 *   and `avgScores[name]`.
 * @param {string[]} scorerNames - Scorer names present in the results.
 * @returns {Map<string, {values: Map, best: (number|undefined), worst: (number|undefined)}>}
 */
function computeColumnStats(providerData, scorerNames) {
  const stats = new Map();
  const healthy = providerData.filter((entry) => !entry.allErrors);

  // Registers one column: `read` extracts the metric from a provider entry.
  const record = (key, read, lowerIsBetter) => {
    const values = new Map();
    for (const entry of providerData) {
      values.set(entry.providerId, entry.allErrors ? undefined : read(entry));
    }
    const observed = healthy.map((entry) => read(entry)).filter((v) => v !== undefined);
    const hasData = observed.length > 0;
    const smallest = hasData ? Math.min(...observed) : undefined;
    const largest = hasData ? Math.max(...observed) : undefined;
    stats.set(key, {
      values,
      best: lowerIsBetter ? smallest : largest,
      worst: lowerIsBetter ? largest : smallest,
    });
  };

  if (scorerNames.includes("latency")) {
    record("latency", (entry) => entry.latencyMs, true);
  }
  if (scorerNames.includes("cost")) {
    record("cost", (entry) => entry.avgDetails.costUsd, true);
    record("tokens", (entry) => entry.avgDetails.totalTokens, true);
  }
  for (const name of scorerNames) {
    if (name === "latency" || name === "cost") continue;
    record(name, (entry) => entry.avgScores[name], false);
  }
  return stats;
}
|
|
155
|
+
/**
 * Colour a cell according to how its value ranks within its column.
 *
 * @param {string} text - Already-formatted cell text.
 * @param {number|undefined} value - Raw value behind `text`.
 * @param {{best: (number|undefined), worst: (number|undefined)}} colStats - Column extremes.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} A dimmed em dash when `value` is undefined; plain `text`
 *   when there is nothing to rank (fewer than two providers, missing extremes,
 *   or best equal to worst); otherwise bright-green bold for the best value,
 *   red for the worst, and yellow for everything in between.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === undefined) return dim("\u2014");

  // colStats is only inspected once the provider count allows ranking.
  const unranked =
    providerCount < 2 ||
    colStats.best === undefined ||
    colStats.worst === undefined ||
    colStats.best === colStats.worst;
  if (unranked) return text;

  let prefix;
  if (value === colStats.best) {
    prefix = `${brightGreen}${boldCode}`;
  } else if (value === colStats.worst) {
    prefix = `${red}`;
  } else {
    prefix = `${yellow}`;
  }
  return `${prefix}${text}${reset}`;
}
|
|
164
|
+
/**
 * Award medals to providers based on how many columns each one wins.
 *
 * A provider "wins" a column when its value equals that column's `best`.
 * Providers are ordered by win count (descending, ties broken by id via
 * `localeCompare`); tied providers share a rank, and the rank after a tie is
 * the position in the ordering. Ranks 0-2 map to gold/silver/bronze; lower
 * ranks get an empty string.
 *
 * @param {Map<string, {values: Map, best: (number|undefined)}>} columnStats - Per-column stats.
 * @param {string[]} providerIds - Providers to rank.
 * @returns {Map<string, string>} Provider id to medal emoji (or `""`). Every
 *   provider gets `""` when there are fewer than two providers or no wins at all.
 */
function computeMedals(columnStats, providerIds) {
  const medals = new Map();
  const blankForEveryone = () => {
    for (const id of providerIds) medals.set(id, "");
    return medals;
  };
  if (providerIds.length < 2) return blankForEveryone();

  const tally = new Map();
  for (const id of providerIds) tally.set(id, 0);

  let totalWins = 0;
  for (const entry of columnStats) {
    const stat = entry[1];
    if (stat.best === undefined) continue;
    for (const [providerId, value] of stat.values) {
      if (value === undefined || value !== stat.best) continue;
      tally.set(providerId, (tally.get(providerId) ?? 0) + 1);
      totalWins += 1;
    }
  }
  if (totalWins === 0) return blankForEveryone();

  const ranking = [...tally.entries()].sort((left, right) => {
    const byWins = right[1] - left[1];
    return byWins !== 0 ? byWins : left[0].localeCompare(right[0]);
  });

  const podium = ["\u{1F947}", "\u{1F948}", "\u{1F949}"];
  let rank = 0;
  ranking.forEach(([id, count], position) => {
    // Only drop to a lower rank when the win count actually decreases.
    if (position > 0 && count < ranking[position - 1][1]) {
      rank = position;
    }
    medals.set(id, rank < podium.length ? podium[rank] : "");
  });
  return medals;
}
|
|
198
|
+
function consoleReporter(results, options) {
|
|
199
|
+
const showSparklines = options?.sparklines ?? true;
|
|
60
200
|
if (results.length === 0) {
|
|
61
201
|
console.log("\nNo results to display.\n");
|
|
62
202
|
return;
|
|
@@ -66,78 +206,155 @@ function consoleReporter(results) {
|
|
|
66
206
|
const scorerNames = [...new Set(results.flatMap((r3) => r3.scores.map((s5) => s5.name)))];
|
|
67
207
|
const hasCost = scorerNames.includes("cost");
|
|
68
208
|
const hasErrors = results.some((r3) => r3.error);
|
|
209
|
+
const multi = providers.length >= 2;
|
|
69
210
|
const runsPerCell = Math.max(...results.map((r3) => r3.run));
|
|
70
|
-
const runLabel = runsPerCell > 1 ? `
|
|
211
|
+
const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
|
|
71
212
|
console.log("");
|
|
72
|
-
console.log(` ${
|
|
73
|
-
console.log(` ${dim("\
|
|
213
|
+
console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
|
|
214
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
74
215
|
console.log("");
|
|
75
216
|
for (const task of tasks) {
|
|
76
217
|
console.log(` ${bold(`Task: ${task}`)}`);
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
else if (name === "cost") {
|
|
81
|
-
cols.push({ label: "Cost", width: 12, align: "right" });
|
|
82
|
-
cols.push({ label: "Tokens", width: 9, align: "right" });
|
|
83
|
-
} else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
|
|
84
|
-
else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
|
|
85
|
-
else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
|
|
86
|
-
else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
|
|
87
|
-
else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
|
|
88
|
-
else cols.push({ label: name, width: 10, align: "right" });
|
|
89
|
-
}
|
|
90
|
-
if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
|
|
91
|
-
const totalWidth = cols.reduce((sum, c3) => sum + c3.width + 2, 0);
|
|
92
|
-
console.log(` ${dim(cols.map((c3) => pad(c3.label, c3.width + 2, c3.align)).join(""))}`);
|
|
93
|
-
console.log(` ${dim("\u2500".repeat(totalWidth))}`);
|
|
94
|
-
for (const provider of providers) {
|
|
95
|
-
const taskResults = results.filter(
|
|
96
|
-
(r3) => r3.taskName === task && r3.providerId === provider
|
|
97
|
-
);
|
|
218
|
+
console.log("");
|
|
219
|
+
const providerData = providers.map((providerId) => {
|
|
220
|
+
const taskResults = results.filter((r3) => r3.taskName === task && r3.providerId === providerId);
|
|
98
221
|
const errorResults2 = taskResults.filter((r3) => r3.error);
|
|
99
222
|
const successResults = taskResults.filter((r3) => !r3.error);
|
|
100
|
-
if (successResults.length === 0
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
console.log(` ${cells2.join("")}`);
|
|
110
|
-
continue;
|
|
223
|
+
if (successResults.length === 0) {
|
|
224
|
+
return {
|
|
225
|
+
providerId,
|
|
226
|
+
avgScores: {},
|
|
227
|
+
avgDetails: { costUsd: void 0, totalTokens: void 0 },
|
|
228
|
+
latencyMs: void 0,
|
|
229
|
+
allErrors: errorResults2.length > 0,
|
|
230
|
+
errorCount: errorResults2.length
|
|
231
|
+
};
|
|
111
232
|
}
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
233
|
+
return {
|
|
234
|
+
providerId,
|
|
235
|
+
avgScores: averageScores(successResults),
|
|
236
|
+
avgDetails: averageDetails(successResults),
|
|
237
|
+
latencyMs: average(successResults.map((r3) => r3.raw.latencyMs)),
|
|
238
|
+
allErrors: false,
|
|
239
|
+
errorCount: errorResults2.length
|
|
240
|
+
};
|
|
241
|
+
});
|
|
242
|
+
const columnStats = computeColumnStats(providerData, scorerNames);
|
|
243
|
+
const medals = computeMedals(columnStats, providers);
|
|
244
|
+
const maxProviderLen = Math.max(...providers.map((id) => id.length));
|
|
245
|
+
const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
|
|
246
|
+
const cols = [
|
|
247
|
+
{ label: "Provider", width: providerWidth, align: "left" }
|
|
248
|
+
];
|
|
249
|
+
for (const name of scorerNames) {
|
|
250
|
+
if (name === "latency") {
|
|
251
|
+
cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
|
|
252
|
+
} else if (name === "cost") {
|
|
253
|
+
cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
|
|
254
|
+
cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
|
|
255
|
+
} else {
|
|
256
|
+
const label = name === "correctness" ? "Match" : name === "schema-correctness" ? "Schema" : name === "fuzzy-similarity" ? "Fuzzy" : name === "llm-judge-correctness" ? "Judge" : name === "tool-usage" ? "Tool" : name;
|
|
257
|
+
cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
if (hasErrors) {
|
|
261
|
+
cols.push({ label: "Status", width: 8, align: "left" });
|
|
262
|
+
}
|
|
263
|
+
const widths = cols.map((c3) => c3.width);
|
|
264
|
+
const aligns = cols.map((c3) => c3.align);
|
|
265
|
+
console.log(` ${drawTableLine(widths, "top")}`);
|
|
266
|
+
const headerCells = cols.map((c3) => bold(c3.label));
|
|
267
|
+
console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
|
|
268
|
+
console.log(` ${drawTableLine(widths, "header")}`);
|
|
269
|
+
for (const pd of providerData) {
|
|
270
|
+
const medal = medals.get(pd.providerId) ?? "";
|
|
271
|
+
const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
|
|
272
|
+
const cells = [providerCell];
|
|
273
|
+
if (pd.allErrors) {
|
|
274
|
+
for (const col of cols.slice(1)) {
|
|
275
|
+
if (col.label === "Status") {
|
|
276
|
+
cells.push(`${red}FAIL${reset}`);
|
|
277
|
+
} else {
|
|
278
|
+
cells.push(dim("\u2014"));
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
} else {
|
|
282
|
+
for (const col of cols.slice(1)) {
|
|
283
|
+
if (col.label === "Status") {
|
|
284
|
+
cells.push(
|
|
285
|
+
pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
|
|
286
|
+
);
|
|
287
|
+
continue;
|
|
288
|
+
}
|
|
289
|
+
const statsKey = col.statsKey;
|
|
290
|
+
const colStats = columnStats.get(statsKey);
|
|
291
|
+
if (statsKey === "latency") {
|
|
292
|
+
const ms = pd.latencyMs;
|
|
293
|
+
if (ms === void 0) {
|
|
294
|
+
cells.push(dim("\u2014"));
|
|
295
|
+
} else {
|
|
296
|
+
const text = `${Math.round(ms)}ms`;
|
|
297
|
+
cells.push(colStats ? colorByRank(text, ms, colStats, providers.length) : text);
|
|
298
|
+
}
|
|
299
|
+
} else if (statsKey === "cost") {
|
|
300
|
+
const cost = pd.avgDetails.costUsd;
|
|
301
|
+
if (cost === void 0) {
|
|
302
|
+
cells.push(dim("\u2014"));
|
|
303
|
+
} else {
|
|
304
|
+
const text = formatCost(cost);
|
|
305
|
+
cells.push(colStats ? colorByRank(text, cost, colStats, providers.length) : text);
|
|
306
|
+
}
|
|
307
|
+
} else if (statsKey === "tokens") {
|
|
308
|
+
const tokens = pd.avgDetails.totalTokens;
|
|
309
|
+
if (tokens === void 0) {
|
|
310
|
+
cells.push(dim("\u2014"));
|
|
311
|
+
} else {
|
|
312
|
+
const text = `${tokens}`;
|
|
313
|
+
cells.push(colStats ? colorByRank(text, tokens, colStats, providers.length) : text);
|
|
314
|
+
}
|
|
315
|
+
} else {
|
|
316
|
+
const val = pd.avgScores[statsKey];
|
|
317
|
+
if (val === void 0) {
|
|
318
|
+
cells.push(dim("\u2014"));
|
|
319
|
+
} else {
|
|
320
|
+
const pctStr = `${Math.round(val * 100)}%`.padStart(4);
|
|
321
|
+
let coloredPct;
|
|
322
|
+
if (multi && colStats) {
|
|
323
|
+
coloredPct = colorByRank(pctStr, val, colStats, providers.length);
|
|
324
|
+
} else {
|
|
325
|
+
if (val >= 0.8) coloredPct = `${green}${pctStr}${reset}`;
|
|
326
|
+
else if (val >= 0.5) coloredPct = `${yellow}${pctStr}${reset}`;
|
|
327
|
+
else coloredPct = `${red}${pctStr}${reset}`;
|
|
328
|
+
}
|
|
329
|
+
if (showSparklines) {
|
|
330
|
+
const { fill, track } = sparkBar(val);
|
|
331
|
+
const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
|
|
332
|
+
cells.push(`${coloredPct} ${barColor}${fill}${reset}${dim(track)}`);
|
|
333
|
+
} else {
|
|
334
|
+
cells.push(coloredPct);
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
}
|
|
126
338
|
}
|
|
127
339
|
}
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
340
|
+
console.log(` ${drawTableRow(cells, widths, aligns)}`);
|
|
341
|
+
}
|
|
342
|
+
if (multi && providerData.some((p5) => !p5.allErrors)) {
|
|
343
|
+
const winnerId = [...medals.entries()].find(([, m8]) => m8 === "\u{1F947}")?.[0];
|
|
344
|
+
if (winnerId) {
|
|
345
|
+
console.log(` ${drawTableLine(widths, "merge")}`);
|
|
346
|
+
const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
|
|
347
|
+
console.log(` ${drawSpanRow(winnerText, widths)}`);
|
|
131
348
|
}
|
|
132
|
-
console.log(` ${cells.join("")}`);
|
|
133
349
|
}
|
|
350
|
+
console.log(` ${drawTableLine(widths, "bottom")}`);
|
|
134
351
|
console.log("");
|
|
135
352
|
}
|
|
136
353
|
printSummary(results, providers);
|
|
137
354
|
const errorResults = results.filter((r3) => r3.error);
|
|
138
355
|
if (errorResults.length > 0) {
|
|
139
356
|
console.log(` ${bold("Errors")}`);
|
|
140
|
-
console.log(` ${dim("\
|
|
357
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
141
358
|
const seen = /* @__PURE__ */ new Set();
|
|
142
359
|
for (const r3 of errorResults) {
|
|
143
360
|
const key = `${r3.providerId}::${r3.error}`;
|
|
@@ -145,7 +362,7 @@ function consoleReporter(results) {
|
|
|
145
362
|
seen.add(key);
|
|
146
363
|
const count = errorResults.filter((e5) => e5.providerId === r3.providerId && e5.error === r3.error).length;
|
|
147
364
|
const suffix = count > 1 ? ` (\xD7${count})` : "";
|
|
148
|
-
console.log(` ${red}\
|
|
365
|
+
console.log(` ${red}\u2716${reset} ${r3.providerId}: ${r3.error}${suffix}`);
|
|
149
366
|
const hint = apiKeyHint(r3.providerId, r3.error ?? "");
|
|
150
367
|
if (hint) console.log(` ${dim(hint)}`);
|
|
151
368
|
}
|
|
@@ -159,15 +376,20 @@ function consoleReporter(results) {
|
|
|
159
376
|
function printSummary(results, providers) {
|
|
160
377
|
const successResults = results.filter((r3) => !r3.error);
|
|
161
378
|
if (successResults.length === 0) return;
|
|
162
|
-
console.log(` ${dim("\u2500".repeat(70))}`);
|
|
163
379
|
console.log(` ${bold("Summary")}`);
|
|
380
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
164
381
|
console.log("");
|
|
165
382
|
const single = providers.length === 1;
|
|
166
383
|
const correctnessKey = successResults.some((r3) => r3.scores.some((s5) => s5.name === "llm-judge-correctness" && s5.value >= 0)) ? "llm-judge-correctness" : "correctness";
|
|
167
384
|
const byCorrectness = rankProviders(successResults, providers, correctnessKey);
|
|
168
385
|
if (byCorrectness) {
|
|
169
|
-
const
|
|
170
|
-
|
|
386
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
387
|
+
const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
388
|
+
if (single) {
|
|
389
|
+
console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
|
|
390
|
+
} else {
|
|
391
|
+
console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
|
|
392
|
+
}
|
|
171
393
|
}
|
|
172
394
|
const byLatency = providers.map((id) => {
|
|
173
395
|
const runs = successResults.filter((r3) => r3.providerId === id);
|
|
@@ -175,8 +397,13 @@ function printSummary(results, providers) {
|
|
|
175
397
|
return { id, avg: avg ?? Infinity };
|
|
176
398
|
}).sort((a7, b3) => a7.avg - b3.avg)[0];
|
|
177
399
|
if (byLatency && byLatency.avg !== Infinity) {
|
|
178
|
-
const
|
|
179
|
-
|
|
400
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
401
|
+
const msStr = `${Math.round(byLatency.avg)}ms`;
|
|
402
|
+
if (single) {
|
|
403
|
+
console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
404
|
+
} else {
|
|
405
|
+
console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
406
|
+
}
|
|
180
407
|
}
|
|
181
408
|
const byCost = providers.map((id) => {
|
|
182
409
|
const runs = successResults.filter((r3) => r3.providerId === id);
|
|
@@ -188,8 +415,32 @@ function printSummary(results, providers) {
|
|
|
188
415
|
return { id, avg };
|
|
189
416
|
}).filter((p5) => p5.avg !== void 0).sort((a7, b3) => a7.avg - b3.avg)[0];
|
|
190
417
|
if (byCost?.avg !== void 0) {
|
|
191
|
-
const
|
|
192
|
-
|
|
418
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
419
|
+
const costStr = formatCost(byCost.avg);
|
|
420
|
+
if (single) {
|
|
421
|
+
console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
422
|
+
} else {
|
|
423
|
+
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
if (!single) {
|
|
427
|
+
const wins = /* @__PURE__ */ new Map();
|
|
428
|
+
for (const id of providers) wins.set(id, 0);
|
|
429
|
+
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
430
|
+
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
431
|
+
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
432
|
+
const maxWins = Math.max(...wins.values());
|
|
433
|
+
if (maxWins > 0) {
|
|
434
|
+
const topProviders = [...wins.entries()].filter(([, w4]) => w4 === maxWins);
|
|
435
|
+
console.log("");
|
|
436
|
+
if (topProviders.length === 1) {
|
|
437
|
+
const [winnerId, winCount] = topProviders[0];
|
|
438
|
+
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
|
|
439
|
+
} else {
|
|
440
|
+
const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
|
|
441
|
+
console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
|
|
442
|
+
}
|
|
443
|
+
}
|
|
193
444
|
}
|
|
194
445
|
console.log("");
|
|
195
446
|
}
|
|
@@ -251,14 +502,6 @@ function formatCost(usd) {
|
|
|
251
502
|
const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
|
|
252
503
|
return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
|
|
253
504
|
}
|
|
254
|
-
function pad(str, width, align) {
|
|
255
|
-
if (align === "right") return str.padStart(width);
|
|
256
|
-
return str.padEnd(width);
|
|
257
|
-
}
|
|
258
|
-
function colorLen(str) {
|
|
259
|
-
const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
|
|
260
|
-
return str.length - stripped.length;
|
|
261
|
-
}
|
|
262
505
|
function apiKeyHint(providerId, error) {
|
|
263
506
|
const lower = error.toLowerCase();
|
|
264
507
|
const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
|
|
@@ -328,7 +571,7 @@ function providerLabel(providerId) {
|
|
|
328
571
|
return `(${prefix})`;
|
|
329
572
|
}
|
|
330
573
|
}
|
|
331
|
-
var reset, boldCode, dimCode, green, red, yellow, cyan;
|
|
574
|
+
var reset, boldCode, dimCode, green, red, yellow, cyan, brightGreen, brightWhite;
|
|
332
575
|
var init_console = __esm({
|
|
333
576
|
"src/reporter/console.ts"() {
|
|
334
577
|
"use strict";
|
|
@@ -339,6 +582,8 @@ var init_console = __esm({
|
|
|
339
582
|
red = "\x1B[31m";
|
|
340
583
|
yellow = "\x1B[33m";
|
|
341
584
|
cyan = "\x1B[36m";
|
|
585
|
+
brightGreen = "\x1B[92m";
|
|
586
|
+
brightWhite = "\x1B[97m";
|
|
342
587
|
}
|
|
343
588
|
});
|
|
344
589
|
|
|
@@ -375,6 +620,425 @@ var init_json = __esm({
|
|
|
375
620
|
}
|
|
376
621
|
});
|
|
377
622
|
|
|
623
|
+
// src/ci.ts
|
|
624
|
+
// Export namespace for src/ci.ts: each entry is a zero-argument getter that
// returns the current binding of the named function.
var ci_exports = {};
__export(ci_exports, {
  compareResults: () => compareResults,
  computeCostSummary: () => computeCostSummary,
  computeScorerStats: () => computeScorerStats,
  computeStats: () => computeStats,
  loadBaseline: () => loadBaseline,
  saveBaseline: () => saveBaseline
});
|
|
633
|
+
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
634
|
+
import { dirname } from "path";
|
|
635
|
+
/**
 * Look up a critical value for `df` degrees of freedom from `T_CRITICAL_95`.
 *
 * Exact table entries are returned directly; values between two tabulated
 * degrees of freedom are linearly interpolated; 1.96 is returned for
 * `df <= 0`, for `df` beyond the largest tabulated key, and when no
 * bracketing pair is found.
 *
 * @param {number} df - Degrees of freedom.
 * @returns {number} The critical value.
 */
function tCritical(df) {
  const FALLBACK = 1.96;
  if (df <= 0) return FALLBACK;

  const exact = T_CRITICAL_95[df];
  if (exact !== undefined) return exact;

  const knots = Object.keys(T_CRITICAL_95)
    .map((k) => Number(k))
    .sort((x, y) => x - y);
  if (df > knots[knots.length - 1]) return FALLBACK;

  for (let hiIdx = 1; hiIdx < knots.length; hiIdx++) {
    const low = knots[hiIdx - 1];
    const high = knots[hiIdx];
    if (!(df > low && df < high)) continue;
    const ratio = (df - low) / (high - low);
    return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
  }
  return FALLBACK;
}
|
|
649
|
+
/**
 * Summarise a list of numeric samples.
 *
 * @param {number[]} samples - Observed values.
 * @returns {{mean: number, stddev: number, cv: number, n: number, ci95Lower: number, ci95Upper: number}}
 *   All zeros for an empty list. For a single sample the interval collapses
 *   onto the mean. Otherwise `stddev` uses the n-1 denominator, `cv` is
 *   stddev / |mean| (0 when the mean is 0), and the interval is
 *   mean ± tCritical(n - 1) * stddev / sqrt(n).
 */
function computeScorerStats(samples) {
  const count = samples.length;
  if (count === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  const sumOf = (project) => samples.reduce((acc, x) => acc + project(x), 0);

  const mean = sumOf((x) => x) / count;
  if (count === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  const stddev = Math.sqrt(sumOf((x) => (x - mean) ** 2) / (count - 1));
  const halfWidth = tCritical(count - 1) * (stddev / Math.sqrt(count));

  return {
    mean,
    stddev,
    cv: mean !== 0 ? stddev / Math.abs(mean) : 0,
    n: count,
    ci95Lower: mean - halfWidth,
    ci95Upper: mean + halfWidth,
  };
}
|
|
672
|
+
/**
 * Build the composite key used to group samples:
 * provider, task and scorer joined with `::`.
 *
 * @param {string} providerId
 * @param {string} taskName
 * @param {string} scorerName
 * @returns {string} `providerId::taskName::scorerName`.
 */
function groupKey(providerId, taskName, scorerName) {
  const separator = "::";
  return "".concat(providerId, separator, taskName, separator, scorerName);
}
|
|
675
|
+
/**
 * Group score values by provider/task/scorer and summarise each group.
 *
 * Results with a truthy `error` are ignored, as are individual scores whose
 * value is negative.
 *
 * @param {Array<object>} results - Entries with `providerId`, `taskName`,
 *   `error` and a `scores` array of `{name, value}`.
 * @returns {Map<string, object>} Group key to the `computeScorerStats` summary.
 */
function computeStats(results) {
  const buckets = new Map();
  for (const result of results) {
    if (result.error) continue;
    for (const score of result.scores) {
      if (score.value < 0) continue;
      const key = groupKey(result.providerId, result.taskName, score.name);
      const bucket = buckets.get(key);
      if (bucket) {
        bucket.push(score.value);
      } else {
        buckets.set(key, [score.value]);
      }
    }
  }

  const stats = new Map();
  buckets.forEach((samples, key) => {
    stats.set(key, computeScorerStats(samples));
  });
  return stats;
}
|
|
692
|
+
/**
 * Total the estimated cost of all successful results, overall and per provider.
 *
 * A result contributes when it has no `error`, has a `"cost"` score with a
 * non-negative value, and that score's `details.estimatedUsd` is a finite
 * number greater than zero.
 *
 * @param {Array<object>} results - Entries with `providerId`, `error` and `scores`.
 * @param {number} [budget] - Optional spending limit in USD.
 * @returns {{totalUsd: number, perProvider: Map<string, number>, budget: *, overBudget: boolean}}
 *   `budget` is passed through unchanged; `overBudget` is true only when a
 *   budget was supplied and the total exceeds it.
 */
function computeCostSummary(results, budget) {
  let totalUsd = 0;
  const perProvider = new Map();
  for (const r3 of results) {
    if (r3.error) continue;
    const costScore = r3.scores.find((s5) => s5.name === "cost");
    if (!costScore || costScore.value < 0) continue;
    const usd = costScore.details?.estimatedUsd ?? 0;
    // Skip anything that is not a usable positive amount. NaN would poison the
    // running total and a string would turn `+=` into concatenation.
    if (typeof usd !== "number" || !Number.isFinite(usd) || usd <= 0) continue;
    totalUsd += usd;
    perProvider.set(r3.providerId, (perProvider.get(r3.providerId) ?? 0) + usd);
  }
  // A null budget means "no budget", the same as undefined; comparing against
  // null would coerce it to 0 and flag any spend as over budget.
  const hasBudget = budget !== undefined && budget !== null;
  return {
    totalUsd,
    perProvider,
    budget,
    overBudget: hasBudget && totalUsd > budget
  };
}
|
|
712
|
+
/**
 * Split a `provider::task::scorer` group key into its three parts.
 *
 * The provider is taken up to the first `::` and the scorer after the last
 * `::`, so a task name that itself contains `::` stays intact. Keys with fewer
 * than two separators fall back to a plain split.
 * NOTE(review): assumes provider ids and scorer names never contain `::` —
 * confirm against how keys are produced.
 */
function splitGroupKey(key) {
  const separator = "::";
  const first = key.indexOf(separator);
  const last = key.lastIndexOf(separator);
  if (first === -1 || first === last) return key.split(separator);
  return [
    key.slice(0, first),
    key.slice(first + separator.length, last),
    key.slice(last + separator.length)
  ];
}

/**
 * Compare current run statistics against a baseline and build a CI report.
 *
 * @param {Map<string, object>|null|undefined} baselineStats - Baseline stats by group key.
 * @param {Map<string, object>} currentStats - Current stats by group key.
 * @param {Map<string, number>} thresholds - Per-scorer regression thresholds;
 *   scorers without an entry are never marked regressed or improved.
 * @param {number} [budget] - Optional cost budget in USD.
 * @param {Array<object>} [currentResults] - Raw results used for the cost summary.
 * @returns {{comparisons: Array<object>, cost: object, failed: boolean, flakyResults: Array<object>, failureReasons: string[]}}
 *   `failed` is true when at least one regression or a budget overrun was recorded.
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  const comparisons = [];
  const failureReasons = [];
  for (const [key, current] of currentStats) {
    // A plain split("::") would cut a task name containing "::" in half and
    // look up the threshold under the wrong scorer name.
    const [providerId, taskName, scorerName] = splitGroupKey(key);
    const baseline = baselineStats?.get(key) ?? null;
    let delta = null;
    let regressed = false;
    let improved = false;
    if (baseline) {
      delta = current.mean - baseline.mean;
      const threshold = thresholds.get(scorerName);
      if (threshold !== void 0) {
        const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
        regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
        improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
      }
    }
    // Variance is only meaningful with more than one run.
    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
    comparisons.push({
      providerId,
      taskName,
      scorerName,
      baseline,
      current,
      delta,
      regressed,
      improved,
      flaky
    });
  }
  const cost = computeCostSummary(currentResults ?? [], budget);
  for (const r3 of comparisons) {
    if (!r3.regressed) continue;
    failureReasons.push(
      `${r3.providerId} \xD7 ${r3.taskName}: ${r3.scorerName} regressed by ${formatDelta(r3.delta)}`
    );
  }
  if (cost.overBudget) {
    // Number() keeps the message from throwing if the budget is not a number
    // object with toFixed (e.g. null).
    failureReasons.push(
      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${Number(cost.budget).toFixed(2)}`
    );
  }
  const flakyResults = comparisons.filter((c3) => c3.flaky);
  const failed = failureReasons.length > 0;
  return { comparisons, cost, failed, flakyResults, failureReasons };
}
|
|
761
|
+
/**
 * Decide whether `current` is a regression relative to `baseline`.
 *
 * With one sample on each side the raw difference of means is compared with
 * the threshold. Otherwise the 95% interval bounds are used:
 * - lower-is-better: current's lower bound exceeds baseline's upper bound by
 *   more than the threshold;
 * - higher-is-better: baseline's upper bound exceeds current's lower bound by
 *   more than the threshold AND the current mean is lower.
 *
 * NOTE(review): the higher-is-better branch measures the widest gap between
 * the intervals, whereas the lower-is-better branch requires the intervals to
 * be separated. The asymmetry may be intentional — confirm.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold - Minimum change that counts.
 * @param {boolean} lowerIsBetter - Whether smaller values are better for this scorer.
 * @returns {boolean} True when a regression is detected.
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  const singleRuns = baseline.n === 1 && current.n === 1;
  if (singleRuns) {
    const shift = current.mean - baseline.mean;
    return lowerIsBetter ? shift > threshold : shift < -threshold;
  }
  if (!lowerIsBetter) {
    const gap = baseline.ci95Upper - current.ci95Lower;
    return gap > threshold && current.mean < baseline.mean;
  }
  return current.ci95Lower - baseline.ci95Upper > threshold;
}
|
|
772
|
+
/**
 * Decide whether `current` is an improvement over `baseline`.
 *
 * With one sample on each side the raw difference of means is compared with
 * the threshold. Otherwise the 95% intervals must be separated by more than
 * the threshold in the favourable direction: current entirely below baseline
 * when lower is better, current entirely above baseline otherwise.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold - Minimum change that counts.
 * @param {boolean} lowerIsBetter - Whether smaller values are better for this scorer.
 * @returns {boolean} True when an improvement is detected.
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  const singleRuns = baseline.n === 1 && current.n === 1;
  if (singleRuns) {
    const shift = current.mean - baseline.mean;
    return lowerIsBetter ? shift < -threshold : shift > threshold;
  }
  const separation = lowerIsBetter
    ? baseline.ci95Lower - current.ci95Upper
    : current.ci95Lower - baseline.ci95Upper;
  return separation > threshold;
}
|
|
783
|
+
/**
 * Format a numeric change with four decimals and an explicit `+` for
 * non-negative values.
 *
 * @param {number} delta - Change to format.
 * @returns {string} e.g. `+0.5000` or `-0.2500`.
 */
function formatDelta(delta) {
  const fixed = delta.toFixed(4);
  if (delta >= 0) {
    return `+${fixed}`;
  }
  return `${fixed}`;
}
|
|
787
|
+
/**
 * Read a baseline file and normalise it to `{timestamp, results}`.
 *
 * Accepts either an object with a `results` array or a bare array. Any failure
 * (unreadable file, invalid JSON, unexpected shape) yields `null` rather than
 * throwing.
 *
 * @param {string} path - Location of the baseline JSON file.
 * @returns {{timestamp: *, results: Array}|null} The baseline, with
 *   `timestamp` defaulting to `"unknown"`, or `null` when it cannot be loaded.
 */
function loadBaseline(path) {
  try {
    const parsed = JSON.parse(readFileSync(path, "utf-8"));
    const results = parsed.results ?? parsed;
    return Array.isArray(results)
      ? { timestamp: parsed.timestamp ?? "unknown", results }
      : null;
  } catch {
    // Best-effort load: a missing or malformed baseline means "no baseline".
    return null;
  }
}
|
|
801
|
+
/**
 * Write results to a baseline file as pretty-printed JSON, creating the
 * parent directory if needed.
 *
 * @param {string} path - Destination file.
 * @param {Array} results - Results to store.
 * @returns {void} The file receives `{timestamp, results}` where `timestamp`
 *   is the current time in ISO-8601 form.
 */
function saveBaseline(path, results) {
  mkdirSync(dirname(path), { recursive: true });
  const payload = JSON.stringify(
    { timestamp: new Date().toISOString(), results },
    null,
    2
  );
  writeFileSync(path, payload);
}
|
|
809
|
+
// Module-level constants for src/ci.ts, assigned when init_ci runs.
var LOWER_IS_BETTER;
var FLAKY_CV_THRESHOLD;
var T_CRITICAL_95;
var init_ci = __esm({
  "src/ci.ts"() {
    "use strict";
    // Scorers for which a smaller value is the better outcome.
    LOWER_IS_BETTER = new Set(["cost"]);
    // Coefficient of variation above which a multi-run result is flagged flaky.
    FLAKY_CV_THRESHOLD = 0.3;
    // Critical values keyed by degrees of freedom; tCritical interpolates
    // between the tabulated keys.
    T_CRITICAL_95 = {
      1: 12.706,
      2: 4.303,
      3: 3.182,
      4: 2.776,
      5: 2.571,
      6: 2.447,
      7: 2.365,
      8: 2.306,
      9: 2.262,
      10: 2.228,
      15: 2.131,
      20: 2.086,
      25: 2.06,
      30: 2.042
    };
  }
});
|
|
833
|
+
|
|
834
|
+
// src/reporter/markdown.ts
|
|
835
|
+
// Export table for the bundled src/reporter/markdown.ts module. Every entry is
// a thunk returning the current binding; COMMENT_MARKER only has a value once
// init_markdown() has run.
var markdown_exports = {};
__export(markdown_exports, {
  COMMENT_MARKER: () => COMMENT_MARKER,
  markdownComparisonTable: () => markdownComparisonTable,
  markdownCostSummary: () => markdownCostSummary,
  markdownReporter: () => markdownReporter
});
|
|
842
|
+
/**
 * Build the Markdown body of the CI report.
 *
 * The output starts with COMMENT_MARKER, followed by a pass/fail heading and,
 * when present, the comparison table, cost summary, flaky results and
 * failure reasons, and ends with a footer.
 *
 * @param {object} report - CI report (`failed`, `comparisons`, `cost`, `flakyResults`, `failureReasons`).
 * @param {*} _current - Unused by this function.
 * @returns {string} Markdown text joined with newlines.
 */
function markdownReporter(report, _current) {
  const verdict = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${verdict}`, ""];

  if (report.comparisons.length > 0) {
    out.push(markdownComparisonTable(report.comparisons), "");
  }

  // The cost section appears when money was spent or a budget was configured.
  const hasCostInfo = report.cost.totalUsd > 0 || report.cost.budget !== void 0;
  if (hasCostInfo) {
    out.push(markdownCostSummary(report.cost), "");
  }

  if (report.flakyResults.length > 0) {
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      ""
    );
    out.push(
      ...report.flakyResults.map(
        (flaky) => `- **${flaky.providerId}** \xD7 ${flaky.taskName} \u2192 ${flaky.scorerName} (CV = ${flaky.current.cv.toFixed(2)})`
      )
    );
    out.push("");
  }

  if (report.failureReasons.length > 0) {
    out.push("### Failure Reasons", "");
    out.push(...report.failureReasons.map((reason) => `- ${reason}`));
    out.push("");
  }

  out.push("---", "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
  return out.join("\n");
}
|
|
877
|
+
/**
 * Make a value safe to place inside a Markdown table cell: pipes are
 * backslash-escaped and line breaks become spaces so the row stays intact.
 *
 * @param {*} value - Cell content (stringified first).
 * @returns {string} Escaped cell text.
 */
function escapeTableCell(value) {
  return String(value).replace(/\|/g, "\\|").replace(/\r?\n|\r/g, " ");
}

/**
 * Render baseline-vs-current comparisons as a Markdown table.
 *
 * Provider, task and scorer names are escaped because they come from user
 * configuration and may contain `|` or line breaks, which would otherwise
 * split the row into extra columns.
 *
 * @param {Array<object>} comparisons - Entries with `providerId`, `taskName`, `scorerName`, `baseline`, `current`, `delta`.
 * @returns {string} Header, separator and one row per comparison, joined with newlines.
 */
function markdownComparisonTable(comparisons) {
  const lines = [];
  lines.push("| Provider | Task | Scorer | Baseline | Current | Delta | Status |");
  lines.push("|----------|------|--------|----------|---------|-------|--------|");
  for (const c3 of comparisons) {
    // Entries without a baseline (or delta) show an em dash instead.
    const baselineStr = c3.baseline ? formatStats(c3.baseline) : "\u2014";
    const currentStr = formatStats(c3.current);
    const deltaStr = c3.delta !== null ? formatDelta2(c3.delta) : "\u2014";
    const status = statusIndicator(c3);
    const provider = escapeTableCell(c3.providerId);
    const task = escapeTableCell(c3.taskName);
    const scorer = escapeTableCell(c3.scorerName);
    lines.push(`| ${provider} | ${task} | ${scorer} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
  }
  return lines.join("\n");
}
|
|
890
|
+
/**
 * Render the cost section of the CI report.
 *
 * Always prints the total. Adds a budget line when `cost.budget` is defined,
 * and a per-provider table when more than one provider is present.
 *
 * @param {{totalUsd: number, budget?: number, overBudget?: boolean, perProvider: Map<string, number>}} cost - Cost data.
 * @returns {string} Markdown text joined with newlines.
 */
function markdownCostSummary(cost) {
  const out = ["### \u{1F4B0} Cost Summary", "", `**Total:** $${cost.totalUsd.toFixed(4)}`];

  if (cost.budget !== void 0) {
    // A non-positive budget cannot yield a percentage, so show infinity instead.
    const usedPct = cost.budget > 0 ? (cost.totalUsd / cost.budget * 100).toFixed(0) : "\u221E";
    const light = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    out.push(`**Budget:** $${cost.budget.toFixed(2)} (${usedPct}% used) ${light}`);
  }

  if (cost.perProvider.size > 1) {
    out.push("", "| Provider | Cost |", "|----------|------|");
    out.push(...[...cost.perProvider].map(([provider, usd]) => `| ${provider} | $${usd.toFixed(4)} |`));
  }

  return out.join("\n");
}
|
|
910
|
+
/**
 * Format summary statistics for a table cell.
 *
 * Multi-sample stats render as "mean ± margin", where the margin is half the
 * width of the 95% interval; otherwise only the mean is shown.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} stats - Statistics to format.
 * @returns {string} Text with three decimals.
 */
function formatStats(stats) {
  const meanText = stats.mean.toFixed(3);
  if (!(stats.n > 1)) return meanText;
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${meanText} \xB1 ${halfWidth.toFixed(3)}`;
}
|
|
917
|
+
/**
 * Render a numeric delta with three decimals, prefixing non-negative values with "+".
 *
 * @param {number} delta - Difference to format.
 * @returns {string} Signed, fixed-precision text.
 */
function formatDelta2(delta) {
  const fixed = delta.toFixed(3);
  return delta >= 0 ? `+${fixed}` : fixed;
}
|
|
921
|
+
/**
 * Pick the status label for a comparison row.
 *
 * Precedence: regressed, then improved, then new (no baseline), otherwise unchanged.
 *
 * @param {{regressed?: boolean, improved?: boolean, baseline: object|null}} c3 - Comparison entry.
 * @returns {string} Emoji-prefixed status label.
 */
function statusIndicator(c3) {
  const label = c3.regressed
    ? "\u{1F534} regressed"
    : c3.improved
      ? "\u{1F7E2} improved"
      : c3.baseline === null
        ? "\u{1F195} new"
        : "\u26AA unchanged";
  return label;
}
|
|
927
|
+
// HTML comment placed at the top of the generated report; it is also the marker
// passed to the GitHub helper to find an existing report comment. The value is
// assigned only when init_markdown() runs.
var COMMENT_MARKER;
var init_markdown = __esm({
  "src/reporter/markdown.ts"() {
    "use strict";
    COMMENT_MARKER = "<!-- duelist-ci-report -->";
  }
});
|
|
934
|
+
|
|
935
|
+
// src/github.ts
|
|
936
|
+
// Export table for the bundled src/github.ts module; each entry is a thunk
// returning the corresponding function.
var github_exports = {};
__export(github_exports, {
  detectGitHubContext: () => detectGitHubContext,
  findExistingComment: () => findExistingComment,
  upsertPrComment: () => upsertPrComment
});
|
|
942
|
+
import { readFileSync as readFileSync2 } from "fs";
|
|
943
|
+
/**
 * Work out which GitHub pull request the current process is running for.
 *
 * Reads GITHUB_TOKEN and GITHUB_REPOSITORY ("owner/repo") from the
 * environment. The PR number comes from the event payload at
 * GITHUB_EVENT_PATH (`pull_request.number`, or `issue.number` when the issue
 * carries a `pull_request` field), falling back to DUELIST_PR_NUMBER.
 *
 * @returns {{token: string, owner: string, repo: string, prNumber: number}|null}
 *   The context, or `null` when any required piece is missing.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath
  } = process.env;
  if (!token || !repository) return null;

  const [owner, repo] = repository.split("/");
  if (!owner || !repo) return null;

  let prNumber;
  if (eventPath) {
    try {
      const event = JSON.parse(readFileSync2(eventPath, "utf-8"));
      const pr = event.pull_request;
      if (pr && typeof pr === "object") {
        prNumber = pr.number;
      }
      const issue = event.issue;
      if (!prNumber && issue && typeof issue === "object" && issue.pull_request) {
        prNumber = issue.number;
      }
    } catch {
      // Best effort: an unreadable or malformed payload just leaves prNumber unset.
    }
  }

  const override = process.env.DUELIST_PR_NUMBER;
  if (!prNumber && override) {
    prNumber = parseInt(override, 10);
  }

  return prNumber ? { token, owner, repo, prNumber } : null;
}
|
|
973
|
+
/**
 * Look through a pull request's comments for one whose body contains `marker`.
 *
 * Pages through the issue-comments endpoint 50 at a time and stops at the
 * first match, at the first short page, or when a request is not OK.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx - GitHub context.
 * @param {string} marker - Text to look for in comment bodies.
 * @returns {Promise<number|null>} Id of the first matching comment, or `null` when none is found or a request fails.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  for (let page = 1; ; page += 1) {
    const res = await fetch(
      `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`,
      {
        headers: {
          Authorization: `Bearer ${ctx.token}`,
          Accept: "application/vnd.github+json",
          "X-GitHub-Api-Version": "2022-11-28"
        }
      }
    );
    if (!res.ok) return null;

    const comments = await res.json();
    for (const comment of comments) {
      if (comment.body?.includes(marker)) return comment.id;
    }
    // A page shorter than perPage (including an empty one) is the last page.
    if (comments.length < perPage) return null;
  }
}
|
|
998
|
+
/**
 * Create the report comment on a pull request, or update it when a comment
 * containing `marker` already exists.
 *
 * A non-OK response is reported with `console.warn`; this function does not
 * throw for it.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx - GitHub context.
 * @param {string} body - Comment text to publish.
 * @param {string} marker - Text that identifies a previously posted comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const existingId = await findExistingComment(ctx, marker);
  const repoUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}`;
  // PATCH the existing comment when one was found, otherwise POST a new one.
  const request = existingId
    ? { verb: "update", method: "PATCH", url: `${repoUrl}/issues/comments/${existingId}` }
    : { verb: "create", method: "POST", url: `${repoUrl}/issues/${ctx.prNumber}/comments` };

  const res = await fetch(request.url, {
    method: request.method,
    headers: {
      Authorization: `Bearer ${ctx.token}`,
      Accept: "application/vnd.github+json",
      "Content-Type": "application/json",
      "X-GitHub-Api-Version": "2022-11-28"
    },
    body: JSON.stringify({ body })
  });
  if (res.ok) return;

  const text = await res.text();
  console.warn(`Failed to ${request.verb} PR comment: ${res.status} ${text}`);
}
|
|
1034
|
+
// Base URL used to build the GitHub REST endpoints in this module; assigned
// only when init_github() runs.
var API_BASE;
var init_github = __esm({
  "src/github.ts"() {
    "use strict";
    API_BASE = "https://api.github.com";
  }
});
|
|
1041
|
+
|
|
378
1042
|
// node_modules/tsx/dist/temporary-directory-CwHp0_NW.mjs
|
|
379
1043
|
import r from "path";
|
|
380
1044
|
import o from "os";
|
|
@@ -6530,10 +7194,10 @@ var init_api = __esm({
|
|
|
6530
7194
|
// src/cli.ts
|
|
6531
7195
|
import "dotenv/config";
|
|
6532
7196
|
import { Command } from "commander";
|
|
6533
|
-
import { readFileSync, writeFileSync, existsSync } from "fs";
|
|
6534
|
-
import { resolve, join, dirname } from "path";
|
|
7197
|
+
import { readFileSync as readFileSync3, writeFileSync as writeFileSync2, existsSync } from "fs";
|
|
7198
|
+
import { resolve, join, dirname as dirname2 } from "path";
|
|
6535
7199
|
import { pathToFileURL, fileURLToPath } from "url";
|
|
6536
|
-
var __dirname2 =
|
|
7200
|
+
var __dirname2 = dirname2(fileURLToPath(import.meta.url));
|
|
6537
7201
|
var program = new Command();
|
|
6538
7202
|
program.name("duelist").description("Pit LLM providers against each other on agent tasks.").version(getVersion());
|
|
6539
7203
|
program.command("init").description("Scaffold an arena.config.ts in the current directory").option("--force", "Overwrite existing config file").action((opts) => {
|
|
@@ -6545,11 +7209,11 @@ program.command("init").description("Scaffold an arena.config.ts in the current
|
|
|
6545
7209
|
const templatePath = join(__dirname2, "..", "templates", "arena.config.ts");
|
|
6546
7210
|
let template;
|
|
6547
7211
|
if (existsSync(templatePath)) {
|
|
6548
|
-
template =
|
|
7212
|
+
template = readFileSync3(templatePath, "utf-8");
|
|
6549
7213
|
} else {
|
|
6550
7214
|
template = DEFAULT_TEMPLATE;
|
|
6551
7215
|
}
|
|
6552
|
-
|
|
7216
|
+
writeFileSync2(target, template);
|
|
6553
7217
|
console.log(existsSync(target) && opts.force ? "Overwrote arena.config.ts" : "Created arena.config.ts");
|
|
6554
7218
|
console.log("");
|
|
6555
7219
|
console.log("Next steps:");
|
|
@@ -6557,17 +7221,119 @@ program.command("init").description("Scaffold an arena.config.ts in the current
|
|
|
6557
7221
|
console.log(" 2. npx duelist run");
|
|
6558
7222
|
});
|
|
6559
7223
|
// `duelist run`: load the arena config, execute the benchmarks and print the
// results with the selected reporter.
program.command("run").description("Run benchmarks defined in your arena config").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--reporter <type>", "Output format: console or json", "console").option("-q, --quiet", "Suppress per-result progress (show only final report)").action(async (opts) => {
  // Reject unsupported reporters before loading the config or running anything.
  if (!["console", "json"].includes(opts.reporter)) {
    console.error(`Unknown reporter "${opts.reporter}". Use "console" or "json".`);
    process.exit(1);
  }
  const typedArena = await loadArenaConfig(opts.config);
  try {
    // Per-result progress lines are printed only for the console reporter and without --quiet.
    const showProgress = opts.reporter === "console" && !opts.quiet;
    const onResult = showProgress ? logResult : void 0;
    const results = await typedArena.run({ onResult });
    const { consoleReporter: consoleReporter2 } = await Promise.resolve().then(() => (init_console(), console_exports));
    const { jsonReporter: jsonReporter2 } = await Promise.resolve().then(() => (init_json(), json_exports));
    if (opts.reporter === "json") {
      console.log(jsonReporter2(results));
    } else {
      console.log("");
      consoleReporter2(results, { sparklines: typedArena.config?.sparklines });
    }
    // Exit non-zero only when there is at least one result and every result carries an error.
    const allFailed = results.length > 0 && results.every((r3) => r3.error);
    if (allFailed) process.exit(1);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`Benchmark failed: ${message}`);
    process.exit(1);
  }
});
|
|
7249
|
+
/**
 * Option parser for the repeatable `--threshold <scorer=delta>` flag.
 *
 * Splits at the first "=" so that anything after it must be one number, and
 * rejects an empty scorer, an empty delta (which `Number("")` would turn
 * into 0) and non-numeric deltas. Invalid input prints an error and exits
 * the process with code 1.
 *
 * @param {string} value - Raw option value, e.g. "correctness=0.1".
 * @param {Map<string, number>} previous - Accumulator of thresholds parsed so far (mutated).
 * @returns {Map<string, number>} The same accumulator with the new entry set.
 */
function collectThreshold(value, previous) {
  const separator = value.indexOf("=");
  const scorer = separator === -1 ? "" : value.slice(0, separator).trim();
  const rawDelta = separator === -1 ? "" : value.slice(separator + 1).trim();
  const delta = Number(rawDelta);
  if (!scorer || rawDelta === "" || Number.isNaN(delta)) {
    console.error(`Invalid threshold format: "${value}". Expected scorer=delta (e.g., correctness=0.1)`);
    process.exit(1);
  }
  previous.set(scorer, delta);
  return previous;
}
|
|
7258
|
+
// `duelist ci`: run the benchmarks, compare them against a stored baseline,
// report the outcome and exit with 1 when the report is marked failed.
// NOTE(review): --budget is parsed with parseFloat and no validation is visible
// here, so a non-numeric value would arrive as NaN — confirm downstream handling.
program.command("ci").description("Run benchmarks, compare against baseline, and enforce quality gates").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--baseline <path>", "Baseline JSON file", ".duelist/baseline.json").option("--budget <dollars>", "Max total cost in USD", parseFloat).option("--threshold <scorer=delta>", "Regression threshold (repeatable)", collectThreshold, /* @__PURE__ */ new Map()).option("--update-baseline", "Save results as new baseline after passing").option("--comment", "Post results as GitHub PR comment").option("-q, --quiet", "Suppress per-result progress").action(async (opts) => {
  // Normalise the CLI options: resolve the baseline path, default the boolean flags to false.
  const ciOpts = {
    configPath: opts.config,
    baselinePath: resolve(opts.baseline),
    budget: opts.budget,
    thresholds: opts.threshold,
    updateBaseline: opts.updateBaseline ?? false,
    comment: opts.comment ?? false,
    quiet: opts.quiet ?? false
  };
  const typedArena = await loadArenaConfig(ciOpts.configPath);
  console.log("Running benchmarks...");
  const onResult = ciOpts.quiet ? void 0 : logResult;
  let results;
  try {
    results = await typedArena.run({ onResult });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    console.error(`Benchmark failed: ${message}`);
    process.exit(1);
  }
  const { loadBaseline: loadBaseline2, saveBaseline: saveBaseline2, computeStats: computeStats2, compareResults: compareResults2 } = await Promise.resolve().then(() => (init_ci(), ci_exports));
  // A null baseline means nothing usable was loaded; the comparison then receives null baseline stats.
  const baseline = loadBaseline2(ciOpts.baselinePath);
  const baselineStats = baseline ? computeStats2(baseline.results) : null;
  if (baseline) {
    console.log(`Loaded baseline from ${ciOpts.baselinePath} (${baseline.timestamp})`);
  } else {
    console.log("No baseline found \u2014 this run establishes the first baseline.");
  }
  const currentStats = computeStats2(results);
  const report = compareResults2(baselineStats, currentStats, ciOpts.thresholds, ciOpts.budget, results);
  const { consoleReporter: consoleReporter2 } = await Promise.resolve().then(() => (init_console(), console_exports));
  console.log("");
  consoleReporter2(results, { sparklines: typedArena.config?.sparklines });
  const { markdownReporter: markdownReporter2, COMMENT_MARKER: COMMENT_MARKER2 } = await Promise.resolve().then(() => (init_markdown(), markdown_exports));
  // Console summary of the gate outcome.
  if (report.flakyResults.length > 0) {
    console.log(`\u26A0 ${report.flakyResults.length} flaky result(s) detected (high variance)`);
  }
  if (report.cost.overBudget) {
    console.log(`\u{1F534} Budget exceeded: $${report.cost.totalUsd.toFixed(4)} > $${report.cost.budget.toFixed(2)}`);
  }
  for (const reason of report.failureReasons) {
    console.log(`\u{1F534} ${reason}`);
  }
  if (!report.failed) {
    console.log("\u{1F7E2} CI passed");
  }
  if (ciOpts.comment) {
    const { detectGitHubContext: detectGitHubContext2, upsertPrComment: upsertPrComment2 } = await Promise.resolve().then(() => (init_github(), github_exports));
    const ghCtx = detectGitHubContext2();
    if (ghCtx) {
      const markdown = markdownReporter2(report, results);
      try {
        await upsertPrComment2(ghCtx, markdown, COMMENT_MARKER2);
        // NOTE(review): upsertPrComment only warns on a non-OK HTTP response, so this
        // line is also printed when the API call was rejected.
        console.log("Posted results to PR comment.");
      } catch (err) {
        console.warn(`Failed to post PR comment: ${err instanceof Error ? err.message : err}`);
      }
    } else {
      console.warn("--comment: not in a GitHub Actions PR context, skipping.");
    }
  }
  // The baseline is only overwritten by a passing run.
  if (ciOpts.updateBaseline && !report.failed) {
    saveBaseline2(ciOpts.baselinePath, results);
    console.log(`Baseline saved to ${ciOpts.baselinePath}`);
  } else if (ciOpts.updateBaseline && report.failed) {
    console.log("Baseline not updated (CI failed).");
  }
  process.exit(report.failed ? 1 : 0);
});
// Parse the command line and dispatch to the matching command.
program.parse();
|
|
7329
|
+
async function loadArenaConfig(configOpt) {
|
|
7330
|
+
const configPath = resolve(configOpt);
|
|
6561
7331
|
if (!existsSync(configPath)) {
|
|
6562
7332
|
console.error(`Config not found: ${configPath}`);
|
|
6563
7333
|
console.error("");
|
|
6564
7334
|
console.error("Create one with: npx duelist init");
|
|
6565
7335
|
process.exit(1);
|
|
6566
7336
|
}
|
|
6567
|
-
if (!["console", "json"].includes(opts.reporter)) {
|
|
6568
|
-
console.error(`Unknown reporter "${opts.reporter}". Use "console" or "json".`);
|
|
6569
|
-
process.exit(1);
|
|
6570
|
-
}
|
|
6571
7337
|
let mod;
|
|
6572
7338
|
try {
|
|
6573
7339
|
if (configPath.endsWith(".ts")) {
|
|
@@ -6591,35 +7357,16 @@ program.command("run").description("Run benchmarks defined in your arena config"
|
|
|
6591
7357
|
console.error(`Loaded from: ${configPath}`);
|
|
6592
7358
|
process.exit(1);
|
|
6593
7359
|
}
|
|
6594
|
-
|
|
6595
|
-
|
|
6596
|
-
|
|
6597
|
-
|
|
6598
|
-
|
|
6599
|
-
|
|
6600
|
-
|
|
6601
|
-
|
|
6602
|
-
console.log(` ${result.providerId} \xD7 ${result.taskName}: ${scores}`);
|
|
6603
|
-
}
|
|
6604
|
-
} : void 0;
|
|
6605
|
-
const results = await typedArena.run({ onResult });
|
|
6606
|
-
const { consoleReporter: consoleReporter2 } = await Promise.resolve().then(() => (init_console(), console_exports));
|
|
6607
|
-
const { jsonReporter: jsonReporter2 } = await Promise.resolve().then(() => (init_json(), json_exports));
|
|
6608
|
-
if (opts.reporter === "json") {
|
|
6609
|
-
console.log(jsonReporter2(results));
|
|
6610
|
-
} else {
|
|
6611
|
-
console.log("");
|
|
6612
|
-
consoleReporter2(results);
|
|
6613
|
-
}
|
|
6614
|
-
const allFailed = results.length > 0 && results.every((r3) => r3.error);
|
|
6615
|
-
if (allFailed) process.exit(1);
|
|
6616
|
-
} catch (err) {
|
|
6617
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
6618
|
-
console.error(`Benchmark failed: ${message}`);
|
|
6619
|
-
process.exit(1);
|
|
7360
|
+
return arena;
|
|
7361
|
+
}
|
|
7362
|
+
/**
 * Print a one-line progress entry for a finished benchmark result.
 *
 * Results carrying an error print the error text; the rest print each score
 * as `name=value`, formatted by formatScoreForLog.
 *
 * @param {{providerId: string, taskName: string, error?: *, scores: Array<{name: string}>}} result - Result to log.
 * @returns {void}
 */
function logResult(result) {
  const label = `${result.providerId} \xD7 ${result.taskName}`;
  if (result.error) {
    console.log(` ${label}: ERROR ${result.error}`);
    return;
  }
  const parts = [];
  for (const score of result.scores) {
    parts.push(`${score.name}=${formatScoreForLog(score)}`);
  }
  console.log(` ${label}: ${parts.join(" ")}`);
}
|
|
6623
7370
|
async function importTypeScript(filePath) {
|
|
6624
7371
|
try {
|
|
6625
7372
|
await Promise.resolve().then(() => (init_api(), api_exports));
|
|
@@ -6652,7 +7399,7 @@ function formatScoreForLog(s5) {
|
|
|
6652
7399
|
}
|
|
6653
7400
|
function getVersion() {
|
|
6654
7401
|
try {
|
|
6655
|
-
const pkg =
|
|
7402
|
+
const pkg = readFileSync3(join(__dirname2, "..", "package.json"), "utf-8");
|
|
6656
7403
|
return JSON.parse(pkg).version ?? "0.0.0";
|
|
6657
7404
|
} catch {
|
|
6658
7405
|
return "0.0.0";
|
|
@@ -6674,12 +7421,12 @@ import { z } from 'zod'
|
|
|
6674
7421
|
|
|
6675
7422
|
export default defineArena({
|
|
6676
7423
|
providers: [
|
|
6677
|
-
openai('gpt-
|
|
7424
|
+
openai('gpt-5-mini'),
|
|
6678
7425
|
// Add more providers to compare:
|
|
6679
|
-
// openai('gpt-
|
|
6680
|
-
// azureOpenai('gpt-
|
|
6681
|
-
// anthropic('claude-sonnet-4
|
|
6682
|
-
// gemini('gemini-
|
|
7426
|
+
// openai('gpt-5.2'),
|
|
7427
|
+
// azureOpenai('gpt-5-mini'),
|
|
7428
|
+
// anthropic('claude-sonnet-4.6'),
|
|
7429
|
+
// gemini('gemini-3-flash-preview'),
|
|
6683
7430
|
],
|
|
6684
7431
|
|
|
6685
7432
|
tasks: [
|