agent-duelist 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +150 -58
- package/dist/cli.js +870 -123
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +897 -227
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +67 -3
- package/dist/index.d.ts +67 -3
- package/dist/index.js +887 -224
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/templates/arena.config.ts +5 -5
package/dist/index.cjs
CHANGED
|
@@ -32,13 +32,20 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
anthropic: () => anthropic,
|
|
34
34
|
azureOpenai: () => azureOpenai,
|
|
35
|
+
compareResults: () => compareResults,
|
|
36
|
+
computeStats: () => computeStats,
|
|
35
37
|
consoleReporter: () => consoleReporter,
|
|
36
38
|
defineArena: () => defineArena,
|
|
39
|
+
detectGitHubContext: () => detectGitHubContext,
|
|
37
40
|
gemini: () => gemini,
|
|
38
41
|
jsonReporter: () => jsonReporter,
|
|
42
|
+
loadBaseline: () => loadBaseline,
|
|
43
|
+
markdownReporter: () => markdownReporter,
|
|
39
44
|
openai: () => openai,
|
|
40
45
|
openaiCompatible: () => openaiCompatible,
|
|
41
|
-
registerPricing: () => registerPricing
|
|
46
|
+
registerPricing: () => registerPricing,
|
|
47
|
+
saveBaseline: () => saveBaseline,
|
|
48
|
+
upsertPrComment: () => upsertPrComment
|
|
42
49
|
});
|
|
43
50
|
module.exports = __toCommonJS(index_exports);
|
|
44
51
|
|
|
@@ -1479,7 +1486,142 @@ function jaccardSimilarity(a, b) {
|
|
|
1479
1486
|
}
|
|
1480
1487
|
|
|
1481
1488
|
// src/scorers/llm-judge.ts
|
|
1489
|
+
var import_openai2 = __toESM(require("openai"), 1);
|
|
1490
|
+
|
|
1491
|
+
// src/providers/openai.ts
|
|
1482
1492
|
var import_openai = __toESM(require("openai"), 1);
|
|
1493
|
+
var import_zod_to_json_schema = require("zod-to-json-schema");
|
|
1494
|
+
var REQUEST_TIMEOUT_MS = 6e4;
|
|
1495
|
+
/**
 * Create a provider that talks to the OpenAI chat completions API.
 *
 * @param {string} model - Model name; also used as the request model and in the provider id.
 * @param {{ apiKey?: string, baseURL?: string, timeoutMs?: number }} [options]
 *   Optional overrides. `apiKey` falls back to `process.env.OPENAI_API_KEY`,
 *   `timeoutMs` falls back to `REQUEST_TIMEOUT_MS`.
 * @returns {object} The provider built by `makeProvider` with id `openai/<model>`.
 */
function openai(model, options) {
  const apiKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
  const baseURL = options?.baseURL;
  const timeout = options?.timeoutMs ?? REQUEST_TIMEOUT_MS;
  const OpenAIClient = import_openai.default;
  const client = new OpenAIClient({ apiKey, baseURL, timeout });
  return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
}
|
|
1503
|
+
/**
 * Create a provider for any endpoint that speaks the OpenAI chat completions protocol.
 *
 * The API key is resolved in order: `options.apiKey`, then the environment
 * variable named by `options.apiKeyEnv`, then the literal placeholder "no-key".
 * When `options.free` is truthy, zero-cost pricing is registered under `options.id`.
 *
 * @param {object} options - Requires `id`, `name`, `model`, `baseURL`; optional
 *   `apiKey`, `apiKeyEnv`, `timeoutMs`, `free`, `stripThinking`.
 * @returns {object} The provider built by `makeProvider`.
 */
function openaiCompatible(options) {
  let apiKey = options.apiKey;
  if (apiKey == null && options.apiKeyEnv) {
    apiKey = process.env[options.apiKeyEnv];
  }
  if (apiKey == null) {
    apiKey = "no-key";
  }

  const OpenAIClient = import_openai.default;
  const client = new OpenAIClient({
    apiKey,
    baseURL: options.baseURL,
    timeout: options.timeoutMs ?? REQUEST_TIMEOUT_MS
  });

  if (options.free) {
    const zeroCost = { inputPerToken: 0, outputPerToken: 0 };
    registerPricing(options.id, zeroCost);
  }

  const { id, name, model, stripThinking } = options;
  return makeProvider(id, name, model, client, model, stripThinking);
}
|
|
1515
|
+
/**
 * Create a provider backed by an Azure OpenAI deployment.
 *
 * Each setting prefers the explicit option and falls back to the matching
 * `AZURE_OPENAI_*` environment variable; the API version additionally
 * defaults to "2024-12-01-preview". The deployment defaults to `model`
 * and is what gets sent as the request model.
 *
 * @param {string} model - Model name used for the provider id and `model` field.
 * @param {{ apiKey?: string, endpoint?: string, apiVersion?: string, deployment?: string, timeoutMs?: number }} [options]
 * @returns {object} The provider built by `makeProvider` with id `azure/<model>`.
 */
function azureOpenai(model, options) {
  const deployment = options?.deployment ?? model;
  const settings = {
    apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
    endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
    apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
    deployment,
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const { AzureOpenAI } = import_openai;
  const client = new AzureOpenAI(settings);
  return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
}
|
|
1526
|
+
/**
 * Wrap an OpenAI-style chat client in the provider object used by the runner.
 *
 * The returned `run(input)` sends one chat completion. If the model answers
 * with tool calls (and the task declared tools), every call is dispatched to
 * the matching tool handler and a single follow-up completion is requested
 * with the tool results appended.
 *
 * @param {string} id - Provider id reported on results.
 * @param {string} name - Human-readable provider name.
 * @param {string} model - Model name exposed on the provider object.
 * @param {object} client - Client exposing `chat.completions.create(params, { signal })`.
 * @param {string} requestModel - Value sent as `model` in each request.
 * @param {boolean} [stripThinking] - When truthy, `<think>…</think>` blocks are
 *   removed from the final message content.
 * @returns {{ id: string, name: string, model: string, run: Function }}
 */
function makeProvider(id, name, model, client, requestModel, stripThinking) {
  return {
    id,
    name,
    model,
    /**
     * Execute one task against the model.
     *
     * @param {{ prompt: string, schema?: unknown, tools?: Array<object>, signal?: AbortSignal }} input
     * @returns {Promise<{ output: unknown, usage: object, latencyMs: number, raw: object, toolCalls?: Array<object> }>}
     */
    async run(input) {
      const start = Date.now();
      const params = {
        model: requestModel,
        messages: [{ role: "user", content: input.prompt }]
      };
      if (input.schema) {
        // JSON mode: request a JSON object and tell the model so in a system message.
        params.response_format = { type: "json_object" };
        params.messages = [
          { role: "system", content: "Respond with valid JSON matching the requested schema." },
          ...params.messages
        ];
      }
      if (input.tools?.length) {
        params.tools = input.tools.map(toolDefToOpenAI);
        params.tool_choice = "auto";
      }
      const response = await client.chat.completions.create(params, { signal: input.signal });
      let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
      let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
      const choice = response.choices[0];
      const toolCallsRaw = choice?.message?.tool_calls;
      const collectedToolCalls = [];
      let finalResponse = response;
      if (toolCallsRaw?.length && input.tools?.length) {
        const toolMessages = [
          ...params.messages,
          choice.message
        ];
        for (const tc of toolCallsRaw) {
          const toolDef = input.tools.find((t) => t.name === tc.function.name);
          let args;
          try {
            args = JSON.parse(tc.function.arguments);
          } catch {
            // Best effort: hand the handler the raw argument string when it is not valid JSON.
            args = tc.function.arguments;
          }
          let result;
          if (toolDef?.handler) {
            result = await toolDef.handler(args);
          }
          collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
          toolMessages.push({
            role: "tool",
            tool_call_id: tc.id,
            // A missing handler or an undefined result is reported to the model as {}.
            content: JSON.stringify(result ?? {})
          });
        }
        const followUpParams = {
          model: requestModel,
          messages: toolMessages
        };
        if (params.response_format) {
          // Keep JSON mode on the answer that is actually parsed below; without
          // this the schema constraint only applied to the tool-call turn.
          followUpParams.response_format = params.response_format;
        }
        const followUp = await client.chat.completions.create(followUpParams, { signal: input.signal });
        totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
        totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
        finalResponse = followUp;
      }
      // Latency covers both requests and any tool handler time.
      const latencyMs = Date.now() - start;
      const finalChoice = finalResponse.choices[0];
      let rawContent = finalChoice?.message?.content ?? "";
      if (stripThinking) {
        // Global flag: remove every <think> block, not just the first one.
        rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/g, "");
      }
      let output = rawContent;
      if (input.schema) {
        try {
          output = JSON.parse(rawContent);
        } catch {
          // Deliberate fallback: leave the unparsed string as the output.
        }
      }
      return {
        output,
        usage: {
          // A total of 0 is reported as undefined (no usage information).
          promptTokens: totalPromptTokens || void 0,
          completionTokens: totalCompletionTokens || void 0
        },
        latencyMs,
        raw: finalResponse,
        toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
      };
    }
  };
}
|
|
1613
|
+
/**
 * Convert a tool definition into the OpenAI function-tool request shape.
 *
 * @param {{ name: string, description: string, parameters: unknown }} tool
 *   `parameters` is passed to `zodToJsonSchema` with the "openAi" target.
 * @returns {{ type: "function", function: { name: string, description: string, parameters: unknown } }}
 */
function toolDefToOpenAI(tool) {
  const { name, description } = tool;
  const { zodToJsonSchema } = import_zod_to_json_schema;
  const parameters = zodToJsonSchema(tool.parameters, { target: "openAi" });
  return {
    type: "function",
    function: { name, description, parameters }
  };
}
|
|
1623
|
+
|
|
1624
|
+
// src/scorers/llm-judge.ts
|
|
1483
1625
|
var JUDGE_PROMPT = `You are a strict scoring judge. Evaluate the actual output against the expected output on three criteria. Score each from 0.0 to 1.0 using the full range (not just 0, 0.5, 1).
|
|
1484
1626
|
|
|
1485
1627
|
Criteria:
|
|
@@ -1495,40 +1637,42 @@ conciseness: <number>
|
|
|
1495
1637
|
Task: {task}
|
|
1496
1638
|
Expected: {expected}
|
|
1497
1639
|
Actual: {actual}`;
|
|
1498
|
-
function resolveJudgeClient(configModel) {
|
|
1499
|
-
const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-
|
|
1640
|
+
function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1641
|
+
const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-5-mini";
|
|
1500
1642
|
if (model.startsWith("gemini") && process.env.GOOGLE_API_KEY) {
|
|
1501
1643
|
return {
|
|
1502
|
-
client: new
|
|
1644
|
+
client: new import_openai2.default({
|
|
1503
1645
|
apiKey: process.env.GOOGLE_API_KEY,
|
|
1504
|
-
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
|
|
1646
|
+
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
|
|
1647
|
+
timeout: timeoutMs
|
|
1505
1648
|
}),
|
|
1506
1649
|
model
|
|
1507
1650
|
};
|
|
1508
1651
|
}
|
|
1509
1652
|
if (!process.env.OPENAI_API_KEY && process.env.AZURE_OPENAI_API_KEY) {
|
|
1510
1653
|
return {
|
|
1511
|
-
client: new
|
|
1654
|
+
client: new import_openai2.AzureOpenAI({
|
|
1512
1655
|
apiKey: process.env.AZURE_OPENAI_API_KEY,
|
|
1513
1656
|
endpoint: process.env.AZURE_OPENAI_ENDPOINT,
|
|
1514
1657
|
apiVersion: process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
|
|
1515
|
-
deployment: model
|
|
1658
|
+
deployment: model,
|
|
1659
|
+
timeout: timeoutMs
|
|
1516
1660
|
}),
|
|
1517
1661
|
model
|
|
1518
1662
|
};
|
|
1519
1663
|
}
|
|
1520
1664
|
const apiKey = process.env.OPENAI_API_KEY;
|
|
1521
1665
|
if (!apiKey) return void 0;
|
|
1522
|
-
return { client: new
|
|
1666
|
+
return { client: new import_openai2.default({ apiKey, timeout: timeoutMs }), model };
|
|
1523
1667
|
}
|
|
1524
|
-
function createLlmJudgeScorer(judgeModel) {
|
|
1668
|
+
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1525
1669
|
let cached = void 0;
|
|
1526
1670
|
return async ({ task, result }) => {
|
|
1527
1671
|
if (task.expected === void 0) {
|
|
1528
1672
|
return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
|
|
1529
1673
|
}
|
|
1530
1674
|
if (cached === void 0) {
|
|
1531
|
-
cached = resolveJudgeClient(judgeModel) ?? null;
|
|
1675
|
+
cached = resolveJudgeClient(judgeModel, timeoutMs) ?? null;
|
|
1532
1676
|
}
|
|
1533
1677
|
if (!cached) {
|
|
1534
1678
|
return {
|
|
@@ -1601,10 +1745,10 @@ var staticScorers = {
|
|
|
1601
1745
|
"fuzzy-similarity": fuzzySimilarityScorer,
|
|
1602
1746
|
"tool-usage": toolUsageScorer
|
|
1603
1747
|
};
|
|
1604
|
-
function resolveScorers(names, judgeModel) {
|
|
1748
|
+
function resolveScorers(names, judgeModel, timeoutMs) {
|
|
1605
1749
|
return names.map((name) => {
|
|
1606
1750
|
if (name === "llm-judge-correctness") {
|
|
1607
|
-
return createLlmJudgeScorer(judgeModel);
|
|
1751
|
+
return createLlmJudgeScorer(judgeModel, timeoutMs);
|
|
1608
1752
|
}
|
|
1609
1753
|
const scorer = staticScorers[name];
|
|
1610
1754
|
if (!scorer) {
|
|
@@ -1615,19 +1759,41 @@ function resolveScorers(names, judgeModel) {
|
|
|
1615
1759
|
}
|
|
1616
1760
|
|
|
1617
1761
|
// src/runner.ts
|
|
1762
|
+
var DEFAULT_TIMEOUT_MS = 6e4;
|
|
1763
|
+
/**
 * Run an abortable operation with a deadline.
 *
 * `run` receives an AbortSignal. If it has not settled within `ms`
 * milliseconds, the signal is aborted and the returned promise rejects with
 * "Request timed out after <ms>ms". Otherwise the returned promise settles
 * with the outcome of `run`.
 *
 * @param {(signal: AbortSignal) => Promise<unknown>} run - Operation to execute.
 * @param {number} ms - Deadline in milliseconds.
 * @returns {Promise<unknown>} Settles with the result of `run`, or rejects on timeout.
 */
function withTimeout(run, ms) {
  return new Promise((resolve, reject) => {
    const controller = new AbortController();
    const timer = setTimeout(() => {
      controller.abort();
      reject(new Error(`Request timed out after ${ms}ms`));
    }, ms);

    let pending;
    try {
      // Promise.resolve tolerates a `run` that returns a plain value.
      pending = Promise.resolve(run(controller.signal));
    } catch (err) {
      // `run` threw before returning a promise: clear the timer so it does
      // not stay pending for the full deadline.
      clearTimeout(timer);
      reject(err);
      return;
    }

    pending.then(
      (v) => {
        clearTimeout(timer);
        resolve(v);
      },
      (e) => {
        clearTimeout(timer);
        reject(e);
      }
    );
  });
}
|
|
1618
1782
|
async function runBenchmarks(options) {
|
|
1619
1783
|
const { providers, tasks, scorers, runs, onResult } = options;
|
|
1784
|
+
const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
|
|
1620
1785
|
const results = [];
|
|
1621
1786
|
for (const task of tasks) {
|
|
1622
1787
|
for (const provider of providers) {
|
|
1623
1788
|
for (let run = 1; run <= runs; run++) {
|
|
1624
1789
|
let result;
|
|
1625
1790
|
try {
|
|
1626
|
-
const taskResult = await provider.run({
|
|
1791
|
+
const taskResult = await withTimeout((signal) => provider.run({
|
|
1627
1792
|
prompt: task.prompt,
|
|
1628
1793
|
schema: task.schema,
|
|
1629
|
-
tools: task.tools
|
|
1630
|
-
|
|
1794
|
+
tools: task.tools,
|
|
1795
|
+
signal
|
|
1796
|
+
}), timeout);
|
|
1631
1797
|
const scores = await Promise.all(
|
|
1632
1798
|
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
1633
1799
|
);
|
|
@@ -1670,20 +1836,162 @@ var green = "\x1B[32m";
|
|
|
1670
1836
|
var red = "\x1B[31m";
|
|
1671
1837
|
var yellow = "\x1B[33m";
|
|
1672
1838
|
var cyan = "\x1B[36m";
|
|
1839
|
+
var brightGreen = "\x1B[92m";
|
|
1840
|
+
var brightWhite = "\x1B[97m";
|
|
1673
1841
|
function bold(s) {
|
|
1674
1842
|
return `${boldCode}${s}${reset}`;
|
|
1675
1843
|
}
|
|
1676
1844
|
function dim(s) {
|
|
1677
1845
|
return `${dimCode}${s}${reset}`;
|
|
1678
1846
|
}
|
|
1679
|
-
function
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1847
|
+
/**
 * Remove ANSI SGR escape sequences (ESC `[` … `m`) from a string.
 *
 * @param {string} s - Text that may contain color/style escapes.
 * @returns {string} The text with those sequences removed.
 */
function stripAnsi(s) {
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  return s.replace(sgrSequence, "");
}
|
|
1850
|
+
/**
 * Estimate how many terminal columns a string occupies.
 *
 * ANSI escapes are removed first. Each remaining code point counts as two
 * columns when it is >= 126976 (0x1F000) or within 9728..10175
 * (0x2600..0x27BF), and as one column otherwise.
 *
 * @param {string} s - Text, possibly containing ANSI escapes.
 * @returns {number} Estimated column width.
 */
function displayWidth(s) {
  return [...stripAnsi(s)].reduce((total, ch) => {
    const cp = ch.codePointAt(0) ?? 0;
    const isWide = cp >= 126976 || (cp >= 9728 && cp <= 10175);
    return total + (isWide ? 2 : 1);
  }, 0);
}
|
|
1861
|
+
/**
 * Pad a cell with spaces up to a target display width.
 *
 * Width is measured with `displayWidth`. Content already at or beyond the
 * target is returned unchanged (never truncated).
 *
 * @param {string} str - Cell content.
 * @param {number} targetWidth - Desired display width.
 * @param {string} align - "right" pads on the left; anything else pads on the right.
 * @returns {string} The padded cell.
 */
function padCell(str, targetWidth, align) {
  const missing = targetWidth - displayWidth(str);
  const filler = missing > 0 ? " ".repeat(missing) : "";
  return align === "right" ? filler + str : str + filler;
}
|
|
1867
|
+
/**
 * Build the two halves of a fixed-width bar for a ratio.
 *
 * The ratio is clamped to [0, 1]; the filled part uses U+2593 and the
 * remaining track uses U+2591.
 *
 * @param {number} ratio - Fraction to display.
 * @param {number} [width=8] - Total number of bar characters.
 * @returns {{ fill: string, track: string }} The filled and unfilled segments.
 */
function sparkBar(ratio, width = 8) {
  const bounded = Math.min(Math.max(ratio, 0), 1);
  const filled = Math.round(bounded * width);
  return {
    fill: "\u2593".repeat(filled),
    track: "\u2591".repeat(width - filled)
  };
}
|
|
1874
|
+
/**
 * Render one horizontal border line of the results table, dimmed.
 *
 * "bottom" and "merge" draw an unbroken rule across the whole table;
 * "top" draws column junctions pointing down; any other position draws a
 * separator with cross junctions.
 *
 * @param {number[]} widths - Content width of each column (each column gets 2 extra cells of padding).
 * @param {string} position - "top", "bottom", "merge", or anything else for a separator.
 * @returns {string} The border line.
 */
function drawTableLine(widths, position) {
  const rule = (count) => "\u2500".repeat(count);

  if (position === "bottom" || position === "merge") {
    // Full-width rule: padded columns plus one cell per interior divider.
    let totalInner = 0;
    for (const w of widths) {
      totalInner = totalInner + w + 2;
    }
    totalInner = totalInner + widths.length - 1;
    return position === "bottom"
      ? dim(`\u2514${rule(totalInner)}\u2518`)
      : dim(`\u251C${rule(totalInner)}\u2524`);
  }

  const segments = widths.map((w) => rule(w + 2));
  switch (position) {
    case "top":
      return dim(`\u250C${segments.join("\u252C")}\u2510`);
    default:
      return dim(`\u251C${segments.join("\u253C")}\u2524`);
  }
}
|
|
1888
|
+
/**
 * Render one table row: each cell padded to its column width, surrounded by
 * a space on both sides, with dimmed vertical bars between and around cells.
 *
 * @param {string[]} cells - Cell contents.
 * @param {number[]} widths - Column widths, indexed like `cells`.
 * @param {string[]} aligns - Column alignments, indexed like `cells`.
 * @returns {string} The rendered row.
 */
function drawTableRow(cells, widths, aligns) {
  const padded = cells.map((cell, i) => ` ${padCell(cell, widths[i], aligns[i])} `);
  const bar = dim("\u2502");
  return `${bar}${padded.join(bar)}${bar}`;
}
|
|
1894
|
+
/**
 * Render a row whose content spans every column of the table.
 *
 * The content is preceded by one space and right-padded with spaces so the
 * closing bar lines up with the table's right edge; content wider than the
 * table is left unpadded.
 *
 * @param {string} content - Text to place in the row (may contain ANSI escapes).
 * @param {number[]} widths - Column widths of the table.
 * @returns {string} The rendered row.
 */
function drawSpanRow(content, widths) {
  let totalInner = 0;
  for (const w of widths) {
    totalInner = totalInner + w + 2;
  }
  totalInner = totalInner + widths.length - 1;

  // One cell is consumed by the leading space.
  const slack = totalInner - displayWidth(content) - 1;
  const filler = slack > 0 ? " ".repeat(slack) : "";
  const bar = dim("\u2502");
  return `${bar} ${content}${filler}${bar}`;
}
|
|
1900
|
+
/**
 * Collect per-column values and best/worst extremes across providers.
 *
 * For each column the result holds `values` (providerId → value, `undefined`
 * for providers whose runs all errored) plus `best` and `worst` computed over
 * the providers that did not all error. Lower is better for "latency",
 * "cost" and the derived "tokens" column; higher is better for every other
 * scorer. Columns are inserted in the order: latency, cost, tokens, then the
 * remaining scorer names.
 *
 * @param {Array<object>} providerData - Entries with `providerId`, `allErrors`,
 *   `latencyMs`, `avgDetails` ({ costUsd, totalTokens }) and `avgScores`.
 * @param {string[]} scorerNames - Names of the scorers present in the results.
 * @returns {Map<string, { values: Map<string, number|undefined>, best: number|undefined, worst: number|undefined }>}
 */
function computeColumnStats(providerData, scorerNames) {
  const stats = /* @__PURE__ */ new Map();
  const valid = providerData.filter((p) => !p.allErrors);

  // Build one column entry from a per-provider accessor.
  const summarize = (pick, lowerIsBetter) => {
    const values = /* @__PURE__ */ new Map();
    for (const p of providerData) {
      values.set(p.providerId, p.allErrors ? void 0 : pick(p));
    }
    const nums = valid.map((p) => pick(p)).filter((v) => v !== void 0);
    if (nums.length === 0) {
      return { values, best: void 0, worst: void 0 };
    }
    const lowest = Math.min(...nums);
    const highest = Math.max(...nums);
    return lowerIsBetter
      ? { values, best: lowest, worst: highest }
      : { values, best: highest, worst: lowest };
  };

  if (scorerNames.includes("latency")) {
    stats.set("latency", summarize((p) => p.latencyMs, true));
  }
  if (scorerNames.includes("cost")) {
    stats.set("cost", summarize((p) => p.avgDetails.costUsd, true));
    stats.set("tokens", summarize((p) => p.avgDetails.totalTokens, true));
  }
  for (const name of scorerNames) {
    if (name !== "latency" && name !== "cost") {
      stats.set(name, summarize((p) => p.avgScores[name], false));
    }
  }
  return stats;
}
|
|
1950
|
+
/**
 * Color a cell according to how its value ranks within the column.
 *
 * An undefined value renders as a dimmed em dash. The text is returned
 * uncolored when fewer than two providers are compared, when the column has
 * no best/worst, or when best equals worst. Otherwise the best value is
 * bright green and bold, the worst is red, and everything else is yellow.
 *
 * @param {string} text - Already formatted cell text.
 * @param {number|undefined} value - Numeric value behind the text.
 * @param {{ best?: number, worst?: number }} colStats - Column extremes.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} The (possibly) colored text.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === void 0) {
    return dim("\u2014");
  }

  const unranked =
    providerCount < 2 ||
    colStats.best === void 0 ||
    colStats.worst === void 0 ||
    colStats.best === colStats.worst;
  if (unranked) {
    return text;
  }

  if (value === colStats.best) {
    return `${brightGreen}${boldCode}${text}${reset}`;
  }
  const tint = value === colStats.worst ? red : yellow;
  return `${tint}${text}${reset}`;
}
|
|
1686
|
-
function
|
|
1959
|
+
/**
 * Award medals to providers based on how many columns each one wins.
 *
 * A provider wins a column when its value equals the column's `best`.
 * Providers are ordered by win count (descending), ties broken by id via
 * `localeCompare`. Providers with equal win counts share a medal, and the
 * next distinct count takes the medal at its position in that order (so two
 * golds are followed by bronze). Positions beyond third get an empty string.
 * With fewer than two providers, or when nobody wins anything, every
 * provider maps to an empty string.
 *
 * @param {Map<string, { values: Map<string, number|undefined>, best: number|undefined }>} columnStats
 * @param {string[]} providerIds - Providers taking part in the comparison.
 * @returns {Map<string, string>} providerId → medal emoji or "".
 */
function computeMedals(columnStats, providerIds) {
  const noMedals = () => new Map(providerIds.map((id) => [id, ""]));
  if (providerIds.length < 2) {
    return noMedals();
  }

  const tally = new Map(providerIds.map((id) => [id, 0]));
  for (const entry of columnStats) {
    const colStats = entry[1];
    if (colStats.best === void 0) {
      continue;
    }
    for (const [providerId, value] of colStats.values) {
      const isWinner = value !== void 0 && value === colStats.best;
      if (isWinner) {
        tally.set(providerId, (tally.get(providerId) ?? 0) + 1);
      }
    }
  }

  let totalWins = 0;
  for (const count of tally.values()) {
    totalWins += count;
  }
  if (totalWins === 0) {
    return noMedals();
  }

  const ranked = [...tally.entries()].sort(
    (a, b) => b[1] - a[1] || a[0].localeCompare(b[0])
  );
  const medalList = ["\u{1F947}", "\u{1F948}", "\u{1F949}"];
  const medals = /* @__PURE__ */ new Map();
  let rank = 0;
  ranked.forEach(([providerId, count], i) => {
    // Only move to a new rank when the win count actually drops.
    if (i > 0 && count < ranked[i - 1][1]) {
      rank = i;
    }
    medals.set(providerId, medalList[rank] ?? "");
  });
  return medals;
}
|
|
1993
|
+
function consoleReporter(results, options) {
|
|
1994
|
+
const showSparklines = options?.sparklines ?? true;
|
|
1687
1995
|
if (results.length === 0) {
|
|
1688
1996
|
console.log("\nNo results to display.\n");
|
|
1689
1997
|
return;
|
|
@@ -1693,78 +2001,155 @@ function consoleReporter(results) {
|
|
|
1693
2001
|
const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
|
|
1694
2002
|
const hasCost = scorerNames.includes("cost");
|
|
1695
2003
|
const hasErrors = results.some((r) => r.error);
|
|
2004
|
+
const multi = providers.length >= 2;
|
|
1696
2005
|
const runsPerCell = Math.max(...results.map((r) => r.run));
|
|
1697
|
-
const runLabel = runsPerCell > 1 ? `
|
|
2006
|
+
const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
|
|
1698
2007
|
console.log("");
|
|
1699
|
-
console.log(` ${
|
|
1700
|
-
console.log(` ${dim("\
|
|
2008
|
+
console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
|
|
2009
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
1701
2010
|
console.log("");
|
|
1702
2011
|
for (const task of tasks) {
|
|
1703
2012
|
console.log(` ${bold(`Task: ${task}`)}`);
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
else if (name === "cost") {
|
|
1708
|
-
cols.push({ label: "Cost", width: 12, align: "right" });
|
|
1709
|
-
cols.push({ label: "Tokens", width: 9, align: "right" });
|
|
1710
|
-
} else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
|
|
1711
|
-
else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
|
|
1712
|
-
else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
|
|
1713
|
-
else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
|
|
1714
|
-
else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
|
|
1715
|
-
else cols.push({ label: name, width: 10, align: "right" });
|
|
1716
|
-
}
|
|
1717
|
-
if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
|
|
1718
|
-
const totalWidth = cols.reduce((sum, c) => sum + c.width + 2, 0);
|
|
1719
|
-
console.log(` ${dim(cols.map((c) => pad(c.label, c.width + 2, c.align)).join(""))}`);
|
|
1720
|
-
console.log(` ${dim("\u2500".repeat(totalWidth))}`);
|
|
1721
|
-
for (const provider of providers) {
|
|
1722
|
-
const taskResults = results.filter(
|
|
1723
|
-
(r) => r.taskName === task && r.providerId === provider
|
|
1724
|
-
);
|
|
2013
|
+
console.log("");
|
|
2014
|
+
const providerData = providers.map((providerId) => {
|
|
2015
|
+
const taskResults = results.filter((r) => r.taskName === task && r.providerId === providerId);
|
|
1725
2016
|
const errorResults2 = taskResults.filter((r) => r.error);
|
|
1726
2017
|
const successResults = taskResults.filter((r) => !r.error);
|
|
1727
|
-
if (successResults.length === 0
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
2018
|
+
if (successResults.length === 0) {
|
|
2019
|
+
return {
|
|
2020
|
+
providerId,
|
|
2021
|
+
avgScores: {},
|
|
2022
|
+
avgDetails: { costUsd: void 0, totalTokens: void 0 },
|
|
2023
|
+
latencyMs: void 0,
|
|
2024
|
+
allErrors: errorResults2.length > 0,
|
|
2025
|
+
errorCount: errorResults2.length
|
|
2026
|
+
};
|
|
2027
|
+
}
|
|
2028
|
+
return {
|
|
2029
|
+
providerId,
|
|
2030
|
+
avgScores: averageScores(successResults),
|
|
2031
|
+
avgDetails: averageDetails(successResults),
|
|
2032
|
+
latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
|
|
2033
|
+
allErrors: false,
|
|
2034
|
+
errorCount: errorResults2.length
|
|
2035
|
+
};
|
|
2036
|
+
});
|
|
2037
|
+
const columnStats = computeColumnStats(providerData, scorerNames);
|
|
2038
|
+
const medals = computeMedals(columnStats, providers);
|
|
2039
|
+
const maxProviderLen = Math.max(...providers.map((id) => id.length));
|
|
2040
|
+
const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
|
|
2041
|
+
const cols = [
|
|
2042
|
+
{ label: "Provider", width: providerWidth, align: "left" }
|
|
2043
|
+
];
|
|
2044
|
+
for (const name of scorerNames) {
|
|
2045
|
+
if (name === "latency") {
|
|
2046
|
+
cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
|
|
2047
|
+
} else if (name === "cost") {
|
|
2048
|
+
cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
|
|
2049
|
+
cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
|
|
2050
|
+
} else {
|
|
2051
|
+
const label = name === "correctness" ? "Match" : name === "schema-correctness" ? "Schema" : name === "fuzzy-similarity" ? "Fuzzy" : name === "llm-judge-correctness" ? "Judge" : name === "tool-usage" ? "Tool" : name;
|
|
2052
|
+
cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
1738
2053
|
}
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
2054
|
+
}
|
|
2055
|
+
if (hasErrors) {
|
|
2056
|
+
cols.push({ label: "Status", width: 8, align: "left" });
|
|
2057
|
+
}
|
|
2058
|
+
const widths = cols.map((c) => c.width);
|
|
2059
|
+
const aligns = cols.map((c) => c.align);
|
|
2060
|
+
console.log(` ${drawTableLine(widths, "top")}`);
|
|
2061
|
+
const headerCells = cols.map((c) => bold(c.label));
|
|
2062
|
+
console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
|
|
2063
|
+
console.log(` ${drawTableLine(widths, "header")}`);
|
|
2064
|
+
for (const pd of providerData) {
|
|
2065
|
+
const medal = medals.get(pd.providerId) ?? "";
|
|
2066
|
+
const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
|
|
2067
|
+
const cells = [providerCell];
|
|
2068
|
+
if (pd.allErrors) {
|
|
2069
|
+
for (const col of cols.slice(1)) {
|
|
2070
|
+
if (col.label === "Status") {
|
|
2071
|
+
cells.push(`${red}FAIL${reset}`);
|
|
2072
|
+
} else {
|
|
2073
|
+
cells.push(dim("\u2014"));
|
|
2074
|
+
}
|
|
2075
|
+
}
|
|
2076
|
+
} else {
|
|
2077
|
+
for (const col of cols.slice(1)) {
|
|
2078
|
+
if (col.label === "Status") {
|
|
2079
|
+
cells.push(
|
|
2080
|
+
pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
|
|
2081
|
+
);
|
|
2082
|
+
continue;
|
|
2083
|
+
}
|
|
2084
|
+
const statsKey = col.statsKey;
|
|
2085
|
+
const colStats = columnStats.get(statsKey);
|
|
2086
|
+
if (statsKey === "latency") {
|
|
2087
|
+
const ms = pd.latencyMs;
|
|
2088
|
+
if (ms === void 0) {
|
|
2089
|
+
cells.push(dim("\u2014"));
|
|
2090
|
+
} else {
|
|
2091
|
+
const text = `${Math.round(ms)}ms`;
|
|
2092
|
+
cells.push(colStats ? colorByRank(text, ms, colStats, providers.length) : text);
|
|
2093
|
+
}
|
|
2094
|
+
} else if (statsKey === "cost") {
|
|
2095
|
+
const cost = pd.avgDetails.costUsd;
|
|
2096
|
+
if (cost === void 0) {
|
|
2097
|
+
cells.push(dim("\u2014"));
|
|
2098
|
+
} else {
|
|
2099
|
+
const text = formatCost(cost);
|
|
2100
|
+
cells.push(colStats ? colorByRank(text, cost, colStats, providers.length) : text);
|
|
2101
|
+
}
|
|
2102
|
+
} else if (statsKey === "tokens") {
|
|
2103
|
+
const tokens = pd.avgDetails.totalTokens;
|
|
2104
|
+
if (tokens === void 0) {
|
|
2105
|
+
cells.push(dim("\u2014"));
|
|
2106
|
+
} else {
|
|
2107
|
+
const text = `${tokens}`;
|
|
2108
|
+
cells.push(colStats ? colorByRank(text, tokens, colStats, providers.length) : text);
|
|
2109
|
+
}
|
|
2110
|
+
} else {
|
|
2111
|
+
const val = pd.avgScores[statsKey];
|
|
2112
|
+
if (val === void 0) {
|
|
2113
|
+
cells.push(dim("\u2014"));
|
|
2114
|
+
} else {
|
|
2115
|
+
const pctStr = `${Math.round(val * 100)}%`.padStart(4);
|
|
2116
|
+
let coloredPct;
|
|
2117
|
+
if (multi && colStats) {
|
|
2118
|
+
coloredPct = colorByRank(pctStr, val, colStats, providers.length);
|
|
2119
|
+
} else {
|
|
2120
|
+
if (val >= 0.8) coloredPct = `${green}${pctStr}${reset}`;
|
|
2121
|
+
else if (val >= 0.5) coloredPct = `${yellow}${pctStr}${reset}`;
|
|
2122
|
+
else coloredPct = `${red}${pctStr}${reset}`;
|
|
2123
|
+
}
|
|
2124
|
+
if (showSparklines) {
|
|
2125
|
+
const { fill, track } = sparkBar(val);
|
|
2126
|
+
const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
|
|
2127
|
+
cells.push(`${coloredPct} ${barColor}${fill}${reset}${dim(track)}`);
|
|
2128
|
+
} else {
|
|
2129
|
+
cells.push(coloredPct);
|
|
2130
|
+
}
|
|
2131
|
+
}
|
|
2132
|
+
}
|
|
1753
2133
|
}
|
|
1754
2134
|
}
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
2135
|
+
console.log(` ${drawTableRow(cells, widths, aligns)}`);
|
|
2136
|
+
}
|
|
2137
|
+
if (multi && providerData.some((p) => !p.allErrors)) {
|
|
2138
|
+
const winnerId = [...medals.entries()].find(([, m]) => m === "\u{1F947}")?.[0];
|
|
2139
|
+
if (winnerId) {
|
|
2140
|
+
console.log(` ${drawTableLine(widths, "merge")}`);
|
|
2141
|
+
const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
|
|
2142
|
+
console.log(` ${drawSpanRow(winnerText, widths)}`);
|
|
1758
2143
|
}
|
|
1759
|
-
console.log(` ${cells.join("")}`);
|
|
1760
2144
|
}
|
|
2145
|
+
console.log(` ${drawTableLine(widths, "bottom")}`);
|
|
1761
2146
|
console.log("");
|
|
1762
2147
|
}
|
|
1763
2148
|
printSummary(results, providers);
|
|
1764
2149
|
const errorResults = results.filter((r) => r.error);
|
|
1765
2150
|
if (errorResults.length > 0) {
|
|
1766
2151
|
console.log(` ${bold("Errors")}`);
|
|
1767
|
-
console.log(` ${dim("\
|
|
2152
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
1768
2153
|
const seen = /* @__PURE__ */ new Set();
|
|
1769
2154
|
for (const r of errorResults) {
|
|
1770
2155
|
const key = `${r.providerId}::${r.error}`;
|
|
@@ -1772,7 +2157,7 @@ function consoleReporter(results) {
|
|
|
1772
2157
|
seen.add(key);
|
|
1773
2158
|
const count = errorResults.filter((e) => e.providerId === r.providerId && e.error === r.error).length;
|
|
1774
2159
|
const suffix = count > 1 ? ` (\xD7${count})` : "";
|
|
1775
|
-
console.log(` ${red}\
|
|
2160
|
+
console.log(` ${red}\u2716${reset} ${r.providerId}: ${r.error}${suffix}`);
|
|
1776
2161
|
const hint = apiKeyHint(r.providerId, r.error ?? "");
|
|
1777
2162
|
if (hint) console.log(` ${dim(hint)}`);
|
|
1778
2163
|
}
|
|
@@ -1786,15 +2171,20 @@ function consoleReporter(results) {
|
|
|
1786
2171
|
function printSummary(results, providers) {
|
|
1787
2172
|
const successResults = results.filter((r) => !r.error);
|
|
1788
2173
|
if (successResults.length === 0) return;
|
|
1789
|
-
console.log(` ${dim("\u2500".repeat(70))}`);
|
|
1790
2174
|
console.log(` ${bold("Summary")}`);
|
|
2175
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
1791
2176
|
console.log("");
|
|
1792
2177
|
const single = providers.length === 1;
|
|
1793
2178
|
const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
|
|
1794
2179
|
const byCorrectness = rankProviders(successResults, providers, correctnessKey);
|
|
1795
2180
|
if (byCorrectness) {
|
|
1796
|
-
const
|
|
1797
|
-
|
|
2181
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2182
|
+
const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
2183
|
+
if (single) {
|
|
2184
|
+
console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
|
|
2185
|
+
} else {
|
|
2186
|
+
console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
|
|
2187
|
+
}
|
|
1798
2188
|
}
|
|
1799
2189
|
const byLatency = providers.map((id) => {
|
|
1800
2190
|
const runs = successResults.filter((r) => r.providerId === id);
|
|
@@ -1802,8 +2192,13 @@ function printSummary(results, providers) {
|
|
|
1802
2192
|
return { id, avg: avg ?? Infinity };
|
|
1803
2193
|
}).sort((a, b) => a.avg - b.avg)[0];
|
|
1804
2194
|
if (byLatency && byLatency.avg !== Infinity) {
|
|
1805
|
-
const
|
|
1806
|
-
|
|
2195
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2196
|
+
const msStr = `${Math.round(byLatency.avg)}ms`;
|
|
2197
|
+
if (single) {
|
|
2198
|
+
console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2199
|
+
} else {
|
|
2200
|
+
console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2201
|
+
}
|
|
1807
2202
|
}
|
|
1808
2203
|
const byCost = providers.map((id) => {
|
|
1809
2204
|
const runs = successResults.filter((r) => r.providerId === id);
|
|
@@ -1815,8 +2210,32 @@ function printSummary(results, providers) {
|
|
|
1815
2210
|
return { id, avg };
|
|
1816
2211
|
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
1817
2212
|
if (byCost?.avg !== void 0) {
|
|
1818
|
-
const
|
|
1819
|
-
|
|
2213
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2214
|
+
const costStr = formatCost(byCost.avg);
|
|
2215
|
+
if (single) {
|
|
2216
|
+
console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2217
|
+
} else {
|
|
2218
|
+
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2219
|
+
}
|
|
2220
|
+
}
|
|
2221
|
+
if (!single) {
|
|
2222
|
+
const wins = /* @__PURE__ */ new Map();
|
|
2223
|
+
for (const id of providers) wins.set(id, 0);
|
|
2224
|
+
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
2225
|
+
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2226
|
+
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2227
|
+
const maxWins = Math.max(...wins.values());
|
|
2228
|
+
if (maxWins > 0) {
|
|
2229
|
+
const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2230
|
+
console.log("");
|
|
2231
|
+
if (topProviders.length === 1) {
|
|
2232
|
+
const [winnerId, winCount] = topProviders[0];
|
|
2233
|
+
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
|
|
2234
|
+
} else {
|
|
2235
|
+
const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
|
|
2236
|
+
console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
|
|
2237
|
+
}
|
|
2238
|
+
}
|
|
1820
2239
|
}
|
|
1821
2240
|
console.log("");
|
|
1822
2241
|
}
|
|
@@ -1878,14 +2297,6 @@ function formatCost(usd) {
|
|
|
1878
2297
|
const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
|
|
1879
2298
|
return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
|
|
1880
2299
|
}
|
|
1881
|
-
function pad(str, width, align) {
|
|
1882
|
-
if (align === "right") return str.padStart(width);
|
|
1883
|
-
return str.padEnd(width);
|
|
1884
|
-
}
|
|
1885
|
-
function colorLen(str) {
|
|
1886
|
-
const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
|
|
1887
|
-
return str.length - stripped.length;
|
|
1888
|
-
}
|
|
1889
2300
|
function apiKeyHint(providerId, error) {
|
|
1890
2301
|
const lower = error.toLowerCase();
|
|
1891
2302
|
const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
|
|
@@ -1989,7 +2400,7 @@ function defineArena(config) {
|
|
|
1989
2400
|
throw new Error("At least one task is required");
|
|
1990
2401
|
}
|
|
1991
2402
|
const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
|
|
1992
|
-
const scorerFns = resolveScorers(scorerNames, config.judgeModel);
|
|
2403
|
+
const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
|
|
1993
2404
|
const runs = config.runs ?? 1;
|
|
1994
2405
|
return {
|
|
1995
2406
|
config,
|
|
@@ -1999,141 +2410,13 @@ function defineArena(config) {
|
|
|
1999
2410
|
tasks: config.tasks,
|
|
2000
2411
|
scorers: scorerFns,
|
|
2001
2412
|
runs,
|
|
2413
|
+
timeout: config.timeout,
|
|
2002
2414
|
onResult: options?.onResult
|
|
2003
2415
|
});
|
|
2004
2416
|
}
|
|
2005
2417
|
};
|
|
2006
2418
|
}
|
|
2007
2419
|
|
|
2008
|
-
// src/providers/openai.ts
|
|
2009
|
-
var import_openai2 = __toESM(require("openai"), 1);
|
|
2010
|
-
var import_zod_to_json_schema = require("zod-to-json-schema");
|
|
2011
|
-
function openai(model, options) {
|
|
2012
|
-
const client = new import_openai2.default({
|
|
2013
|
-
apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
|
|
2014
|
-
baseURL: options?.baseURL
|
|
2015
|
-
});
|
|
2016
|
-
return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
|
|
2017
|
-
}
|
|
2018
|
-
function openaiCompatible(options) {
|
|
2019
|
-
const apiKey = options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0) ?? "no-key";
|
|
2020
|
-
const client = new import_openai2.default({
|
|
2021
|
-
apiKey,
|
|
2022
|
-
baseURL: options.baseURL
|
|
2023
|
-
});
|
|
2024
|
-
if (options.free) {
|
|
2025
|
-
registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
|
|
2026
|
-
}
|
|
2027
|
-
return makeProvider(options.id, options.name, options.model, client, options.model, options.stripThinking);
|
|
2028
|
-
}
|
|
2029
|
-
function azureOpenai(model, options) {
|
|
2030
|
-
const deployment = options?.deployment ?? model;
|
|
2031
|
-
const client = new import_openai2.AzureOpenAI({
|
|
2032
|
-
apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
|
|
2033
|
-
endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
|
|
2034
|
-
apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
|
|
2035
|
-
deployment
|
|
2036
|
-
});
|
|
2037
|
-
return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
|
|
2038
|
-
}
|
|
2039
|
-
function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
2040
|
-
return {
|
|
2041
|
-
id,
|
|
2042
|
-
name,
|
|
2043
|
-
model,
|
|
2044
|
-
async run(input) {
|
|
2045
|
-
const start = Date.now();
|
|
2046
|
-
const params = {
|
|
2047
|
-
model: requestModel,
|
|
2048
|
-
messages: [{ role: "user", content: input.prompt }]
|
|
2049
|
-
};
|
|
2050
|
-
if (input.schema) {
|
|
2051
|
-
params.response_format = { type: "json_object" };
|
|
2052
|
-
params.messages = [
|
|
2053
|
-
{ role: "system", content: "Respond with valid JSON matching the requested schema." },
|
|
2054
|
-
...params.messages
|
|
2055
|
-
];
|
|
2056
|
-
}
|
|
2057
|
-
if (input.tools?.length) {
|
|
2058
|
-
params.tools = input.tools.map(toolDefToOpenAI);
|
|
2059
|
-
params.tool_choice = "auto";
|
|
2060
|
-
}
|
|
2061
|
-
const response = await client.chat.completions.create(params);
|
|
2062
|
-
let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
|
|
2063
|
-
let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
|
|
2064
|
-
const choice = response.choices[0];
|
|
2065
|
-
const toolCallsRaw = choice?.message?.tool_calls;
|
|
2066
|
-
const collectedToolCalls = [];
|
|
2067
|
-
let finalResponse = response;
|
|
2068
|
-
if (toolCallsRaw?.length && input.tools?.length) {
|
|
2069
|
-
const toolMessages = [
|
|
2070
|
-
...params.messages,
|
|
2071
|
-
choice.message
|
|
2072
|
-
];
|
|
2073
|
-
for (const tc of toolCallsRaw) {
|
|
2074
|
-
const toolDef = input.tools.find((t) => t.name === tc.function.name);
|
|
2075
|
-
let args;
|
|
2076
|
-
try {
|
|
2077
|
-
args = JSON.parse(tc.function.arguments);
|
|
2078
|
-
} catch {
|
|
2079
|
-
args = tc.function.arguments;
|
|
2080
|
-
}
|
|
2081
|
-
let result;
|
|
2082
|
-
if (toolDef?.handler) {
|
|
2083
|
-
result = await toolDef.handler(args);
|
|
2084
|
-
}
|
|
2085
|
-
collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
|
|
2086
|
-
toolMessages.push({
|
|
2087
|
-
role: "tool",
|
|
2088
|
-
tool_call_id: tc.id,
|
|
2089
|
-
content: JSON.stringify(result ?? {})
|
|
2090
|
-
});
|
|
2091
|
-
}
|
|
2092
|
-
const followUp = await client.chat.completions.create({
|
|
2093
|
-
model: requestModel,
|
|
2094
|
-
messages: toolMessages
|
|
2095
|
-
});
|
|
2096
|
-
totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
|
|
2097
|
-
totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
|
|
2098
|
-
finalResponse = followUp;
|
|
2099
|
-
}
|
|
2100
|
-
const latencyMs = Date.now() - start;
|
|
2101
|
-
const finalChoice = finalResponse.choices[0];
|
|
2102
|
-
let rawContent = finalChoice?.message?.content ?? "";
|
|
2103
|
-
if (stripThinking) {
|
|
2104
|
-
rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
|
|
2105
|
-
}
|
|
2106
|
-
let output = rawContent;
|
|
2107
|
-
if (input.schema) {
|
|
2108
|
-
try {
|
|
2109
|
-
output = JSON.parse(rawContent);
|
|
2110
|
-
} catch {
|
|
2111
|
-
}
|
|
2112
|
-
}
|
|
2113
|
-
return {
|
|
2114
|
-
output,
|
|
2115
|
-
usage: {
|
|
2116
|
-
promptTokens: totalPromptTokens || void 0,
|
|
2117
|
-
completionTokens: totalCompletionTokens || void 0
|
|
2118
|
-
},
|
|
2119
|
-
latencyMs,
|
|
2120
|
-
raw: finalResponse,
|
|
2121
|
-
toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
|
|
2122
|
-
};
|
|
2123
|
-
}
|
|
2124
|
-
};
|
|
2125
|
-
}
|
|
2126
|
-
function toolDefToOpenAI(tool) {
|
|
2127
|
-
return {
|
|
2128
|
-
type: "function",
|
|
2129
|
-
function: {
|
|
2130
|
-
name: tool.name,
|
|
2131
|
-
description: tool.description,
|
|
2132
|
-
parameters: (0, import_zod_to_json_schema.zodToJsonSchema)(tool.parameters, { target: "openAi" })
|
|
2133
|
-
}
|
|
2134
|
-
};
|
|
2135
|
-
}
|
|
2136
|
-
|
|
2137
2420
|
// src/providers/anthropic.ts
|
|
2138
2421
|
var import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
|
|
2139
2422
|
function anthropic(model, options) {
|
|
@@ -2153,7 +2436,7 @@ function anthropic(model, options) {
|
|
|
2153
2436
|
max_tokens: maxTokens,
|
|
2154
2437
|
system: systemMessage,
|
|
2155
2438
|
messages: [{ role: "user", content: input.prompt }]
|
|
2156
|
-
});
|
|
2439
|
+
}, { signal: input.signal });
|
|
2157
2440
|
const latencyMs = Date.now() - start;
|
|
2158
2441
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2159
2442
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
@@ -2178,7 +2461,7 @@ function anthropic(model, options) {
|
|
|
2178
2461
|
}
|
|
2179
2462
|
|
|
2180
2463
|
// src/providers/gemini.ts
|
|
2181
|
-
var
|
|
2464
|
+
var import_openai4 = __toESM(require("openai"), 1);
|
|
2182
2465
|
function gemini(model, options) {
|
|
2183
2466
|
const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
|
|
2184
2467
|
if (!apiKey) {
|
|
@@ -2186,22 +2469,409 @@ function gemini(model, options) {
|
|
|
2186
2469
|
`Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
|
|
2187
2470
|
);
|
|
2188
2471
|
}
|
|
2189
|
-
const client = new
|
|
2472
|
+
const client = new import_openai4.default({
|
|
2190
2473
|
apiKey,
|
|
2191
|
-
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
|
|
2474
|
+
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
|
|
2475
|
+
timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
|
|
2192
2476
|
});
|
|
2193
2477
|
return makeProvider(`google/${model}`, "Google AI", model, client, model);
|
|
2194
2478
|
}
|
|
2479
|
+
|
|
2480
|
+
// src/reporter/markdown.ts
|
|
2481
|
+
var COMMENT_MARKER = "<!-- duelist-ci-report -->";
|
|
2482
|
+
/**
 * Render a CI comparison report as a Markdown document suitable for a PR comment.
 *
 * The output starts with COMMENT_MARKER, followed by a pass/fail heading and
 * then, when present, the comparison table, the cost summary, the list of
 * flaky results and the list of failure reasons.
 *
 * @param {object} report - Object with `failed`, `comparisons`, `cost`,
 *   `flakyResults` and `failureReasons` (the shape returned by compareResults).
 * @param {*} _current - Unused; kept so the call signature stays stable.
 * @returns {string} Markdown text, lines joined with "\n".
 */
function markdownReporter(report, _current) {
  const verdict = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${verdict}`, ""];

  if (report.comparisons.length > 0) {
    out.push(markdownComparisonTable(report.comparisons), "");
  }

  // The cost section is shown when money was spent or a budget was configured.
  if (report.cost.totalUsd > 0 || report.cost.budget !== void 0) {
    out.push(markdownCostSummary(report.cost), "");
  }

  if (report.flakyResults.length > 0) {
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      ""
    );
    for (const flaky of report.flakyResults) {
      out.push(`- **${flaky.providerId}** \xD7 ${flaky.taskName} \u2192 ${flaky.scorerName} (CV = ${flaky.current.cv.toFixed(2)})`);
    }
    out.push("");
  }

  if (report.failureReasons.length > 0) {
    out.push("### Failure Reasons", "");
    for (const reason of report.failureReasons) {
      out.push(`- ${reason}`);
    }
    out.push("");
  }

  out.push("---", "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
  return out.join("\n");
}
|
|
2517
|
+
/**
 * Build the Markdown table that lists every provider/task/scorer comparison.
 *
 * Provider, task and scorer names are user-supplied free text, so they are
 * escaped before being placed in a cell: a literal `|` would otherwise start
 * a new column and a newline would end the row, corrupting the whole table.
 *
 * @param {Array<object>} comparisons - Entries with `providerId`, `taskName`,
 *   `scorerName`, `baseline` (stats or null), `current` (stats) and `delta`
 *   (number or null).
 * @returns {string} Header, separator and one row per comparison, joined with "\n".
 */
function markdownComparisonTable(comparisons) {
  // Keep a free-text value inside a single table cell.
  const escapeCell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");

  const lines = [
    "| Provider | Task | Scorer | Baseline | Current | Delta | Status |",
    "|----------|------|--------|----------|---------|-------|--------|"
  ];
  for (const c of comparisons) {
    const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
    const currentStr = formatStats(c.current);
    const deltaStr = c.delta !== null ? formatDelta(c.delta) : "\u2014";
    const status = statusIndicator(c);
    lines.push(
      `| ${escapeCell(c.providerId)} | ${escapeCell(c.taskName)} | ${escapeCell(c.scorerName)} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`
    );
  }
  return lines.join("\n");
}
|
|
2530
|
+
/**
 * Render the "Cost Summary" section of the Markdown report.
 *
 * @param {object} cost - Object with `totalUsd` (number), optional `budget`
 *   (number), `overBudget` (boolean) and `perProvider` (Map of provider id to USD).
 * @returns {string} Markdown text, lines joined with "\n". The per-provider
 *   table is only included when more than one provider has a cost entry.
 */
function markdownCostSummary(cost) {
  const lines = ["### \u{1F4B0} Cost Summary", "", `**Total:** $${cost.totalUsd.toFixed(4)}`];

  if (cost.budget !== void 0) {
    let pct;
    if (cost.budget > 0) {
      pct = (cost.totalUsd / cost.budget * 100).toFixed(0);
    } else {
      // A non-positive budget cannot be divided by. Only report "infinite"
      // usage when something was actually spent; a $0 budget with $0 spend
      // (e.g. free local models) is 0% used, not infinity next to a green dot.
      pct = cost.totalUsd > 0 ? "\u221E" : "0";
    }
    const indicator = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    lines.push(`**Budget:** $${cost.budget.toFixed(2)} (${pct}% used) ${indicator}`);
  }

  if (cost.perProvider.size > 1) {
    lines.push("", "| Provider | Cost |", "|----------|------|");
    for (const [provider, usd] of cost.perProvider) {
      lines.push(`| ${provider} | $${usd.toFixed(4)} |`);
    }
  }
  return lines.join("\n");
}
|
|
2550
|
+
/**
 * Format a stats record as "mean" or, when it has more than one sample,
 * "mean ± half-width of the 95% interval". Both numbers use three decimals.
 *
 * @param {{mean: number, n: number, ci95Lower: number, ci95Upper: number}} stats
 * @returns {string}
 */
function formatStats(stats) {
  const meanText = stats.mean.toFixed(3);
  if (!(stats.n > 1)) {
    return meanText;
  }
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${meanText} \xB1 ${halfWidth.toFixed(3)}`;
}
|
|
2557
|
+
/**
 * Format a score delta with three decimals and an explicit "+" for
 * non-negative values (negative values keep the "-" that toFixed emits).
 *
 * @param {number} delta
 * @returns {string}
 */
function formatDelta(delta) {
  const digits = delta.toFixed(3);
  return delta >= 0 ? `+${digits}` : digits;
}
|
|
2561
|
+
/**
 * Pick the status label for one comparison row.
 * Precedence: regressed, then improved, then "new" (no baseline), else unchanged.
 *
 * @param {{regressed: boolean, improved: boolean, baseline: (object|null)}} c
 * @returns {string}
 */
function statusIndicator(c) {
  return c.regressed
    ? "\u{1F534} regressed"
    : c.improved
      ? "\u{1F7E2} improved"
      : c.baseline === null
        ? "\u{1F195} new"
        : "\u26AA unchanged";
}
|
|
2567
|
+
|
|
2568
|
+
// src/ci.ts
|
|
2569
|
+
var import_node_fs = require("fs");
|
|
2570
|
+
var import_node_path = require("path");
|
|
2571
|
+
var LOWER_IS_BETTER = /* @__PURE__ */ new Set(["cost"]);
|
|
2572
|
+
var FLAKY_CV_THRESHOLD = 0.3;
|
|
2573
|
+
// Two-sided 95% critical values of Student's t distribution, keyed by degrees
// of freedom. Entries above 30 keep the interval from collapsing straight to
// the normal approximation as soon as a run has more than 31 samples.
var T_CRITICAL_95 = {
  1: 12.706,
  2: 4.303,
  3: 3.182,
  4: 2.776,
  5: 2.571,
  6: 2.447,
  7: 2.365,
  8: 2.306,
  9: 2.262,
  10: 2.228,
  15: 2.131,
  20: 2.086,
  25: 2.06,
  30: 2.042,
  40: 2.021,
  60: 2,
  120: 1.98
};
// Tabulated degrees of freedom in ascending order, computed once instead of on every call.
var T_CRITICAL_95_DFS = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
/**
 * Look up the two-sided 95% t critical value for `df` degrees of freedom.
 *
 * Tabulated values are returned as-is, values between two table entries are
 * linearly interpolated, and anything non-positive or beyond the last table
 * entry falls back to the normal-distribution value 1.96.
 *
 * @param {number} df - Degrees of freedom (sample count minus one).
 * @returns {number} Critical value used to scale the standard error.
 */
function tCritical(df) {
  const NORMAL_95 = 1.96;
  if (df <= 0) return NORMAL_95;
  if (T_CRITICAL_95[df] !== void 0) return T_CRITICAL_95[df];
  const keys = T_CRITICAL_95_DFS;
  if (df > keys[keys.length - 1]) return NORMAL_95;
  for (let i = 0; i < keys.length - 1; i++) {
    const low = keys[i];
    const high = keys[i + 1];
    if (df > low && df < high) {
      const ratio = (df - low) / (high - low);
      return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
    }
  }
  // Reached for inputs that match no bracket (e.g. NaN or 0 < df < 1).
  return NORMAL_95;
}
|
|
2603
|
+
/**
 * Summarize a list of numeric samples.
 *
 * Returns all zeros for an empty list and a zero-width interval for a single
 * sample. Otherwise it uses the sample standard deviation (n - 1 divisor),
 * the coefficient of variation relative to |mean| (0 when the mean is 0),
 * and a 95% interval of mean ± tCritical(n - 1) × standard error.
 *
 * @param {number[]} samples
 * @returns {{mean: number, stddev: number, cv: number, n: number, ci95Lower: number, ci95Upper: number}}
 */
function computeScorerStats(samples) {
  const n = samples.length;
  if (n === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  let total = 0;
  for (let i = 0; i < n; i++) {
    total += samples[i];
  }
  const mean = total / n;
  if (n === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  let squaredDeviations = 0;
  for (let i = 0; i < n; i++) {
    squaredDeviations += (samples[i] - mean) ** 2;
  }
  const stddev = Math.sqrt(squaredDeviations / (n - 1));
  const cv = mean !== 0 ? stddev / Math.abs(mean) : 0;
  const standardError = stddev / Math.sqrt(n);
  const halfWidth = tCritical(n - 1) * standardError;
  return {
    mean,
    stddev,
    cv,
    n,
    ci95Lower: mean - halfWidth,
    ci95Upper: mean + halfWidth
  };
}
|
|
2626
|
+
/**
 * Build the composite map key "provider::task::scorer" used to group samples.
 *
 * @param {string} providerId
 * @param {string} taskName
 * @param {string} scorerName
 * @returns {string}
 */
function groupKey(providerId, taskName, scorerName) {
  const separator = "::";
  return "".concat(providerId, separator, taskName, separator, scorerName);
}
|
|
2629
|
+
/**
 * Group score values by provider/task/scorer and compute stats per group.
 *
 * Results carrying an `error` are skipped, as are scores with a negative
 * value (NOTE(review): negative values look like a "not available" sentinel —
 * confirm against the scorers).
 *
 * @param {Array<object>} results - Entries with `providerId`, `taskName`,
 *   `scores` (array of `{name, value}`) and optionally `error`.
 * @returns {Map<string, object>} Map from group key to the stats record
 *   produced by computeScorerStats.
 */
function computeStats(results) {
  const samplesByKey = /* @__PURE__ */ new Map();
  for (const result of results) {
    if (result.error) continue;
    for (const score of result.scores) {
      if (score.value < 0) continue;
      const key = groupKey(result.providerId, result.taskName, score.name);
      const bucket = samplesByKey.get(key);
      if (bucket === void 0) {
        samplesByKey.set(key, [score.value]);
      } else {
        bucket.push(score.value);
      }
    }
  }

  const statsByKey = /* @__PURE__ */ new Map();
  samplesByKey.forEach((samples, key) => {
    statsByKey.set(key, computeScorerStats(samples));
  });
  return statsByKey;
}
|
|
2646
|
+
/**
 * Add up the estimated USD cost of all successful results.
 *
 * A result contributes only when it has no `error`, has a "cost" score with
 * a non-negative value, and that score's `details.estimatedUsd` is positive.
 *
 * @param {Array<object>} results - Benchmark results with `providerId`, `scores`, optional `error`.
 * @param {number} [budget] - Optional spending limit in USD.
 * @returns {{totalUsd: number, perProvider: Map<string, number>, budget: (number|undefined), overBudget: boolean}}
 */
function computeCostSummary(results, budget) {
  const perProvider = /* @__PURE__ */ new Map();
  let totalUsd = 0;

  for (const run of results) {
    if (run.error) continue;
    const costScore = run.scores.find((score) => score.name === "cost");
    const usable = Boolean(costScore) && !(costScore.value < 0);
    const usd = usable ? (costScore.details?.estimatedUsd ?? 0) : 0;
    if (usd <= 0) continue;
    totalUsd += usd;
    const soFar = perProvider.get(run.providerId) ?? 0;
    perProvider.set(run.providerId, soFar + usd);
  }

  const overBudget = budget !== void 0 && totalUsd > budget;
  return { totalUsd, perProvider, budget, overBudget };
}
|
|
2666
|
+
/**
 * Split a "provider::task::scorer" group key back into its three parts.
 *
 * The provider is taken up to the FIRST "::" and the scorer after the LAST
 * "::", so a task name that itself contains "::" (e.g. "explain std::vector")
 * survives intact instead of leaking into the scorer name.
 * Assumes provider ids and scorer names do not contain "::" — TODO confirm.
 * Keys without two distinct separators fall back to a plain split.
 */
function splitGroupKey(key) {
  const first = key.indexOf("::");
  const last = key.lastIndexOf("::");
  if (first === -1 || last - first < 2) {
    return key.split("::");
  }
  return [key.slice(0, first), key.slice(first + 2, last), key.slice(last + 2)];
}
/**
 * Compare current per-group stats against a baseline and build the CI report.
 *
 * For every group in `currentStats` the delta of the means is computed when a
 * baseline entry exists; regression/improvement are only evaluated for scorers
 * that have a threshold. A group is flaky when it has more than one sample and
 * its coefficient of variation exceeds FLAKY_CV_THRESHOLD. The report fails
 * when any group regressed or the total cost exceeds the budget.
 *
 * @param {Map<string, object>|null|undefined} baselineStats - Stats from the baseline run, if any.
 * @param {Map<string, object>} currentStats - Stats from the current run.
 * @param {Map<string, number>} thresholds - Allowed change per scorer name.
 * @param {number} [budget] - Optional spending limit in USD.
 * @param {Array<object>} [currentResults] - Raw results used for the cost summary.
 * @returns {{comparisons: Array<object>, cost: object, failed: boolean, flakyResults: Array<object>, failureReasons: string[]}}
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  const comparisons = [];
  const failureReasons = [];
  for (const [key, current] of currentStats) {
    const [providerId, taskName, scorerName] = splitGroupKey(key);
    const baseline = baselineStats?.get(key) ?? null;
    let delta = null;
    let regressed = false;
    let improved = false;
    if (baseline) {
      delta = current.mean - baseline.mean;
      const threshold = thresholds.get(scorerName);
      if (threshold !== void 0) {
        const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
        regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
        improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
      }
    }
    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
    comparisons.push({
      providerId,
      taskName,
      scorerName,
      baseline,
      current,
      delta,
      regressed,
      improved,
      flaky
    });
  }
  const cost = computeCostSummary(currentResults ?? [], budget);
  for (const r of comparisons) {
    if (!r.regressed) continue;
    failureReasons.push(
      `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta2(r.delta)}`
    );
  }
  if (cost.overBudget) {
    // overBudget is only true when a budget was supplied, so toFixed is safe here.
    failureReasons.push(
      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
    );
  }
  const flakyResults = comparisons.filter((c) => c.flaky);
  const failed = failureReasons.length > 0;
  return { comparisons, cost, failed, flakyResults, failureReasons };
}
|
|
2715
|
+
/**
 * Decide whether `current` regressed relative to `baseline`.
 *
 * - Single sample on both sides: compares the raw difference of the means
 *   against `threshold` in the "worse" direction.
 * - Lower-is-better with intervals: current's lower bound must exceed
 *   baseline's upper bound by more than `threshold`.
 * - Higher-is-better with intervals: baseline's upper bound must exceed
 *   current's lower bound by more than `threshold`, and the mean must have dropped.
 *
 * NOTE(review): the higher-is-better interval check is looser than the
 * lower-is-better one (which needs fully separated intervals) — confirm intended.
 *
 * @param {{mean: number, n: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{mean: number, n: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold
 * @param {boolean} lowerIsBetter
 * @returns {boolean}
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  const singleSamples = baseline.n === 1 && current.n === 1;
  if (singleSamples) {
    const change = current.mean - baseline.mean;
    return lowerIsBetter ? change > threshold : change < -threshold;
  }
  if (lowerIsBetter) {
    const gap = current.ci95Lower - baseline.ci95Upper;
    return gap > threshold;
  }
  const worstCaseDrop = baseline.ci95Upper - current.ci95Lower;
  return worstCaseDrop > threshold && current.mean < baseline.mean;
}
|
|
2726
|
+
/**
 * Decide whether `current` improved relative to `baseline`.
 *
 * With a single sample on both sides the raw difference of the means is
 * compared against `threshold` in the "better" direction. Otherwise the two
 * 95% intervals must be separated by more than `threshold` in that direction.
 *
 * @param {{mean: number, n: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{mean: number, n: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold
 * @param {boolean} lowerIsBetter
 * @returns {boolean}
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  const singleSamples = baseline.n === 1 && current.n === 1;
  if (singleSamples) {
    const change = current.mean - baseline.mean;
    return lowerIsBetter ? change < -threshold : change > threshold;
  }
  const separation = lowerIsBetter
    ? baseline.ci95Lower - current.ci95Upper
    : current.ci95Lower - baseline.ci95Upper;
  return separation > threshold;
}
|
|
2737
|
+
/**
 * Format a delta with four decimals and an explicit "+" for non-negative
 * values; used in failure-reason messages.
 *
 * @param {number} delta
 * @returns {string}
 */
function formatDelta2(delta) {
  const digits = delta.toFixed(4);
  return delta >= 0 ? `+${digits}` : digits;
}
|
|
2741
|
+
/**
 * Read a baseline file from disk.
 *
 * Accepts either `{ timestamp, results: [...] }` or a bare JSON array of
 * results. Any failure (missing file, invalid JSON, non-array payload)
 * yields `null` rather than throwing.
 *
 * @param {string} path - Location of the baseline JSON file.
 * @returns {{timestamp: string, results: Array<object>}|null}
 */
function loadBaseline(path) {
  try {
    const parsed = JSON.parse((0, import_node_fs.readFileSync)(path, "utf-8"));
    const results = parsed.results ?? parsed;
    return Array.isArray(results)
      ? { timestamp: parsed.timestamp ?? "unknown", results }
      : null;
  } catch {
    // Best effort: an unreadable or malformed baseline is treated as "no baseline".
    return null;
  }
}
|
|
2755
|
+
/**
 * Write results to `path` as pretty-printed JSON of the form
 * `{ timestamp, results }`, creating the parent directory if needed.
 *
 * @param {string} path - Destination file.
 * @param {Array<object>} results - Benchmark results to persist.
 * @returns {void}
 */
function saveBaseline(path, results) {
  const parentDir = (0, import_node_path.dirname)(path);
  (0, import_node_fs.mkdirSync)(parentDir, { recursive: true });
  const payload = JSON.stringify(
    { timestamp: (/* @__PURE__ */ new Date()).toISOString(), results },
    null,
    2
  );
  (0, import_node_fs.writeFileSync)(path, payload);
}
|
|
2763
|
+
|
|
2764
|
+
// src/github.ts
|
|
2765
|
+
var import_node_fs2 = require("fs");
|
|
2766
|
+
/**
 * Derive the GitHub pull-request context from environment variables.
 *
 * Reads GITHUB_TOKEN and GITHUB_REPOSITORY ("owner/repo"). The PR number is
 * taken from the event payload at GITHUB_EVENT_PATH (`pull_request.number`,
 * or `issue.number` when the issue is a pull request), falling back to
 * DUELIST_PR_NUMBER.
 *
 * @returns {{token: string, owner: string, repo: string, prNumber: number}|null}
 *   `null` when the token, repository or PR number cannot be determined.
 */
function detectGitHubContext() {
  const { GITHUB_TOKEN: token, GITHUB_REPOSITORY: repository, GITHUB_EVENT_PATH: eventPath } = process.env;
  if (!token || !repository) return null;

  const [owner, repo] = repository.split("/");
  if (!owner || !repo) return null;

  // Extract the PR number from the event payload; an unreadable or malformed
  // payload simply yields no number.
  const readPrNumber = (file) => {
    try {
      const event = JSON.parse((0, import_node_fs2.readFileSync)(file, "utf-8"));
      let found;
      const pull = event.pull_request;
      if (pull && typeof pull === "object") {
        found = pull.number;
      }
      const issue = event.issue;
      if (!found && issue && typeof issue === "object" && issue.pull_request) {
        found = issue.number;
      }
      return found;
    } catch {
      return void 0;
    }
  };

  let prNumber = eventPath ? readPrNumber(eventPath) : void 0;
  const override = process.env.DUELIST_PR_NUMBER;
  if (!prNumber && override) {
    prNumber = Number.parseInt(override, 10);
  }
  // Covers undefined, 0 and NaN (a non-numeric override).
  return prNumber ? { token, owner, repo, prNumber } : null;
}
|
|
2796
|
+
var API_BASE = "https://api.github.com";
|
|
2797
|
+
/**
 * Look through the PR's issue comments, 50 per page, for one whose body
 * contains `marker`.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} marker - Text that identifies a previously posted report.
 * @returns {Promise<*>} The `id` of the first matching comment, or `null` when
 *   none matches or a page request returns a non-OK response.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  for (let page = 1; ; page++) {
    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
    const res = await fetch(url, {
      headers: {
        Authorization: `Bearer ${ctx.token}`,
        Accept: "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28"
      }
    });
    if (!res.ok) return null;

    const comments = await res.json();
    if (comments.length === 0) return null;

    const match = comments.find((comment) => comment.body?.includes(marker));
    if (match) return match.id;

    // A short page is the last page.
    if (comments.length < perPage) return null;
  }
}
|
|
2822
|
+
/**
 * Create the report comment on the PR, or update it when a comment containing
 * `marker` already exists.
 *
 * A non-OK response is reported with console.warn and does not throw.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} body - Markdown body of the comment.
 * @param {string} marker - Text used to find a previously posted comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const existingId = await findExistingComment(ctx, marker);
  const repoUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}`;

  // PATCH the existing comment when there is one, otherwise POST a new one.
  const request = existingId
    ? { method: "PATCH", url: `${repoUrl}/issues/comments/${existingId}`, failure: "Failed to update PR comment" }
    : { method: "POST", url: `${repoUrl}/issues/${ctx.prNumber}/comments`, failure: "Failed to create PR comment" };

  const res = await fetch(request.url, {
    method: request.method,
    headers: {
      Authorization: `Bearer ${ctx.token}`,
      Accept: "application/vnd.github+json",
      "Content-Type": "application/json",
      "X-GitHub-Api-Version": "2022-11-28"
    },
    body: JSON.stringify({ body })
  });
  if (res.ok) return;

  const text = await res.text();
  console.warn(`${request.failure}: ${res.status} ${text}`);
}
|
|
2195
2858
|
// Annotate the CommonJS export names for ESM import in node:
|
|
2196
2859
|
0 && (module.exports = {
|
|
2197
2860
|
anthropic,
|
|
2198
2861
|
azureOpenai,
|
|
2862
|
+
compareResults,
|
|
2863
|
+
computeStats,
|
|
2199
2864
|
consoleReporter,
|
|
2200
2865
|
defineArena,
|
|
2866
|
+
detectGitHubContext,
|
|
2201
2867
|
gemini,
|
|
2202
2868
|
jsonReporter,
|
|
2869
|
+
loadBaseline,
|
|
2870
|
+
markdownReporter,
|
|
2203
2871
|
openai,
|
|
2204
2872
|
openaiCompatible,
|
|
2205
|
-
registerPricing
|
|
2873
|
+
registerPricing,
|
|
2874
|
+
saveBaseline,
|
|
2875
|
+
upsertPrComment
|
|
2206
2876
|
});
|
|
2207
2877
|
//# sourceMappingURL=index.cjs.map
|