agent-duelist 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +150 -58
- package/dist/cli.js +870 -123
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +897 -227
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +67 -3
- package/dist/index.d.ts +67 -3
- package/dist/index.js +887 -224
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/templates/arena.config.ts +5 -5
package/dist/index.js
CHANGED
|
@@ -1435,7 +1435,142 @@ function jaccardSimilarity(a, b) {
|
|
|
1435
1435
|
}
|
|
1436
1436
|
|
|
1437
1437
|
// src/scorers/llm-judge.ts
|
|
1438
|
+
import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
|
|
1439
|
+
|
|
1440
|
+
// src/providers/openai.ts
|
|
1438
1441
|
import OpenAI, { AzureOpenAI } from "openai";
|
|
1442
|
+
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
1443
|
+
// Default per-request client timeout in milliseconds (6e4 = 60 s); used whenever no explicit `timeoutMs` is supplied.
var REQUEST_TIMEOUT_MS = 6e4;
|
|
1444
|
+
/**
 * Creates a provider that talks to the OpenAI chat-completions API.
 *
 * @param {string} model - Model name; sent with every request and used to build the provider id.
 * @param {object} [options] - Optional overrides: `apiKey`, `baseURL`, `timeoutMs`.
 * @returns {object} Provider built by `makeProvider` with id `openai/<model>`.
 */
function openai(model, options) {
  // Explicit options win; the API key falls back to the environment and the
  // timeout to the shared default.
  const clientConfig = {
    apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
    baseURL: options?.baseURL,
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const client = new OpenAI(clientConfig);
  const providerId = `openai/${model}`;
  return makeProvider(providerId, "OpenAI", model, client, model);
}
|
|
1452
|
+
/**
 * Creates a provider for any endpoint that speaks the OpenAI chat-completions protocol.
 *
 * @param {object} options - Provider description. Fields read here: `id`, `name`, `model`,
 *   `baseURL`, `apiKey`, `apiKeyEnv` (name of an environment variable holding the key),
 *   `timeoutMs`, `free` and `stripThinking`.
 * @returns {object} Provider built by `makeProvider`.
 */
function openaiCompatible(options) {
  // Key resolution order: explicit key, then the named environment variable,
  // then the literal placeholder "no-key".
  const keyFromEnv = options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0;
  const apiKey = options.apiKey ?? keyFromEnv ?? "no-key";

  const clientConfig = {
    apiKey,
    baseURL: options.baseURL,
    timeout: options.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const client = new OpenAI(clientConfig);

  // Providers flagged as free are registered with zero per-token prices.
  if (options.free) {
    registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
  }

  const { id, name, model, stripThinking } = options;
  return makeProvider(id, name, model, client, model, stripThinking);
}
|
|
1464
|
+
/**
 * Creates a provider backed by an Azure OpenAI deployment.
 *
 * @param {string} model - Model name; used for the provider id and as the default deployment name.
 * @param {object} [options] - Optional overrides: `deployment`, `apiKey`, `endpoint`,
 *   `apiVersion`, `timeoutMs`. Missing values fall back to the `AZURE_OPENAI_*` environment variables.
 * @returns {object} Provider built by `makeProvider` with id `azure/<model>`; requests are
 *   issued against the deployment name rather than the model name.
 */
function azureOpenai(model, options) {
  const env = process.env;
  const deployment = options?.deployment ?? model;

  const azureConfig = {
    apiKey: options?.apiKey ?? env.AZURE_OPENAI_API_KEY,
    endpoint: options?.endpoint ?? env.AZURE_OPENAI_ENDPOINT,
    apiVersion: options?.apiVersion ?? env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
    deployment,
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const client = new AzureOpenAI(azureConfig);

  const providerId = `azure/${model}`;
  return makeProvider(providerId, "Azure OpenAI", model, client, deployment);
}
|
|
1475
|
+
/**
 * Builds a provider object around a chat-completions style client.
 *
 * @param {string} id - Provider identifier exposed on the returned object.
 * @param {string} name - Human-readable provider name.
 * @param {string} model - Model name exposed on the returned object.
 * @param {object} client - Client exposing `chat.completions.create(params, requestOptions)`.
 * @param {string} requestModel - Value sent as `model` in every request (may differ from
 *   `model`, e.g. a deployment name).
 * @param {boolean} [stripThinking] - When truthy, the first `<think>…</think>` section is
 *   removed from the final message content.
 * @returns {{id: string, name: string, model: string, run: Function}} Provider object.
 */
function makeProvider(id, name, model, client, requestModel, stripThinking) {
  return {
    id,
    name,
    model,
    /**
     * Runs one prompt, executing at most one round of tool calls.
     *
     * @param {object} input - Fields read: `prompt`, optional `schema`, optional `tools`
     *   (each with `name` and optional `handler`), optional abort `signal`.
     * @returns {Promise<object>} `{ output, usage, latencyMs, raw, toolCalls }`; `output` is the
     *   parsed JSON when `schema` is set and the content parses, otherwise the raw string.
     */
    async run(input) {
      const start = Date.now();
      const params = {
        model: requestModel,
        messages: [{ role: "user", content: input.prompt }]
      };
      if (input.schema) {
        // JSON mode plus a system instruction so the reply can be parsed below.
        params.response_format = { type: "json_object" };
        params.messages = [
          { role: "system", content: "Respond with valid JSON matching the requested schema." },
          ...params.messages
        ];
      }
      if (input.tools?.length) {
        params.tools = input.tools.map(toolDefToOpenAI);
        params.tool_choice = "auto";
      }
      const response = await client.chat.completions.create(params, { signal: input.signal });
      let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
      let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
      const choice = response.choices[0];
      const toolCallsRaw = choice?.message?.tool_calls;
      const collectedToolCalls = [];
      let finalResponse = response;
      if (toolCallsRaw?.length && input.tools?.length) {
        // Replay the conversation with the assistant's tool-call message and one
        // "tool" message per call, then ask the model for its final answer.
        const toolMessages = [
          ...params.messages,
          choice.message
        ];
        for (const tc of toolCallsRaw) {
          const toolDef = input.tools.find((t) => t.name === tc.function.name);
          let args;
          try {
            args = JSON.parse(tc.function.arguments);
          } catch {
            // Arguments were not valid JSON: hand the raw string to the handler.
            args = tc.function.arguments;
          }
          let result;
          if (toolDef?.handler) {
            result = await toolDef.handler(args);
          }
          collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
          toolMessages.push({
            role: "tool",
            tool_call_id: tc.id,
            // A missing handler or an undefined result is reported as an empty object.
            content: JSON.stringify(result ?? {})
          });
        }
        const followUpParams = {
          model: requestModel,
          messages: toolMessages
        };
        // Keep JSON mode on the follow-up: its content is what gets parsed
        // against the schema, so it needs the same response format as the first call.
        if (params.response_format) {
          followUpParams.response_format = params.response_format;
        }
        const followUp = await client.chat.completions.create(followUpParams, { signal: input.signal });
        totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
        totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
        finalResponse = followUp;
      }
      const latencyMs = Date.now() - start;
      const finalChoice = finalResponse.choices[0];
      let rawContent = finalChoice?.message?.content ?? "";
      if (stripThinking) {
        rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
      }
      let output = rawContent;
      if (input.schema) {
        try {
          output = JSON.parse(rawContent);
        } catch {
          // Best effort: if the content is not valid JSON, return the raw string.
        }
      }
      return {
        output,
        usage: {
          // A zero total is reported as undefined rather than 0.
          promptTokens: totalPromptTokens || void 0,
          completionTokens: totalCompletionTokens || void 0
        },
        latencyMs,
        raw: finalResponse,
        toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
      };
    }
  };
}
|
|
1562
|
+
/**
 * Converts a tool definition into the "function" tool entry sent in a chat-completions request.
 *
 * @param {object} tool - Tool definition; `name`, `description` and `parameters` are read.
 *   `parameters` is passed to `zodToJsonSchema` (presumably a Zod schema — confirm with callers).
 * @returns {{type: string, function: object}} Tool entry with the JSON-schema form of the parameters.
 */
function toolDefToOpenAI(tool) {
  const { name, description } = tool;
  const parameters = zodToJsonSchema(tool.parameters, { target: "openAi" });
  return {
    type: "function",
    function: { name, description, parameters }
  };
}
|
|
1572
|
+
|
|
1573
|
+
// src/scorers/llm-judge.ts
|
|
1439
1574
|
var JUDGE_PROMPT = `You are a strict scoring judge. Evaluate the actual output against the expected output on three criteria. Score each from 0.0 to 1.0 using the full range (not just 0, 0.5, 1).
|
|
1440
1575
|
|
|
1441
1576
|
Criteria:
|
|
@@ -1451,40 +1586,42 @@ conciseness: <number>
|
|
|
1451
1586
|
Task: {task}
|
|
1452
1587
|
Expected: {expected}
|
|
1453
1588
|
Actual: {actual}`;
|
|
1454
|
-
function resolveJudgeClient(configModel) {
|
|
1455
|
-
const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-
|
|
1589
|
+
/**
 * Picks the client and model used by the LLM judge, based on the model name and on which
 * API keys are present in the environment.
 *
 * Resolution order:
 *   1. model name starts with "gemini" and GOOGLE_API_KEY is set -> Google's OpenAI-compatible endpoint;
 *   2. OPENAI_API_KEY is absent but AZURE_OPENAI_API_KEY is set -> Azure OpenAI, with the model as deployment;
 *   3. OPENAI_API_KEY is set -> OpenAI;
 *   4. otherwise -> undefined (no judge available).
 *
 * @param {string} [configModel] - Judge model; falls back to DUELIST_JUDGE_MODEL, then "gpt-5-mini".
 * @param {number} [timeoutMs=REQUEST_TIMEOUT_MS] - Client timeout in milliseconds.
 * @returns {{client: object, model: string} | undefined} Judge client and model, or undefined.
 */
function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
  const env = process.env;
  const model = configModel ?? env.DUELIST_JUDGE_MODEL ?? "gpt-5-mini";

  const wantsGemini = model.startsWith("gemini");
  if (wantsGemini && env.GOOGLE_API_KEY) {
    const geminiClient = new OpenAI2({
      apiKey: env.GOOGLE_API_KEY,
      baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
      timeout: timeoutMs
    });
    return { client: geminiClient, model };
  }

  const openaiKey = env.OPENAI_API_KEY;
  if (!openaiKey && env.AZURE_OPENAI_API_KEY) {
    const azureClient = new AzureOpenAI2({
      apiKey: env.AZURE_OPENAI_API_KEY,
      endpoint: env.AZURE_OPENAI_ENDPOINT,
      apiVersion: env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
      deployment: model,
      timeout: timeoutMs
    });
    return { client: azureClient, model };
  }

  if (openaiKey) {
    return { client: new OpenAI2({ apiKey: openaiKey, timeout: timeoutMs }), model };
  }
  return void 0;
}
|
|
1480
|
-
function createLlmJudgeScorer(judgeModel) {
|
|
1617
|
+
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1481
1618
|
let cached = void 0;
|
|
1482
1619
|
return async ({ task, result }) => {
|
|
1483
1620
|
if (task.expected === void 0) {
|
|
1484
1621
|
return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
|
|
1485
1622
|
}
|
|
1486
1623
|
if (cached === void 0) {
|
|
1487
|
-
cached = resolveJudgeClient(judgeModel) ?? null;
|
|
1624
|
+
cached = resolveJudgeClient(judgeModel, timeoutMs) ?? null;
|
|
1488
1625
|
}
|
|
1489
1626
|
if (!cached) {
|
|
1490
1627
|
return {
|
|
@@ -1557,10 +1694,10 @@ var staticScorers = {
|
|
|
1557
1694
|
"fuzzy-similarity": fuzzySimilarityScorer,
|
|
1558
1695
|
"tool-usage": toolUsageScorer
|
|
1559
1696
|
};
|
|
1560
|
-
function resolveScorers(names, judgeModel) {
|
|
1697
|
+
function resolveScorers(names, judgeModel, timeoutMs) {
|
|
1561
1698
|
return names.map((name) => {
|
|
1562
1699
|
if (name === "llm-judge-correctness") {
|
|
1563
|
-
return createLlmJudgeScorer(judgeModel);
|
|
1700
|
+
return createLlmJudgeScorer(judgeModel, timeoutMs);
|
|
1564
1701
|
}
|
|
1565
1702
|
const scorer = staticScorers[name];
|
|
1566
1703
|
if (!scorer) {
|
|
@@ -1571,19 +1708,41 @@ function resolveScorers(names, judgeModel) {
|
|
|
1571
1708
|
}
|
|
1572
1709
|
|
|
1573
1710
|
// src/runner.ts
|
|
1711
|
+
// Fallback per-run timeout in milliseconds (6e4 = 60 s), used by runBenchmarks when `options.timeout` is not set.
var DEFAULT_TIMEOUT_MS = 6e4;
|
|
1712
|
+
/**
 * Runs `run` with an AbortSignal and rejects if it has not settled within `ms` milliseconds.
 *
 * On timeout the signal is aborted and the returned promise rejects with
 * `Error("Request timed out after <ms>ms")`. The timer is cleared on every other exit path,
 * including when `run` throws synchronously, so no stray timer keeps the event loop alive
 * or aborts the signal later.
 *
 * @param {(signal: AbortSignal) => unknown} run - Work to execute; usually returns a promise,
 *   but a plain return value is also accepted.
 * @param {number} ms - Timeout in milliseconds.
 * @returns {Promise<unknown>} Settles with the outcome of `run`, or rejects on timeout.
 */
function withTimeout(run, ms) {
  return new Promise((resolve, reject) => {
    const controller = new AbortController();
    const timer = setTimeout(() => {
      controller.abort();
      reject(new Error(`Request timed out after ${ms}ms`));
    }, ms);

    let pending;
    try {
      pending = run(controller.signal);
    } catch (err) {
      // Synchronous failure: without this the timer would stay armed for `ms`.
      clearTimeout(timer);
      reject(err);
      return;
    }

    // Promise.resolve tolerates non-promise return values.
    Promise.resolve(pending).then(
      (value) => {
        clearTimeout(timer);
        resolve(value);
      },
      (err) => {
        clearTimeout(timer);
        reject(err);
      }
    );
  });
}
|
|
1574
1731
|
async function runBenchmarks(options) {
|
|
1575
1732
|
const { providers, tasks, scorers, runs, onResult } = options;
|
|
1733
|
+
const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
|
|
1576
1734
|
const results = [];
|
|
1577
1735
|
for (const task of tasks) {
|
|
1578
1736
|
for (const provider of providers) {
|
|
1579
1737
|
for (let run = 1; run <= runs; run++) {
|
|
1580
1738
|
let result;
|
|
1581
1739
|
try {
|
|
1582
|
-
const taskResult = await provider.run({
|
|
1740
|
+
const taskResult = await withTimeout((signal) => provider.run({
|
|
1583
1741
|
prompt: task.prompt,
|
|
1584
1742
|
schema: task.schema,
|
|
1585
|
-
tools: task.tools
|
|
1586
|
-
|
|
1743
|
+
tools: task.tools,
|
|
1744
|
+
signal
|
|
1745
|
+
}), timeout);
|
|
1587
1746
|
const scores = await Promise.all(
|
|
1588
1747
|
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
1589
1748
|
);
|
|
@@ -1626,20 +1785,162 @@ var green = "\x1B[32m";
|
|
|
1626
1785
|
var red = "\x1B[31m";
|
|
1627
1786
|
var yellow = "\x1B[33m";
|
|
1628
1787
|
var cyan = "\x1B[36m";
|
|
1788
|
+
// ANSI SGR escape sequence selecting the bright green foreground color (code 92).
var brightGreen = "\x1B[92m";
|
|
1789
|
+
// ANSI SGR escape sequence selecting the bright white foreground color (code 97).
var brightWhite = "\x1B[97m";
|
|
1629
1790
|
/**
 * Wraps `s` between `boldCode` and `reset` (both defined elsewhere in this file;
 * presumably the ANSI bold-on and reset-all sequences).
 *
 * @param {*} s - Value to wrap; interpolated into a template string.
 * @returns {string} The wrapped text.
 */
function bold(s) {
  const wrapped = `${boldCode}${s}${reset}`;
  return wrapped;
}
|
|
1632
1793
|
/**
 * Wraps `s` between `dimCode` and `reset` (both defined elsewhere in this file;
 * presumably the ANSI dim/faint and reset-all sequences).
 *
 * @param {*} s - Value to wrap; interpolated into a template string.
 * @returns {string} The wrapped text.
 */
function dim(s) {
  const wrapped = `${dimCode}${s}${reset}`;
  return wrapped;
}
|
|
1635
|
-
function
|
|
1636
|
-
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
|
|
1796
|
+
/**
 * Removes ANSI SGR (color/style) escape sequences of the form `ESC [ <digits and ;> m`.
 *
 * @param {string} s - Text that may contain SGR sequences.
 * @returns {string} `s` with every such sequence removed.
 */
function stripAnsi(s) {
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  return s.replace(sgrSequence, "");
}
|
|
1799
|
+
/**
 * Estimates how many terminal columns `s` occupies once ANSI color codes are removed.
 *
 * Code points at or above U+1F000, and those in U+2600..U+27BF, count as two columns;
 * every other code point counts as one.
 *
 * @param {string} s - Text, possibly containing ANSI SGR sequences.
 * @returns {number} Estimated column count.
 */
function displayWidth(s) {
  let columns = 0;
  // Iterating the string walks code points, so astral characters are counted once.
  for (const glyph of stripAnsi(s)) {
    const cp = glyph.codePointAt(0) ?? 0;
    const isWide = cp >= 0x1f000 || (cp >= 0x2600 && cp <= 0x27bf);
    columns += isWide ? 2 : 1;
  }
  return columns;
}
|
|
1810
|
+
/**
 * Pads `str` with spaces to `targetWidth` display columns.
 *
 * @param {string} str - Cell text (may contain ANSI codes; width is measured by `displayWidth`).
 * @param {number} targetWidth - Desired width in columns; wider text is returned unchanged.
 * @param {string} [align] - "right" pads on the left; any other value pads on the right.
 * @returns {string} The padded text.
 */
function padCell(str, targetWidth, align) {
  const missing = Math.max(0, targetWidth - displayWidth(str));
  const gap = " ".repeat(missing);
  return align === "right" ? gap + str : str + gap;
}
|
|
1816
|
+
/**
 * Builds the two halves of a fixed-width bar for a 0..1 ratio.
 *
 * @param {number} ratio - Fill ratio; clamped to [0, 1]. A NaN (or otherwise non-numeric)
 *   ratio is treated as 0 so the bar keeps its full width.
 * @param {number} [width=8] - Total bar width in characters.
 * @returns {{fill: string, track: string}} `fill` is the filled part, `track` the remainder;
 *   together they are `width` characters long.
 */
function sparkBar(ratio, width = 8) {
  const bounded = Math.max(0, Math.min(1, ratio));
  // Math.min/Math.max propagate NaN, and "x".repeat(NaN) is "", which would
  // collapse both halves to empty strings and break column alignment.
  const clamped = Number.isNaN(bounded) ? 0 : bounded;
  const fillLen = Math.round(clamped * width);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(width - fillLen);
  return { fill, track };
}
|
|
1823
|
+
/**
 * Draws one horizontal border of the results table, dimmed.
 *
 * @param {number[]} widths - Content width of each column (each column gets one space of
 *   padding on both sides).
 * @param {string} position - "top" (with column joints), "bottom" or "merge" (solid line with
 *   no joints); any other value draws the header separator with cross joints.
 * @returns {string} The border line.
 */
function drawTableLine(widths, position) {
  const horizontal = "\u2500";
  // Inner width: every column plus its two padding spaces, plus one separator between columns.
  const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;

  switch (position) {
    case "bottom":
      return dim(`\u2514${horizontal.repeat(totalInner)}\u2518`);
    case "merge":
      return dim(`\u251C${horizontal.repeat(totalInner)}\u2524`);
    default:
      break;
  }

  const segments = widths.map((w) => horizontal.repeat(w + 2));
  return position === "top"
    ? dim(`\u250C${segments.join("\u252C")}\u2510`)
    : dim(`\u251C${segments.join("\u253C")}\u2524`);
}
|
|
1837
|
+
/**
 * Draws one table row: each cell padded to its column width, separated by dimmed vertical bars.
 *
 * @param {string[]} cells - Cell texts, one per column.
 * @param {number[]} widths - Column widths, index-aligned with `cells`.
 * @param {string[]} aligns - Column alignments ("left" / "right"), index-aligned with `cells`.
 * @returns {string} The rendered row.
 */
function drawTableRow(cells, widths, aligns) {
  const bar = dim("\u2502");
  const rendered = [];
  cells.forEach((cell, i) => {
    // One space of padding on each side of every cell.
    rendered.push(` ${padCell(cell, widths[i], aligns[i])} `);
  });
  return bar + rendered.join(bar) + bar;
}
|
|
1843
|
+
/**
 * Draws a row whose single cell spans every column of the table.
 *
 * @param {string} content - Text to show (may contain ANSI codes).
 * @param {number[]} widths - Column widths of the table the row belongs to.
 * @returns {string} The rendered row; content wider than the table is not truncated.
 */
function drawSpanRow(content, widths) {
  const bar = dim("\u2502");
  let totalInner = widths.length - 1;
  for (const w of widths) {
    totalInner += w + 2;
  }
  // One column is taken by the leading space before the content.
  const remaining = totalInner - displayWidth(content) - 1;
  const filler = " ".repeat(Math.max(0, remaining));
  return `${bar} ${content}${filler}${bar}`;
}
|
|
1849
|
+
/**
 * Computes, per table column, each provider's value plus the best and worst value among
 * providers that did not fail entirely.
 *
 * Columns produced, in this order: "latency" (if requested), "cost" and "tokens" (if "cost"
 * is requested), then one column per remaining scorer name. For latency, cost and tokens the
 * lowest value is best; for every other scorer the highest value is best.
 *
 * @param {object[]} providerData - Per-provider aggregates; fields read: `providerId`,
 *   `allErrors`, `latencyMs`, `avgDetails.costUsd`, `avgDetails.totalTokens`, `avgScores[name]`.
 * @param {string[]} scorerNames - Names of the scorers present in the results.
 * @returns {Map<string, {values: Map<string, number|undefined>, best: number|undefined, worst: number|undefined}>}
 *   Column statistics keyed by column name; providers with `allErrors` get an undefined value.
 */
function computeColumnStats(providerData, scorerNames) {
  const stats = /* @__PURE__ */ new Map();
  const usable = providerData.filter((entry) => !entry.allErrors);

  // Summarizes one column: `pick` extracts the metric from a provider entry.
  const summarize = (pick, higherIsBetter) => {
    const values = /* @__PURE__ */ new Map();
    for (const entry of providerData) {
      values.set(entry.providerId, entry.allErrors ? void 0 : pick(entry));
    }
    const defined = usable.map((entry) => pick(entry)).filter((v) => v !== void 0);
    if (defined.length === 0) {
      return { values, best: void 0, worst: void 0 };
    }
    const lowest = Math.min(...defined);
    const highest = Math.max(...defined);
    return higherIsBetter
      ? { values, best: highest, worst: lowest }
      : { values, best: lowest, worst: highest };
  };

  if (scorerNames.includes("latency")) {
    stats.set("latency", summarize((entry) => entry.latencyMs, false));
  }
  if (scorerNames.includes("cost")) {
    stats.set("cost", summarize((entry) => entry.avgDetails.costUsd, false));
    stats.set("tokens", summarize((entry) => entry.avgDetails.totalTokens, false));
  }
  scorerNames
    .filter((name) => name !== "latency" && name !== "cost")
    .forEach((name) => {
      stats.set(name, summarize((entry) => entry.avgScores[name], true));
    });
  return stats;
}
|
|
1899
|
+
/**
 * Colors a cell's text according to how its value ranks within the column.
 *
 * @param {string} text - Already formatted cell text.
 * @param {number|undefined} value - Numeric value behind `text`; undefined renders a dimmed em dash.
 * @param {{best: number|undefined, worst: number|undefined}} colStats - Column best/worst values.
 * @param {number} providerCount - Number of providers compared; fewer than two disables coloring.
 * @returns {string} `text` unchanged when no ranking applies; otherwise bright-green bold for
 *   the best value, red for the worst and yellow for anything in between.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === void 0) return dim("\u2014");
  if (providerCount < 2) return text;

  const { best, worst } = colStats;
  // No ranking when the column has no data or every provider scored the same.
  const unranked = best === void 0 || worst === void 0 || best === worst;
  if (unranked) return text;

  switch (value) {
    case best:
      return `${brightGreen}${boldCode}${text}${reset}`;
    case worst:
      return `${red}${text}${reset}`;
    default:
      return `${yellow}${text}${reset}`;
  }
}
|
|
1642
|
-
function
|
|
1908
|
+
/**
 * Assigns gold/silver/bronze medals to providers by the number of columns in which they hold
 * the best value.
 *
 * Ranking uses competition ranking (ties share a medal and the following rank is skipped);
 * ties in win count are ordered by provider id. Providers that have no defined value in any
 * column (for example because every run failed) never receive a medal.
 *
 * @param {Map<string, {values: Map<string, number|undefined>, best: number|undefined}>} columnStats -
 *   Per-column statistics as produced by `computeColumnStats`.
 * @param {string[]} providerIds - Providers shown in the table.
 * @returns {Map<string, string>} Medal emoji (or "") per provider id. With fewer than two
 *   providers, or when nobody wins any column, every provider maps to "".
 */
function computeMedals(columnStats, providerIds) {
  const medals = /* @__PURE__ */ new Map();
  if (providerIds.length < 2) {
    for (const id of providerIds) medals.set(id, "");
    return medals;
  }
  const wins = /* @__PURE__ */ new Map();
  for (const id of providerIds) wins.set(id, 0);
  // Providers that produced at least one measurable value in some column.
  const scored = /* @__PURE__ */ new Set();
  for (const [, colStats] of columnStats) {
    for (const [providerId, value] of colStats.values) {
      if (value === void 0) continue;
      scored.add(providerId);
      if (colStats.best !== void 0 && value === colStats.best) {
        wins.set(providerId, (wins.get(providerId) ?? 0) + 1);
      }
    }
  }
  const totalWins = [...wins.values()].reduce((a, b) => a + b, 0);
  if (totalWins === 0) {
    for (const id of providerIds) medals.set(id, "");
    return medals;
  }
  const sorted = [...wins.entries()].sort(
    (a, b) => b[1] - a[1] || a[0].localeCompare(b[0])
  );
  const medalList = ["\u{1F947}", "\u{1F948}", "\u{1F949}"];
  let rank = 0;
  for (let i = 0; i < sorted.length; i++) {
    if (i > 0 && sorted[i][1] < sorted[i - 1][1]) {
      rank = i;
    }
    const [providerId] = sorted[i];
    // A provider without any data did not place second or third; it did not compete.
    const eligible = scored.has(providerId) && rank < medalList.length;
    medals.set(providerId, eligible ? medalList[rank] : "");
  }
  return medals;
}
|
|
1942
|
+
function consoleReporter(results, options) {
|
|
1943
|
+
const showSparklines = options?.sparklines ?? true;
|
|
1643
1944
|
if (results.length === 0) {
|
|
1644
1945
|
console.log("\nNo results to display.\n");
|
|
1645
1946
|
return;
|
|
@@ -1649,78 +1950,155 @@ function consoleReporter(results) {
|
|
|
1649
1950
|
const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
|
|
1650
1951
|
const hasCost = scorerNames.includes("cost");
|
|
1651
1952
|
const hasErrors = results.some((r) => r.error);
|
|
1953
|
+
const multi = providers.length >= 2;
|
|
1652
1954
|
const runsPerCell = Math.max(...results.map((r) => r.run));
|
|
1653
|
-
const runLabel = runsPerCell > 1 ? `
|
|
1955
|
+
const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
|
|
1654
1956
|
console.log("");
|
|
1655
|
-
console.log(` ${
|
|
1656
|
-
console.log(` ${dim("\
|
|
1957
|
+
console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
|
|
1958
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
1657
1959
|
console.log("");
|
|
1658
1960
|
for (const task of tasks) {
|
|
1659
1961
|
console.log(` ${bold(`Task: ${task}`)}`);
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
else if (name === "cost") {
|
|
1664
|
-
cols.push({ label: "Cost", width: 12, align: "right" });
|
|
1665
|
-
cols.push({ label: "Tokens", width: 9, align: "right" });
|
|
1666
|
-
} else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
|
|
1667
|
-
else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
|
|
1668
|
-
else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
|
|
1669
|
-
else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
|
|
1670
|
-
else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
|
|
1671
|
-
else cols.push({ label: name, width: 10, align: "right" });
|
|
1672
|
-
}
|
|
1673
|
-
if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
|
|
1674
|
-
const totalWidth = cols.reduce((sum, c) => sum + c.width + 2, 0);
|
|
1675
|
-
console.log(` ${dim(cols.map((c) => pad(c.label, c.width + 2, c.align)).join(""))}`);
|
|
1676
|
-
console.log(` ${dim("\u2500".repeat(totalWidth))}`);
|
|
1677
|
-
for (const provider of providers) {
|
|
1678
|
-
const taskResults = results.filter(
|
|
1679
|
-
(r) => r.taskName === task && r.providerId === provider
|
|
1680
|
-
);
|
|
1962
|
+
console.log("");
|
|
1963
|
+
const providerData = providers.map((providerId) => {
|
|
1964
|
+
const taskResults = results.filter((r) => r.taskName === task && r.providerId === providerId);
|
|
1681
1965
|
const errorResults2 = taskResults.filter((r) => r.error);
|
|
1682
1966
|
const successResults = taskResults.filter((r) => !r.error);
|
|
1683
|
-
if (successResults.length === 0
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1967
|
+
if (successResults.length === 0) {
|
|
1968
|
+
return {
|
|
1969
|
+
providerId,
|
|
1970
|
+
avgScores: {},
|
|
1971
|
+
avgDetails: { costUsd: void 0, totalTokens: void 0 },
|
|
1972
|
+
latencyMs: void 0,
|
|
1973
|
+
allErrors: errorResults2.length > 0,
|
|
1974
|
+
errorCount: errorResults2.length
|
|
1975
|
+
};
|
|
1976
|
+
}
|
|
1977
|
+
return {
|
|
1978
|
+
providerId,
|
|
1979
|
+
avgScores: averageScores(successResults),
|
|
1980
|
+
avgDetails: averageDetails(successResults),
|
|
1981
|
+
latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
|
|
1982
|
+
allErrors: false,
|
|
1983
|
+
errorCount: errorResults2.length
|
|
1984
|
+
};
|
|
1985
|
+
});
|
|
1986
|
+
const columnStats = computeColumnStats(providerData, scorerNames);
|
|
1987
|
+
const medals = computeMedals(columnStats, providers);
|
|
1988
|
+
const maxProviderLen = Math.max(...providers.map((id) => id.length));
|
|
1989
|
+
const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
|
|
1990
|
+
const cols = [
|
|
1991
|
+
{ label: "Provider", width: providerWidth, align: "left" }
|
|
1992
|
+
];
|
|
1993
|
+
for (const name of scorerNames) {
|
|
1994
|
+
if (name === "latency") {
|
|
1995
|
+
cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
|
|
1996
|
+
} else if (name === "cost") {
|
|
1997
|
+
cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
|
|
1998
|
+
cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
|
|
1999
|
+
} else {
|
|
2000
|
+
const label = name === "correctness" ? "Match" : name === "schema-correctness" ? "Schema" : name === "fuzzy-similarity" ? "Fuzzy" : name === "llm-judge-correctness" ? "Judge" : name === "tool-usage" ? "Tool" : name;
|
|
2001
|
+
cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
1694
2002
|
}
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
|
|
1708
|
-
|
|
2003
|
+
}
|
|
2004
|
+
if (hasErrors) {
|
|
2005
|
+
cols.push({ label: "Status", width: 8, align: "left" });
|
|
2006
|
+
}
|
|
2007
|
+
const widths = cols.map((c) => c.width);
|
|
2008
|
+
const aligns = cols.map((c) => c.align);
|
|
2009
|
+
console.log(` ${drawTableLine(widths, "top")}`);
|
|
2010
|
+
const headerCells = cols.map((c) => bold(c.label));
|
|
2011
|
+
console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
|
|
2012
|
+
console.log(` ${drawTableLine(widths, "header")}`);
|
|
2013
|
+
for (const pd of providerData) {
|
|
2014
|
+
const medal = medals.get(pd.providerId) ?? "";
|
|
2015
|
+
const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
|
|
2016
|
+
const cells = [providerCell];
|
|
2017
|
+
if (pd.allErrors) {
|
|
2018
|
+
for (const col of cols.slice(1)) {
|
|
2019
|
+
if (col.label === "Status") {
|
|
2020
|
+
cells.push(`${red}FAIL${reset}`);
|
|
2021
|
+
} else {
|
|
2022
|
+
cells.push(dim("\u2014"));
|
|
2023
|
+
}
|
|
2024
|
+
}
|
|
2025
|
+
} else {
|
|
2026
|
+
for (const col of cols.slice(1)) {
|
|
2027
|
+
if (col.label === "Status") {
|
|
2028
|
+
cells.push(
|
|
2029
|
+
pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
|
|
2030
|
+
);
|
|
2031
|
+
continue;
|
|
2032
|
+
}
|
|
2033
|
+
const statsKey = col.statsKey;
|
|
2034
|
+
const colStats = columnStats.get(statsKey);
|
|
2035
|
+
if (statsKey === "latency") {
|
|
2036
|
+
const ms = pd.latencyMs;
|
|
2037
|
+
if (ms === void 0) {
|
|
2038
|
+
cells.push(dim("\u2014"));
|
|
2039
|
+
} else {
|
|
2040
|
+
const text = `${Math.round(ms)}ms`;
|
|
2041
|
+
cells.push(colStats ? colorByRank(text, ms, colStats, providers.length) : text);
|
|
2042
|
+
}
|
|
2043
|
+
} else if (statsKey === "cost") {
|
|
2044
|
+
const cost = pd.avgDetails.costUsd;
|
|
2045
|
+
if (cost === void 0) {
|
|
2046
|
+
cells.push(dim("\u2014"));
|
|
2047
|
+
} else {
|
|
2048
|
+
const text = formatCost(cost);
|
|
2049
|
+
cells.push(colStats ? colorByRank(text, cost, colStats, providers.length) : text);
|
|
2050
|
+
}
|
|
2051
|
+
} else if (statsKey === "tokens") {
|
|
2052
|
+
const tokens = pd.avgDetails.totalTokens;
|
|
2053
|
+
if (tokens === void 0) {
|
|
2054
|
+
cells.push(dim("\u2014"));
|
|
2055
|
+
} else {
|
|
2056
|
+
const text = `${tokens}`;
|
|
2057
|
+
cells.push(colStats ? colorByRank(text, tokens, colStats, providers.length) : text);
|
|
2058
|
+
}
|
|
2059
|
+
} else {
|
|
2060
|
+
const val = pd.avgScores[statsKey];
|
|
2061
|
+
if (val === void 0) {
|
|
2062
|
+
cells.push(dim("\u2014"));
|
|
2063
|
+
} else {
|
|
2064
|
+
const pctStr = `${Math.round(val * 100)}%`.padStart(4);
|
|
2065
|
+
let coloredPct;
|
|
2066
|
+
if (multi && colStats) {
|
|
2067
|
+
coloredPct = colorByRank(pctStr, val, colStats, providers.length);
|
|
2068
|
+
} else {
|
|
2069
|
+
if (val >= 0.8) coloredPct = `${green}${pctStr}${reset}`;
|
|
2070
|
+
else if (val >= 0.5) coloredPct = `${yellow}${pctStr}${reset}`;
|
|
2071
|
+
else coloredPct = `${red}${pctStr}${reset}`;
|
|
2072
|
+
}
|
|
2073
|
+
if (showSparklines) {
|
|
2074
|
+
const { fill, track } = sparkBar(val);
|
|
2075
|
+
const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
|
|
2076
|
+
cells.push(`${coloredPct} ${barColor}${fill}${reset}${dim(track)}`);
|
|
2077
|
+
} else {
|
|
2078
|
+
cells.push(coloredPct);
|
|
2079
|
+
}
|
|
2080
|
+
}
|
|
2081
|
+
}
|
|
1709
2082
|
}
|
|
1710
2083
|
}
|
|
1711
|
-
|
|
1712
|
-
|
|
1713
|
-
|
|
2084
|
+
console.log(` ${drawTableRow(cells, widths, aligns)}`);
|
|
2085
|
+
}
|
|
2086
|
+
if (multi && providerData.some((p) => !p.allErrors)) {
|
|
2087
|
+
const winnerId = [...medals.entries()].find(([, m]) => m === "\u{1F947}")?.[0];
|
|
2088
|
+
if (winnerId) {
|
|
2089
|
+
console.log(` ${drawTableLine(widths, "merge")}`);
|
|
2090
|
+
const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
|
|
2091
|
+
console.log(` ${drawSpanRow(winnerText, widths)}`);
|
|
1714
2092
|
}
|
|
1715
|
-
console.log(` ${cells.join("")}`);
|
|
1716
2093
|
}
|
|
2094
|
+
console.log(` ${drawTableLine(widths, "bottom")}`);
|
|
1717
2095
|
console.log("");
|
|
1718
2096
|
}
|
|
1719
2097
|
printSummary(results, providers);
|
|
1720
2098
|
const errorResults = results.filter((r) => r.error);
|
|
1721
2099
|
if (errorResults.length > 0) {
|
|
1722
2100
|
console.log(` ${bold("Errors")}`);
|
|
1723
|
-
console.log(` ${dim("\
|
|
2101
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
1724
2102
|
const seen = /* @__PURE__ */ new Set();
|
|
1725
2103
|
for (const r of errorResults) {
|
|
1726
2104
|
const key = `${r.providerId}::${r.error}`;
|
|
@@ -1728,7 +2106,7 @@ function consoleReporter(results) {
|
|
|
1728
2106
|
seen.add(key);
|
|
1729
2107
|
const count = errorResults.filter((e) => e.providerId === r.providerId && e.error === r.error).length;
|
|
1730
2108
|
const suffix = count > 1 ? ` (\xD7${count})` : "";
|
|
1731
|
-
console.log(` ${red}\
|
|
2109
|
+
console.log(` ${red}\u2716${reset} ${r.providerId}: ${r.error}${suffix}`);
|
|
1732
2110
|
const hint = apiKeyHint(r.providerId, r.error ?? "");
|
|
1733
2111
|
if (hint) console.log(` ${dim(hint)}`);
|
|
1734
2112
|
}
|
|
@@ -1742,15 +2120,20 @@ function consoleReporter(results) {
|
|
|
1742
2120
|
function printSummary(results, providers) {
|
|
1743
2121
|
const successResults = results.filter((r) => !r.error);
|
|
1744
2122
|
if (successResults.length === 0) return;
|
|
1745
|
-
console.log(` ${dim("\u2500".repeat(70))}`);
|
|
1746
2123
|
console.log(` ${bold("Summary")}`);
|
|
2124
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
1747
2125
|
console.log("");
|
|
1748
2126
|
const single = providers.length === 1;
|
|
1749
2127
|
const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
|
|
1750
2128
|
const byCorrectness = rankProviders(successResults, providers, correctnessKey);
|
|
1751
2129
|
if (byCorrectness) {
|
|
1752
|
-
const
|
|
1753
|
-
|
|
2130
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2131
|
+
const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
2132
|
+
if (single) {
|
|
2133
|
+
console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
|
|
2134
|
+
} else {
|
|
2135
|
+
console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
|
|
2136
|
+
}
|
|
1754
2137
|
}
|
|
1755
2138
|
const byLatency = providers.map((id) => {
|
|
1756
2139
|
const runs = successResults.filter((r) => r.providerId === id);
|
|
@@ -1758,8 +2141,13 @@ function printSummary(results, providers) {
|
|
|
1758
2141
|
return { id, avg: avg ?? Infinity };
|
|
1759
2142
|
}).sort((a, b) => a.avg - b.avg)[0];
|
|
1760
2143
|
if (byLatency && byLatency.avg !== Infinity) {
|
|
1761
|
-
const
|
|
1762
|
-
|
|
2144
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2145
|
+
const msStr = `${Math.round(byLatency.avg)}ms`;
|
|
2146
|
+
if (single) {
|
|
2147
|
+
console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2148
|
+
} else {
|
|
2149
|
+
console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2150
|
+
}
|
|
1763
2151
|
}
|
|
1764
2152
|
const byCost = providers.map((id) => {
|
|
1765
2153
|
const runs = successResults.filter((r) => r.providerId === id);
|
|
@@ -1771,8 +2159,32 @@ function printSummary(results, providers) {
|
|
|
1771
2159
|
return { id, avg };
|
|
1772
2160
|
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
1773
2161
|
if (byCost?.avg !== void 0) {
|
|
1774
|
-
const
|
|
1775
|
-
|
|
2162
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2163
|
+
const costStr = formatCost(byCost.avg);
|
|
2164
|
+
if (single) {
|
|
2165
|
+
console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2166
|
+
} else {
|
|
2167
|
+
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2168
|
+
}
|
|
2169
|
+
}
|
|
2170
|
+
if (!single) {
|
|
2171
|
+
const wins = /* @__PURE__ */ new Map();
|
|
2172
|
+
for (const id of providers) wins.set(id, 0);
|
|
2173
|
+
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
2174
|
+
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2175
|
+
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2176
|
+
const maxWins = Math.max(...wins.values());
|
|
2177
|
+
if (maxWins > 0) {
|
|
2178
|
+
const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2179
|
+
console.log("");
|
|
2180
|
+
if (topProviders.length === 1) {
|
|
2181
|
+
const [winnerId, winCount] = topProviders[0];
|
|
2182
|
+
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
|
|
2183
|
+
} else {
|
|
2184
|
+
const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
|
|
2185
|
+
console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
|
|
2186
|
+
}
|
|
2187
|
+
}
|
|
1776
2188
|
}
|
|
1777
2189
|
console.log("");
|
|
1778
2190
|
}
|
|
@@ -1834,14 +2246,6 @@ function formatCost(usd) {
|
|
|
1834
2246
|
const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
|
|
1835
2247
|
return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
|
|
1836
2248
|
}
|
|
1837
|
-
/**
 * Pads `str` with spaces until it is `width` characters long.
 *
 * @param {string} str - Text to pad; returned unchanged when already at least `width` long.
 * @param {number} width - Target length.
 * @param {string} [align] - `"right"` pads on the left; any other value pads on the right.
 * @returns {string} The padded text.
 */
function pad(str, width, align) {
  return align === "right" ? str.padStart(width) : str.padEnd(width);
}
|
|
1841
|
-
/**
 * Counts how many characters of `str` belong to ANSI `ESC[...m` escape
 * sequences, i.e. the difference between the raw length and the length
 * once those sequences are removed.
 *
 * @param {string} str - Text that may contain ANSI color codes.
 * @returns {number} Total length of all escape sequences found.
 */
function colorLen(str) {
  const escapeSequences = str.match(/\x1b\[[0-9;]*m/g) ?? [];
  return escapeSequences.reduce((total, sequence) => total + sequence.length, 0);
}
|
|
1845
2249
|
function apiKeyHint(providerId, error) {
|
|
1846
2250
|
const lower = error.toLowerCase();
|
|
1847
2251
|
const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
|
|
@@ -1945,7 +2349,7 @@ function defineArena(config) {
|
|
|
1945
2349
|
throw new Error("At least one task is required");
|
|
1946
2350
|
}
|
|
1947
2351
|
const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
|
|
1948
|
-
const scorerFns = resolveScorers(scorerNames, config.judgeModel);
|
|
2352
|
+
const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
|
|
1949
2353
|
const runs = config.runs ?? 1;
|
|
1950
2354
|
return {
|
|
1951
2355
|
config,
|
|
@@ -1955,141 +2359,13 @@ function defineArena(config) {
|
|
|
1955
2359
|
tasks: config.tasks,
|
|
1956
2360
|
scorers: scorerFns,
|
|
1957
2361
|
runs,
|
|
2362
|
+
timeout: config.timeout,
|
|
1958
2363
|
onResult: options?.onResult
|
|
1959
2364
|
});
|
|
1960
2365
|
}
|
|
1961
2366
|
};
|
|
1962
2367
|
}
|
|
1963
2368
|
|
|
1964
|
-
// src/providers/openai.ts
|
|
1965
|
-
import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
|
|
1966
|
-
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
1967
|
-
/**
 * Creates a provider for an OpenAI-hosted model.
 *
 * @param {string} model - Model name; also used as the request model.
 * @param {{ apiKey?: string, baseURL?: string }} [options] - Client overrides.
 *   When `apiKey` is absent the `OPENAI_API_KEY` environment variable is used.
 * @returns {object} The provider built by `makeProvider`, with id `openai/<model>`.
 */
function openai(model, options) {
  const apiKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
  const baseURL = options?.baseURL;
  const client = new OpenAI2({ apiKey, baseURL });
  return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
}
|
|
1974
|
-
/**
 * Creates a provider for any endpoint that speaks the OpenAI chat API.
 *
 * The API key is resolved in this order: `options.apiKey`, then the
 * environment variable named by `options.apiKeyEnv`, then the literal
 * placeholder `"no-key"`.
 *
 * @param {object} options
 * @param {string} options.id - Provider id (also the pricing key when `free` is set).
 * @param {string} options.name - Display name.
 * @param {string} options.model - Model name sent with each request.
 * @param {string} [options.baseURL] - Endpoint base URL passed to the client.
 * @param {string} [options.apiKey] - Explicit API key.
 * @param {string} [options.apiKeyEnv] - Name of an environment variable holding the key.
 * @param {boolean} [options.free] - When truthy, registers zero per-token pricing for `id`.
 * @param {boolean} [options.stripThinking] - Forwarded to `makeProvider`.
 * @returns {object} The provider built by `makeProvider`.
 */
function openaiCompatible(options) {
  const envKey = options.apiKeyEnv ? process.env[options.apiKeyEnv] : undefined;
  const apiKey = options.apiKey ?? envKey ?? "no-key";
  const client = new OpenAI2({ apiKey, baseURL: options.baseURL });
  if (options.free) {
    const zeroPricing = { inputPerToken: 0, outputPerToken: 0 };
    registerPricing(options.id, zeroPricing);
  }
  return makeProvider(
    options.id,
    options.name,
    options.model,
    client,
    options.model,
    options.stripThinking
  );
}
|
|
1985
|
-
/**
 * Creates a provider for an Azure OpenAI deployment.
 *
 * Each client setting falls back to an environment variable when the option
 * is absent: `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT` and
 * `AZURE_OPENAI_API_VERSION` (which in turn defaults to `"2024-12-01-preview"`).
 *
 * @param {string} model - Model name; used in the provider id `azure/<model>`.
 * @param {{ apiKey?: string, endpoint?: string, apiVersion?: string, deployment?: string }} [options]
 *   `deployment` defaults to `model` and is what requests are sent against.
 * @returns {object} The provider built by `makeProvider`.
 */
function azureOpenai(model, options) {
  const env = process.env;
  const deployment = options?.deployment ?? model;
  const clientConfig = {
    apiKey: options?.apiKey ?? env.AZURE_OPENAI_API_KEY,
    endpoint: options?.endpoint ?? env.AZURE_OPENAI_ENDPOINT,
    apiVersion: options?.apiVersion ?? env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
    deployment
  };
  const client = new AzureOpenAI2(clientConfig);
  return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
}
|
|
1995
|
-
/**
 * Wraps a chat-completions client in the provider shape `{ id, name, model, run }`.
 *
 * `run(input)` sends `input.prompt` as a user message. When `input.schema` is
 * set it requests JSON output and tries to parse the reply. When `input.tools`
 * is non-empty it advertises them, executes every tool call the model asks
 * for (via each tool's `handler`, if any), and makes one follow-up request
 * with the tool results.
 *
 * @param {string} id - Provider id.
 * @param {string} name - Display name.
 * @param {string} model - Model name exposed on the provider.
 * @param {object} client - Object exposing `chat.completions.create(params)`.
 * @param {string} requestModel - Value sent as `model` in each request.
 * @param {boolean} [stripThinking] - When truthy, removes the first
 *   `<think>...</think>` section from the reply text.
 * @returns {{ id: string, name: string, model: string, run: Function }}
 */
function makeProvider(id, name, model, client, requestModel, stripThinking) {
  return {
    id,
    name,
    model,
    async run(input) {
      const startedAt = Date.now();

      // Build the initial request.
      const userMessage = { role: "user", content: input.prompt };
      const params = { model: requestModel, messages: [userMessage] };
      if (input.schema) {
        params.response_format = { type: "json_object" };
        params.messages = [
          { role: "system", content: "Respond with valid JSON matching the requested schema." },
          userMessage
        ];
      }
      if (input.tools?.length) {
        params.tools = input.tools.map(toolDefToOpenAI);
        params.tool_choice = "auto";
      }

      const firstResponse = await client.chat.completions.create(params);
      let promptTokens = firstResponse.usage?.prompt_tokens ?? 0;
      let completionTokens = firstResponse.usage?.completion_tokens ?? 0;

      const firstChoice = firstResponse.choices[0];
      const requestedCalls = firstChoice?.message?.tool_calls;
      const executedCalls = [];
      let lastResponse = firstResponse;

      if (requestedCalls?.length && input.tools?.length) {
        const conversation = [...params.messages, firstChoice.message];
        for (const call of requestedCalls) {
          const toolDef = input.tools.find((t) => t.name === call.function.name);
          // Arguments that are not valid JSON are passed through as the raw string.
          let args;
          try {
            args = JSON.parse(call.function.arguments);
          } catch {
            args = call.function.arguments;
          }
          // Tools without a handler yield an undefined result.
          const result = toolDef?.handler ? await toolDef.handler(args) : undefined;
          executedCalls.push({ name: call.function.name, arguments: args, result });
          conversation.push({
            role: "tool",
            tool_call_id: call.id,
            content: JSON.stringify(result ?? {})
          });
        }
        const followUp = await client.chat.completions.create({
          model: requestModel,
          messages: conversation
        });
        promptTokens += followUp.usage?.prompt_tokens ?? 0;
        completionTokens += followUp.usage?.completion_tokens ?? 0;
        lastResponse = followUp;
      }

      // Latency covers the initial request, tool handlers, and the follow-up.
      const latencyMs = Date.now() - startedAt;

      const replyText = lastResponse.choices[0]?.message?.content ?? "";
      const content = stripThinking
        ? replyText.replace(/<think>[\s\S]*?<\/think>\s*/, "")
        : replyText;

      let output = content;
      if (input.schema) {
        try {
          output = JSON.parse(content);
        } catch {
          // Not valid JSON: fall back to the text as-is.
        }
      }

      return {
        output,
        usage: {
          // A zero total is reported as undefined.
          promptTokens: promptTokens || undefined,
          completionTokens: completionTokens || undefined
        },
        latencyMs,
        raw: lastResponse,
        toolCalls: executedCalls.length > 0 ? executedCalls : undefined
      };
    }
  };
}
|
|
2082
|
-
/**
 * Converts a tool definition into the `{ type: "function", function: {...} }`
 * entry used in a chat-completions `tools` array.
 *
 * @param {{ name: string, description: string, parameters: object }} tool -
 *   `parameters` is handed to `zodToJsonSchema` with the `"openAi"` target.
 * @returns {{ type: "function", function: { name: string, description: string, parameters: object } }}
 */
function toolDefToOpenAI(tool) {
  const { name, description } = tool;
  const parameters = zodToJsonSchema(tool.parameters, { target: "openAi" });
  return {
    type: "function",
    function: { name, description, parameters }
  };
}
|
|
2092
|
-
|
|
2093
2369
|
// src/providers/anthropic.ts
|
|
2094
2370
|
import Anthropic from "@anthropic-ai/sdk";
|
|
2095
2371
|
function anthropic(model, options) {
|
|
@@ -2109,7 +2385,7 @@ function anthropic(model, options) {
|
|
|
2109
2385
|
max_tokens: maxTokens,
|
|
2110
2386
|
system: systemMessage,
|
|
2111
2387
|
messages: [{ role: "user", content: input.prompt }]
|
|
2112
|
-
});
|
|
2388
|
+
}, { signal: input.signal });
|
|
2113
2389
|
const latencyMs = Date.now() - start;
|
|
2114
2390
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2115
2391
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
@@ -2144,19 +2420,406 @@ function gemini(model, options) {
|
|
|
2144
2420
|
}
|
|
2145
2421
|
const client = new OpenAI3({
|
|
2146
2422
|
apiKey,
|
|
2147
|
-
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
|
|
2423
|
+
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
|
|
2424
|
+
timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
|
|
2148
2425
|
});
|
|
2149
2426
|
return makeProvider(`google/${model}`, "Google AI", model, client, model);
|
|
2150
2427
|
}
|
|
2428
|
+
|
|
2429
|
+
// src/reporter/markdown.ts
|
|
2430
|
+
// Hidden HTML comment placed on the first line of every generated report.
var COMMENT_MARKER = "<!-- duelist-ci-report -->";

/**
 * Renders a CI comparison report as a Markdown document.
 *
 * Sections appear in a fixed order and each is omitted when it has no
 * content: comparison table, cost summary, flaky results, failure reasons.
 * The marker line, status heading and footer are always present.
 *
 * @param {object} report - Report with `failed`, `comparisons`, `cost`,
 *   `flakyResults` and `failureReasons`.
 * @param {*} _current - Unused.
 * @returns {string} The Markdown text, lines joined with `\n`.
 */
function markdownReporter(report, _current) {
  const statusLabel = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${statusLabel}`, ""];

  if (report.comparisons.length > 0) {
    out.push(markdownComparisonTable(report.comparisons), "");
  }

  // The cost section is shown when money was spent or a budget was configured.
  if (report.cost.totalUsd > 0 || report.cost.budget !== void 0) {
    out.push(markdownCostSummary(report.cost), "");
  }

  if (report.flakyResults.length > 0) {
    const bullets = report.flakyResults.map(
      (f) => `- **${f.providerId}** \xD7 ${f.taskName} \u2192 ${f.scorerName} (CV = ${f.current.cv.toFixed(2)})`
    );
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      "",
      ...bullets,
      ""
    );
  }

  if (report.failureReasons.length > 0) {
    out.push(
      "### Failure Reasons",
      "",
      ...report.failureReasons.map((reason) => `- ${reason}`),
      ""
    );
  }

  out.push(
    "---",
    "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*"
  );
  return out.join("\n");
}
|
|
2466
|
+
/**
 * Makes free-form text safe to place inside a single Markdown table cell.
 *
 * A literal `|` would otherwise be read as a column separator and a line
 * break would end the row, so pipes are backslash-escaped and line breaks
 * are replaced with a space.
 *
 * @param {*} value - Cell content; converted with `String()`.
 * @returns {string} The escaped text.
 */
function escapeTableCell(value) {
  return String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}

/**
 * Renders the per provider/task/scorer comparison rows as a Markdown table.
 *
 * Provider, task and scorer names are escaped with `escapeTableCell` so a
 * name containing `|` or a newline cannot break the table layout. Missing
 * baselines and deltas are shown as an em dash.
 *
 * @param {Array<object>} comparisons - Entries with `providerId`, `taskName`,
 *   `scorerName`, `baseline` (stats or null), `current` (stats) and
 *   `delta` (number or null).
 * @returns {string} Header, separator and one row per comparison, joined with `\n`.
 */
function markdownComparisonTable(comparisons) {
  const lines = [
    "| Provider | Task | Scorer | Baseline | Current | Delta | Status |",
    "|----------|------|--------|----------|---------|-------|--------|"
  ];
  for (const c of comparisons) {
    const cells = [
      escapeTableCell(c.providerId),
      escapeTableCell(c.taskName),
      escapeTableCell(c.scorerName),
      c.baseline ? formatStats(c.baseline) : "\u2014",
      formatStats(c.current),
      c.delta !== null ? formatDelta(c.delta) : "\u2014",
      statusIndicator(c)
    ];
    lines.push(`| ${cells.join(" | ")} |`);
  }
  return lines.join("\n");
}
|
|
2479
|
+
/**
 * Renders the cost section of the Markdown report.
 *
 * Always prints the total. Adds a budget line when `cost.budget` is defined,
 * and a per-provider table when more than one provider incurred cost.
 *
 * @param {{ totalUsd: number, budget?: number, overBudget: boolean, perProvider: Map<string, number> }} cost
 * @returns {string} The section text, lines joined with `\n`.
 */
function markdownCostSummary(cost) {
  const out = ["### \u{1F4B0} Cost Summary", "", `**Total:** $${cost.totalUsd.toFixed(4)}`];

  if (cost.budget !== void 0) {
    // A zero (or negative) budget cannot be expressed as a percentage.
    const pctUsed = cost.budget > 0 ? (cost.totalUsd / cost.budget * 100).toFixed(0) : "\u221E";
    const light = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    out.push(`**Budget:** $${cost.budget.toFixed(2)} (${pctUsed}% used) ${light}`);
  }

  if (cost.perProvider.size > 1) {
    out.push("", "| Provider | Cost |", "|----------|------|");
    for (const [providerId, spentUsd] of cost.perProvider) {
      out.push(`| ${providerId} | $${spentUsd.toFixed(4)} |`);
    }
  }

  return out.join("\n");
}
|
|
2499
|
+
/**
 * Formats scorer statistics for a table cell.
 *
 * @param {{ mean: number, n: number, ci95Lower: number, ci95Upper: number }} stats
 * @returns {string} `mean ± margin` (three decimals each) when there is more
 *   than one sample, where the margin is half the width of the 95% interval;
 *   otherwise just the mean.
 */
function formatStats(stats) {
  const meanText = stats.mean.toFixed(3);
  if (!(stats.n > 1)) {
    return meanText;
  }
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${meanText} \xB1 ${halfWidth.toFixed(3)}`;
}
|
|
2506
|
+
/**
 * Formats a score difference with three decimals and an explicit `+` for
 * non-negative values.
 *
 * @param {number} delta - Difference between current and baseline.
 * @returns {string} For example `+0.123` or `-0.500`.
 */
function formatDelta(delta) {
  const magnitude = delta.toFixed(3);
  return delta >= 0 ? `+${magnitude}` : magnitude;
}
|
|
2510
|
+
/**
 * Picks the status label for one comparison row.
 *
 * The first matching rule wins: regressed, improved, new (no baseline),
 * otherwise unchanged.
 *
 * @param {{ regressed: boolean, improved: boolean, baseline: object|null }} c
 * @returns {string} Emoji-prefixed status text.
 */
function statusIndicator(c) {
  const rules = [
    [c.regressed, "\u{1F534} regressed"],
    [c.improved, "\u{1F7E2} improved"],
    [c.baseline === null, "\u{1F195} new"]
  ];
  const firstMatch = rules.find(([applies]) => applies);
  return firstMatch ? firstMatch[1] : "\u26AA unchanged";
}
|
|
2516
|
+
|
|
2517
|
+
// src/ci.ts
|
|
2518
|
+
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
2519
|
+
import { dirname } from "path";
|
|
2520
|
+
// Scorer names for which a smaller value is the better outcome.
var LOWER_IS_BETTER = /* @__PURE__ */ new Set(["cost"]);
// Coefficient of variation above which a multi-run result is reported as flaky.
var FLAKY_CV_THRESHOLD = 0.3;
// Critical t values used for 95% confidence intervals, keyed by degrees of freedom.
var T_CRITICAL_95 = {
  1: 12.706,
  2: 4.303,
  3: 3.182,
  4: 2.776,
  5: 2.571,
  6: 2.447,
  7: 2.365,
  8: 2.306,
  9: 2.262,
  10: 2.228,
  15: 2.131,
  20: 2.086,
  25: 2.06,
  30: 2.042
};

/**
 * Looks up the critical t value for `df` degrees of freedom.
 *
 * Exact table entries are returned as-is. Values between two table entries
 * are linearly interpolated. Anything else (non-positive, below the first
 * entry, or beyond the last entry) falls back to 1.96.
 *
 * @param {number} df - Degrees of freedom.
 * @returns {number} The critical value.
 */
function tCritical(df) {
  const FALLBACK = 1.96;
  if (df <= 0) return FALLBACK;

  const exact = T_CRITICAL_95[df];
  if (exact !== void 0) return exact;

  const knots = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
  // Index of the first table entry above df; -1 when df is past the table,
  // 0 when df sits below the first entry. Neither case can be interpolated.
  const upperIndex = knots.findIndex((knot) => knot > df);
  if (upperIndex <= 0) return FALLBACK;

  const low = knots[upperIndex - 1];
  const high = knots[upperIndex];
  const ratio = (df - low) / (high - low);
  return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
}
|
|
2552
|
+
/**
 * Summarises a list of score samples.
 *
 * Uses the sample variance (divides by `n - 1`) and a t-based 95% interval
 * `mean ± t * stddev / sqrt(n)`. With no samples every field is 0; with one
 * sample the spread is 0 and the interval collapses to the mean. The
 * coefficient of variation is reported as 0 when the mean is 0.
 *
 * @param {number[]} samples - Score values.
 * @returns {{ mean: number, stddev: number, cv: number, n: number, ci95Lower: number, ci95Upper: number }}
 */
function computeScorerStats(samples) {
  const n = samples.length;
  if (n === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  let total = 0;
  for (const value of samples) total += value;
  const mean = total / n;
  if (n === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  let squaredDeviations = 0;
  for (const value of samples) squaredDeviations += (value - mean) ** 2;
  const stddev = Math.sqrt(squaredDeviations / (n - 1));
  const cv = mean !== 0 ? stddev / Math.abs(mean) : 0;

  const standardError = stddev / Math.sqrt(n);
  const margin = tCritical(n - 1) * standardError;
  return { mean, stddev, cv, n, ci95Lower: mean - margin, ci95Upper: mean + margin };
}
|
|
2575
|
+
/**
 * Builds the map key that identifies one provider/task/scorer combination.
 *
 * @param {string} providerId
 * @param {string} taskName
 * @param {string} scorerName
 * @returns {string} The three parts joined with `::`.
 */
function groupKey(providerId, taskName, scorerName) {
  const separator = "::";
  return "".concat(providerId, separator, taskName, separator, scorerName);
}
|
|
2578
|
+
/**
 * Aggregates raw benchmark results into statistics per
 * provider/task/scorer combination.
 *
 * Results carrying an `error` are ignored, as are individual scores with a
 * negative value.
 *
 * @param {Iterable<object>} results - Entries with `providerId`, `taskName`,
 *   `scores` (`{ name, value }[]`) and optionally `error`.
 * @returns {Map<string, object>} Stats from `computeScorerStats`, keyed by `groupKey`.
 */
function computeStats(results) {
  const samplesByKey = /* @__PURE__ */ new Map();
  for (const result of results) {
    if (result.error) continue;
    for (const { name, value } of result.scores) {
      if (value < 0) continue;
      const key = groupKey(result.providerId, result.taskName, name);
      const bucket = samplesByKey.get(key);
      if (bucket) {
        bucket.push(value);
      } else {
        samplesByKey.set(key, [value]);
      }
    }
  }

  const statsByKey = /* @__PURE__ */ new Map();
  samplesByKey.forEach((samples, key) => {
    statsByKey.set(key, computeScorerStats(samples));
  });
  return statsByKey;
}
|
|
2595
|
+
/**
 * Totals the estimated dollar cost of a set of results.
 *
 * Only successful results with a non-negative `cost` score contribute, and
 * only when that score's `details.estimatedUsd` is greater than zero.
 *
 * @param {Iterable<object>} results - Entries with `providerId`, `scores` and optionally `error`.
 * @param {number} [budget] - Spending limit in USD.
 * @returns {{ totalUsd: number, perProvider: Map<string, number>, budget: number|undefined, overBudget: boolean }}
 *   `overBudget` is true only when a budget is given and the total exceeds it.
 */
function computeCostSummary(results, budget) {
  const perProvider = /* @__PURE__ */ new Map();
  let totalUsd = 0;

  for (const result of results) {
    if (result.error) continue;

    const costScore = result.scores.find((score) => score.name === "cost");
    if (!costScore || costScore.value < 0) continue;

    const usd = costScore.details?.estimatedUsd ?? 0;
    if (usd <= 0) continue;

    totalUsd += usd;
    const spentSoFar = perProvider.get(result.providerId) ?? 0;
    perProvider.set(result.providerId, spentSoFar + usd);
  }

  const overBudget = budget !== void 0 && totalUsd > budget;
  return { totalUsd, perProvider, budget, overBudget };
}
|
|
2615
|
+
/**
 * Splits a `provider::task::scorer` key back into its three parts.
 *
 * The task name is free-form and may itself contain `::`, so the provider is
 * taken up to the first separator, the scorer after the last one, and the
 * task is everything in between.
 * NOTE(review): assumes provider ids and scorer names never contain `::` - confirm.
 *
 * @param {string} key - Key in the `provider::task::scorer` format.
 * @returns {string[]} `[providerId, taskName, scorerName]`; keys with fewer
 *   than two separators are returned as a plain `split("::")`.
 */
function parseGroupKey(key) {
  const separator = "::";
  const first = key.indexOf(separator);
  const last = key.lastIndexOf(separator);
  if (first === -1 || last < first + separator.length) {
    return key.split(separator);
  }
  return [
    key.slice(0, first),
    key.slice(first + separator.length, last),
    key.slice(last + separator.length)
  ];
}

/**
 * Compares current statistics against a baseline and builds the CI report.
 *
 * For every current provider/task/scorer entry it records the baseline (if
 * any), the change in mean, and whether the entry regressed, improved or is
 * flaky. Regression and improvement are only evaluated when a baseline
 * exists and a threshold is configured for that scorer. The report fails
 * when anything regressed or the total cost exceeds the budget.
 *
 * @param {Map<string, object>|null|undefined} baselineStats - Baseline stats by group key.
 * @param {Map<string, object>} currentStats - Current stats by group key.
 * @param {Map<string, number>} thresholds - Allowed change per scorer name.
 * @param {number} [budget] - Spending limit in USD.
 * @param {Array<object>} [currentResults] - Raw results used for the cost summary.
 * @returns {{ comparisons: object[], cost: object, failed: boolean, flakyResults: object[], failureReasons: string[] }}
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  const comparisons = [];
  const failureReasons = [];
  for (const [key, current] of currentStats) {
    const [providerId, taskName, scorerName] = parseGroupKey(key);
    const baseline = baselineStats?.get(key) ?? null;
    let delta = null;
    let regressed = false;
    let improved = false;
    if (baseline) {
      delta = current.mean - baseline.mean;
      const threshold = thresholds.get(scorerName);
      if (threshold !== void 0) {
        const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
        regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
        improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
      }
    }
    // Variance is only meaningful with more than one run.
    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
    comparisons.push({
      providerId,
      taskName,
      scorerName,
      baseline,
      current,
      delta,
      regressed,
      improved,
      flaky
    });
  }
  const cost = computeCostSummary(currentResults ?? [], budget);
  for (const r of comparisons) {
    if (!r.regressed) continue;
    failureReasons.push(
      `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta2(r.delta)}`
    );
  }
  if (cost.overBudget) {
    failureReasons.push(
      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
    );
  }
  const flakyResults = comparisons.filter((c) => c.flaky);
  const failed = failureReasons.length > 0;
  return { comparisons, cost, failed, flakyResults, failureReasons };
}
|
|
2664
|
+
/**
 * Decides whether `current` is worse than `baseline` by more than `threshold`.
 *
 * With exactly one sample on each side the means are compared directly.
 * Otherwise the 95% interval bounds are used:
 *  - lower-is-better: the current lower bound must exceed the baseline upper
 *    bound by more than the threshold;
 *  - higher-is-better: the gap from the baseline upper bound down to the
 *    current lower bound must exceed the threshold and the current mean must
 *    be below the baseline mean.
 * NOTE(review): the higher-is-better branch compares the outer bounds while
 * every other interval check here and in detectImprovement compares the
 * inner bounds, so wide intervals make a regression more likely to be
 * flagged - confirm this asymmetry is intended.
 *
 * @param {object} baseline - Stats with `mean`, `n`, `ci95Lower`, `ci95Upper`.
 * @param {object} current - Stats with the same fields.
 * @param {number} threshold - Allowed change.
 * @param {boolean} lowerIsBetter - True for scorers where smaller is better.
 * @returns {boolean} True when a regression is detected.
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  const bothSingleSample = baseline.n === 1 && current.n === 1;
  if (bothSingleSample) {
    const delta = current.mean - baseline.mean;
    return lowerIsBetter ? delta > threshold : delta < -threshold;
  }
  if (lowerIsBetter) {
    const separation = current.ci95Lower - baseline.ci95Upper;
    return separation > threshold;
  }
  const outerGap = baseline.ci95Upper - current.ci95Lower;
  return outerGap > threshold && current.mean < baseline.mean;
}
|
|
2675
|
+
/**
 * Decides whether `current` is better than `baseline` by more than `threshold`.
 *
 * With exactly one sample on each side the means are compared directly.
 * Otherwise the 95% intervals must be separated, in the favourable
 * direction, by more than the threshold.
 *
 * @param {object} baseline - Stats with `mean`, `n`, `ci95Lower`, `ci95Upper`.
 * @param {object} current - Stats with the same fields.
 * @param {number} threshold - Required change.
 * @param {boolean} lowerIsBetter - True for scorers where smaller is better.
 * @returns {boolean} True when an improvement is detected.
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  const bothSingleSample = baseline.n === 1 && current.n === 1;
  if (bothSingleSample) {
    const delta = current.mean - baseline.mean;
    return lowerIsBetter ? delta < -threshold : delta > threshold;
  }
  // Gap between the intervals, measured from the better side's inner bound.
  const separation = lowerIsBetter
    ? baseline.ci95Lower - current.ci95Upper
    : current.ci95Lower - baseline.ci95Upper;
  return separation > threshold;
}
|
|
2686
|
+
/**
 * Formats a score difference with four decimals and an explicit `+` for
 * non-negative values.
 *
 * @param {number} delta - Difference between current and baseline.
 * @returns {string} For example `+0.1235` or `-0.5000`.
 */
function formatDelta2(delta) {
  const magnitude = delta.toFixed(4);
  return delta >= 0 ? `+${magnitude}` : magnitude;
}
|
|
2690
|
+
/**
 * Reads a baseline file from disk.
 *
 * Accepts either an object with a `results` array or a bare array of
 * results. Any failure (missing file, invalid JSON, unexpected shape) yields
 * `null` rather than an exception.
 *
 * @param {string} path - Location of the baseline JSON file.
 * @returns {{ timestamp: string, results: Array<object> } | null} The
 *   baseline, with `timestamp` set to `"unknown"` when the file has none.
 */
function loadBaseline(path) {
  try {
    const parsed = JSON.parse(readFileSync(path, "utf-8"));
    const results = parsed.results ?? parsed;
    return Array.isArray(results)
      ? { timestamp: parsed.timestamp ?? "unknown", results }
      : null;
  } catch {
    // Best effort: an unreadable baseline is treated as "no baseline".
    return null;
  }
}
|
|
2704
|
+
/**
 * Writes results to a baseline file, creating parent directories as needed.
 *
 * The file holds pretty-printed JSON of the form `{ timestamp, results }`,
 * where `timestamp` is the ISO-8601 time of the save.
 *
 * @param {string} path - Destination file.
 * @param {Array<object>} results - Results to store.
 * @returns {void}
 */
function saveBaseline(path, results) {
  const parentDir = dirname(path);
  mkdirSync(parentDir, { recursive: true });
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const serialized = JSON.stringify({ timestamp, results }, null, 2);
  writeFileSync(path, serialized);
}
|
|
2712
|
+
|
|
2713
|
+
// src/github.ts
|
|
2714
|
+
import { readFileSync as readFileSync2 } from "fs";
|
|
2715
|
+
/**
 * Extracts a pull-request number from a GitHub event payload file.
 *
 * Uses `pull_request.number` when the payload has a `pull_request` object;
 * otherwise uses `issue.number` when the issue is marked as a pull request.
 *
 * @param {string} eventPath - Path to the event JSON file.
 * @returns {*} The number found, or `undefined` when the file cannot be read
 *   or parsed.
 */
function readPrNumberFromEvent(eventPath) {
  try {
    const event = JSON.parse(readFileSync2(eventPath, "utf-8"));
    let found;
    const pullRequest = event.pull_request;
    if (pullRequest && typeof pullRequest === "object") {
      found = pullRequest.number;
    }
    if (!found) {
      const issue = event.issue;
      if (issue && typeof issue === "object" && issue.pull_request) {
        found = issue.number;
      }
    }
    return found;
  } catch {
    // Unreadable event payload: fall through to the other sources.
    return void 0;
  }
}

/**
 * Collects what is needed to comment on the current pull request.
 *
 * Reads `GITHUB_TOKEN` and `GITHUB_REPOSITORY` (`owner/repo`) from the
 * environment. The PR number comes from the event payload at
 * `GITHUB_EVENT_PATH` when available, otherwise from `DUELIST_PR_NUMBER`.
 *
 * @returns {{ token: string, owner: string, repo: string, prNumber: number } | null}
 *   `null` when the token, repository or PR number cannot be determined.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath
  } = process.env;
  if (!token || !repository) return null;

  const [owner, repo] = repository.split("/");
  if (!owner || !repo) return null;

  let prNumber = eventPath ? readPrNumberFromEvent(eventPath) : void 0;
  if (!prNumber && process.env.DUELIST_PR_NUMBER) {
    prNumber = parseInt(process.env.DUELIST_PR_NUMBER, 10);
  }
  // Also rejects NaN from an unparsable DUELIST_PR_NUMBER.
  if (!prNumber) return null;

  return { token, owner, repo, prNumber };
}
|
|
2745
|
+
// Base URL for GitHub REST API requests.
var API_BASE = "https://api.github.com";

/**
 * Searches a pull request's comments for one whose body contains `marker`.
 *
 * Comments are fetched 50 per page until a match is found, an empty page is
 * returned, or a page comes back shorter than the page size.
 *
 * @param {{ token: string, owner: string, repo: string, prNumber: number }} ctx
 * @param {string} marker - Text to look for in each comment body.
 * @returns {Promise<*>} The `id` of the first matching comment, or `null`
 *   when none matches or any request returns a non-OK status.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  const headers = {
    Authorization: `Bearer ${ctx.token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };

  for (let page = 1; ; page++) {
    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
    const res = await fetch(url, { headers });
    if (!res.ok) return null;

    const comments = await res.json();
    if (comments.length === 0) return null;

    const match = comments.find((comment) => comment.body?.includes(marker));
    if (match !== void 0) return match.id;

    // A short page is the last page.
    if (comments.length < perPage) return null;
  }
}
|
|
2771
|
+
/**
 * Creates the report comment on a pull request, or updates it in place when
 * a comment containing `marker` already exists.
 *
 * A non-OK response is reported with `console.warn` and does not throw.
 *
 * @param {{ token: string, owner: string, repo: string, prNumber: number }} ctx
 * @param {string} body - Comment text to publish.
 * @param {string} marker - Text identifying a previously published comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const existingId = await findExistingComment(ctx, marker);
  const updating = Boolean(existingId);

  const issuesUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues`;
  const url = updating
    ? `${issuesUrl}/comments/${existingId}`
    : `${issuesUrl}/${ctx.prNumber}/comments`;

  const res = await fetch(url, {
    method: updating ? "PATCH" : "POST",
    headers: {
      Authorization: `Bearer ${ctx.token}`,
      Accept: "application/vnd.github+json",
      "Content-Type": "application/json",
      "X-GitHub-Api-Version": "2022-11-28"
    },
    body: JSON.stringify({ body })
  });
  if (res.ok) return;

  const text = await res.text();
  console.warn(
    updating
      ? `Failed to update PR comment: ${res.status} ${text}`
      : `Failed to create PR comment: ${res.status} ${text}`
  );
}
|
|
2151
2807
|
export {
|
|
2152
2808
|
anthropic,
|
|
2153
2809
|
azureOpenai,
|
|
2810
|
+
compareResults,
|
|
2811
|
+
computeStats,
|
|
2154
2812
|
consoleReporter,
|
|
2155
2813
|
defineArena,
|
|
2814
|
+
detectGitHubContext,
|
|
2156
2815
|
gemini,
|
|
2157
2816
|
jsonReporter,
|
|
2817
|
+
loadBaseline,
|
|
2818
|
+
markdownReporter,
|
|
2158
2819
|
openai,
|
|
2159
2820
|
openaiCompatible,
|
|
2160
|
-
registerPricing
|
|
2821
|
+
registerPricing,
|
|
2822
|
+
saveBaseline,
|
|
2823
|
+
upsertPrComment
|
|
2161
2824
|
};
|
|
2162
2825
|
//# sourceMappingURL=index.js.map
|