agent-duelist 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1435,7 +1435,142 @@ function jaccardSimilarity(a, b) {
1435
1435
  }
1436
1436
 
1437
1437
  // src/scorers/llm-judge.ts
1438
+ import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
1439
+
1440
+ // src/providers/openai.ts
1438
1441
  import OpenAI, { AzureOpenAI } from "openai";
1442
+ import { zodToJsonSchema } from "zod-to-json-schema";
1443
+ var REQUEST_TIMEOUT_MS = 6e4;
1444
/**
 * Creates a provider that talks to the OpenAI chat-completions API.
 *
 * @param {string} model - Model name; used as the request model and in the `openai/<model>` id.
 * @param {{ apiKey?: string, baseURL?: string, timeoutMs?: number }} [options]
 *   Optional overrides. The API key falls back to `process.env.OPENAI_API_KEY`
 *   and the timeout to `REQUEST_TIMEOUT_MS`.
 * @returns The provider object produced by `makeProvider`.
 */
function openai(model, options) {
  const clientConfig = {
    apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
    baseURL: options?.baseURL,
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const sdkClient = new OpenAI(clientConfig);
  const providerId = `openai/${model}`;
  return makeProvider(providerId, "OpenAI", model, sdkClient, model);
}
1452
/**
 * Creates a provider for any endpoint that speaks the OpenAI chat-completions protocol.
 *
 * API key resolution order: `options.apiKey`, then the environment variable named by
 * `options.apiKeyEnv`, then the literal placeholder "no-key".
 *
 * @param {object} options - Reads `id`, `name`, `model`, `baseURL`, `apiKey`, `apiKeyEnv`,
 *   `timeoutMs`, `free` and `stripThinking`.
 * @returns The provider object produced by `makeProvider`.
 */
function openaiCompatible(options) {
  const keyFromEnv = options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0;
  const apiKey = options.apiKey ?? keyFromEnv ?? "no-key";
  const sdkClient = new OpenAI({
    apiKey,
    baseURL: options.baseURL,
    timeout: options.timeoutMs ?? REQUEST_TIMEOUT_MS
  });
  // Providers flagged as free are registered with zero per-token prices.
  if (options.free) {
    registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
  }
  const { id, name, model, stripThinking } = options;
  return makeProvider(id, name, model, sdkClient, model, stripThinking);
}
1464
/**
 * Creates a provider backed by an Azure OpenAI deployment.
 *
 * @param {string} model - Model name; used in the `azure/<model>` id and as the default deployment.
 * @param {{ apiKey?: string, endpoint?: string, apiVersion?: string, deployment?: string, timeoutMs?: number }} [options]
 *   Optional overrides. Key, endpoint and API version fall back to the `AZURE_OPENAI_*`
 *   environment variables; the API version finally defaults to "2024-12-01-preview".
 * @returns The provider object produced by `makeProvider`; requests are sent with the deployment name.
 */
function azureOpenai(model, options) {
  const env = process.env;
  const deployment = options?.deployment ?? model;
  const clientConfig = {
    apiKey: options?.apiKey ?? env.AZURE_OPENAI_API_KEY,
    endpoint: options?.endpoint ?? env.AZURE_OPENAI_ENDPOINT,
    apiVersion: options?.apiVersion ?? env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
    deployment,
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const sdkClient = new AzureOpenAI(clientConfig);
  return makeProvider(`azure/${model}`, "Azure OpenAI", model, sdkClient, deployment);
}
1475
/**
 * Builds a provider object around an OpenAI-style chat-completions client.
 *
 * The returned `run(input)` sends `input.prompt` as a user message. When `input.schema`
 * is set it requests JSON mode and prepends a system instruction; when `input.tools`
 * is non-empty it advertises them, executes any tool calls the model returns (one round),
 * and issues a follow-up request containing the tool results.
 *
 * @param {string} id - Provider id exposed on the result object.
 * @param {string} name - Display name exposed on the result object.
 * @param {string} model - Model name exposed on the result object.
 * @param {object} client - Client exposing `chat.completions.create(params, { signal })`.
 * @param {string} requestModel - Value sent as `model` in each request.
 * @param {boolean} [stripThinking] - When truthy, `<think>…</think>` blocks are removed from the final content.
 * @returns {{ id: string, name: string, model: string, run: Function }}
 *   `run` resolves to `{ output, usage, latencyMs, raw, toolCalls }`.
 */
function makeProvider(id, name, model, client, requestModel, stripThinking) {
  return {
    id,
    name,
    model,
    async run(input) {
      const start = Date.now();
      const params = {
        model: requestModel,
        messages: [{ role: "user", content: input.prompt }]
      };
      if (input.schema) {
        params.response_format = { type: "json_object" };
        params.messages = [
          { role: "system", content: "Respond with valid JSON matching the requested schema." },
          ...params.messages
        ];
      }
      if (input.tools?.length) {
        params.tools = input.tools.map(toolDefToOpenAI);
        params.tool_choice = "auto";
      }
      const response = await client.chat.completions.create(params, { signal: input.signal });
      let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
      let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
      const choice = response.choices[0];
      const toolCallsRaw = choice?.message?.tool_calls;
      const collectedToolCalls = [];
      let finalResponse = response;
      if (toolCallsRaw?.length && input.tools?.length) {
        const toolMessages = [...params.messages, choice.message];
        for (const tc of toolCallsRaw) {
          const toolDef = input.tools.find((t) => t.name === tc.function.name);
          let args;
          try {
            args = JSON.parse(tc.function.arguments);
          } catch {
            // Arguments that are not valid JSON are passed through as the raw string.
            args = tc.function.arguments;
          }
          let result;
          if (toolDef?.handler) {
            result = await toolDef.handler(args);
          }
          collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
          toolMessages.push({
            role: "tool",
            tool_call_id: tc.id,
            content: JSON.stringify(result ?? {})
          });
        }
        const followUpParams = { model: requestModel, messages: toolMessages };
        // The final answer is parsed as JSON below when a schema is set, so the
        // follow-up request must ask for JSON mode just like the first request did.
        if (params.response_format) {
          followUpParams.response_format = params.response_format;
        }
        const followUp = await client.chat.completions.create(followUpParams, { signal: input.signal });
        totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
        totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
        finalResponse = followUp;
      }
      const latencyMs = Date.now() - start;
      const finalChoice = finalResponse.choices[0];
      let rawContent = finalChoice?.message?.content ?? "";
      if (stripThinking) {
        // Global flag: remove every <think> block, not only the first one.
        rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/g, "");
      }
      let output = rawContent;
      if (input.schema) {
        try {
          output = JSON.parse(rawContent);
        } catch {
          // Best effort: content that is not valid JSON is returned as the raw string.
        }
      }
      return {
        output,
        usage: {
          // A total of 0 is reported as undefined.
          promptTokens: totalPromptTokens || void 0,
          completionTokens: totalCompletionTokens || void 0
        },
        latencyMs,
        raw: finalResponse,
        toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
      };
    }
  };
}
1562
/**
 * Converts a tool definition into the `{ type: "function", function: {...} }` shape
 * placed in the chat-completions `tools` array.
 *
 * @param {{ name: string, description: string, parameters: unknown }} tool
 *   `parameters` is handed to `zodToJsonSchema` with the "openAi" target.
 * @returns {{ type: "function", function: { name: string, description: string, parameters: unknown } }}
 */
function toolDefToOpenAI(tool) {
  const { name, description, parameters } = tool;
  const jsonSchema = zodToJsonSchema(parameters, { target: "openAi" });
  return {
    type: "function",
    function: { name, description, parameters: jsonSchema }
  };
}
1572
+
1573
+ // src/scorers/llm-judge.ts
1439
1574
  var JUDGE_PROMPT = `You are a strict scoring judge. Evaluate the actual output against the expected output on three criteria. Score each from 0.0 to 1.0 using the full range (not just 0, 0.5, 1).
1440
1575
 
1441
1576
  Criteria:
@@ -1451,40 +1586,42 @@ conciseness: <number>
1451
1586
  Task: {task}
1452
1587
  Expected: {expected}
1453
1588
  Actual: {actual}`;
1454
- function resolveJudgeClient(configModel) {
1455
- const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-4o-mini";
1589
+ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1590
+ const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-5-mini";
1456
1591
  if (model.startsWith("gemini") && process.env.GOOGLE_API_KEY) {
1457
1592
  return {
1458
- client: new OpenAI({
1593
+ client: new OpenAI2({
1459
1594
  apiKey: process.env.GOOGLE_API_KEY,
1460
- baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
1595
+ baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
1596
+ timeout: timeoutMs
1461
1597
  }),
1462
1598
  model
1463
1599
  };
1464
1600
  }
1465
1601
  if (!process.env.OPENAI_API_KEY && process.env.AZURE_OPENAI_API_KEY) {
1466
1602
  return {
1467
- client: new AzureOpenAI({
1603
+ client: new AzureOpenAI2({
1468
1604
  apiKey: process.env.AZURE_OPENAI_API_KEY,
1469
1605
  endpoint: process.env.AZURE_OPENAI_ENDPOINT,
1470
1606
  apiVersion: process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
1471
- deployment: model
1607
+ deployment: model,
1608
+ timeout: timeoutMs
1472
1609
  }),
1473
1610
  model
1474
1611
  };
1475
1612
  }
1476
1613
  const apiKey = process.env.OPENAI_API_KEY;
1477
1614
  if (!apiKey) return void 0;
1478
- return { client: new OpenAI({ apiKey }), model };
1615
+ return { client: new OpenAI2({ apiKey, timeout: timeoutMs }), model };
1479
1616
  }
1480
- function createLlmJudgeScorer(judgeModel) {
1617
+ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1481
1618
  let cached = void 0;
1482
1619
  return async ({ task, result }) => {
1483
1620
  if (task.expected === void 0) {
1484
1621
  return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
1485
1622
  }
1486
1623
  if (cached === void 0) {
1487
- cached = resolveJudgeClient(judgeModel) ?? null;
1624
+ cached = resolveJudgeClient(judgeModel, timeoutMs) ?? null;
1488
1625
  }
1489
1626
  if (!cached) {
1490
1627
  return {
@@ -1557,10 +1694,10 @@ var staticScorers = {
1557
1694
  "fuzzy-similarity": fuzzySimilarityScorer,
1558
1695
  "tool-usage": toolUsageScorer
1559
1696
  };
1560
- function resolveScorers(names, judgeModel) {
1697
+ function resolveScorers(names, judgeModel, timeoutMs) {
1561
1698
  return names.map((name) => {
1562
1699
  if (name === "llm-judge-correctness") {
1563
- return createLlmJudgeScorer(judgeModel);
1700
+ return createLlmJudgeScorer(judgeModel, timeoutMs);
1564
1701
  }
1565
1702
  const scorer = staticScorers[name];
1566
1703
  if (!scorer) {
@@ -1571,19 +1708,41 @@ function resolveScorers(names, judgeModel) {
1571
1708
  }
1572
1709
 
1573
1710
  // src/runner.ts
1711
+ var DEFAULT_TIMEOUT_MS = 6e4;
1712
/**
 * Runs `run` with an AbortSignal and rejects if it has not settled within `ms` milliseconds.
 *
 * On timeout the signal is aborted and the returned promise rejects with
 * `Error("Request timed out after <ms>ms")`. The timer is cleared on every other exit
 * path, including a synchronous throw from `run`, so no timer is left pending.
 *
 * @param {(signal: AbortSignal) => unknown} run - Work to start; may return a promise or a plain value.
 * @param {number} ms - Timeout in milliseconds.
 * @returns {Promise<unknown>} Settles with the outcome of `run`, or rejects on timeout.
 */
function withTimeout(run, ms) {
  return new Promise((resolve, reject) => {
    const controller = new AbortController();
    const timer = setTimeout(() => {
      controller.abort();
      reject(new Error(`Request timed out after ${ms}ms`));
    }, ms);
    let pending;
    try {
      pending = run(controller.signal);
    } catch (err) {
      // A synchronous throw must not leave the timer armed.
      clearTimeout(timer);
      reject(err);
      return;
    }
    // Promise.resolve lets `run` return a plain value as well as a promise.
    Promise.resolve(pending).then(
      (value) => {
        clearTimeout(timer);
        resolve(value);
      },
      (err) => {
        clearTimeout(timer);
        reject(err);
      }
    );
  });
}
1574
1731
  async function runBenchmarks(options) {
1575
1732
  const { providers, tasks, scorers, runs, onResult } = options;
1733
+ const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
1576
1734
  const results = [];
1577
1735
  for (const task of tasks) {
1578
1736
  for (const provider of providers) {
1579
1737
  for (let run = 1; run <= runs; run++) {
1580
1738
  let result;
1581
1739
  try {
1582
- const taskResult = await provider.run({
1740
+ const taskResult = await withTimeout((signal) => provider.run({
1583
1741
  prompt: task.prompt,
1584
1742
  schema: task.schema,
1585
- tools: task.tools
1586
- });
1743
+ tools: task.tools,
1744
+ signal
1745
+ }), timeout);
1587
1746
  const scores = await Promise.all(
1588
1747
  scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
1589
1748
  );
@@ -1626,20 +1785,162 @@ var green = "\x1B[32m";
1626
1785
  var red = "\x1B[31m";
1627
1786
  var yellow = "\x1B[33m";
1628
1787
  var cyan = "\x1B[36m";
1788
+ var brightGreen = "\x1B[92m";
1789
+ var brightWhite = "\x1B[97m";
1629
1790
  function bold(s) {
1630
1791
  return `${boldCode}${s}${reset}`;
1631
1792
  }
1632
1793
  function dim(s) {
1633
1794
  return `${dimCode}${s}${reset}`;
1634
1795
  }
1635
- function colorScore(value) {
1636
- const pct = Math.round(value * 100);
1637
- const str = `${pct}%`;
1638
- if (value >= 0.8) return `${green}${str}${reset}`;
1639
- if (value >= 0.5) return `${yellow}${str}${reset}`;
1640
- return `${red}${str}${reset}`;
1796
/**
 * Removes ANSI SGR sequences (ESC `[` digits/semicolons `m`) from a string.
 *
 * @param {string} s - Text that may contain colour/style escape sequences.
 * @returns {string} The text with those sequences removed.
 */
function stripAnsi(s) {
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  return s.replaceAll(sgrSequence, "");
}
1799
/**
 * Estimates how many terminal columns a string occupies once ANSI SGR sequences are removed.
 *
 * Per code point:
 *   - combining marks, variation selectors and zero-width spaces/joiners count as 0;
 *   - code points at or above U+1F000, the U+2600–U+27BF symbol range, and East Asian
 *     wide/fullwidth ranges count as 2;
 *   - everything else counts as 1.
 * This is an approximation; actual rendering depends on the terminal and font.
 *
 * @param {string} s - Text, possibly containing ANSI colour sequences.
 * @returns {number} Estimated column width.
 */
function displayWidth(s) {
  // Nonspacing/enclosing marks (includes variation selectors), ZWSP/ZWNJ/ZWJ and BOM.
  const zeroWidth = /^[\p{Mn}\p{Me}\u200B-\u200D\uFEFF]$/u;
  // Ranges that terminals conventionally render two columns wide (CJK, Hangul, fullwidth forms).
  const isEastAsianWide = (code) =>
    code >= 0x1100 && (
      code <= 0x115F ||
      (code >= 0x2E80 && code <= 0x3247 && code !== 0x303F) ||
      (code >= 0x3250 && code <= 0x4DBF) ||
      (code >= 0x4E00 && code <= 0xA4C6) ||
      (code >= 0xA960 && code <= 0xA97C) ||
      (code >= 0xAC00 && code <= 0xD7A3) ||
      (code >= 0xF900 && code <= 0xFAFF) ||
      (code >= 0xFE10 && code <= 0xFE19) ||
      (code >= 0xFE30 && code <= 0xFE6B) ||
      (code >= 0xFF01 && code <= 0xFF60) ||
      (code >= 0xFFE0 && code <= 0xFFE6)
    );
  let width = 0;
  for (const ch of stripAnsi(s)) {
    const code = ch.codePointAt(0) ?? 0;
    if (zeroWidth.test(ch)) continue;
    if (code >= 0x1F000) width += 2;
    else if (code >= 0x2600 && code <= 0x27BF) width += 2;
    else if (isEastAsianWide(code)) width += 2;
    else width += 1;
  }
  return width;
}
1810
/**
 * Pads `str` with spaces until its display width reaches `targetWidth`.
 * Width is measured with `displayWidth`; text already at or beyond the target
 * is returned without padding.
 *
 * @param {string} str - Cell text.
 * @param {number} targetWidth - Desired display width.
 * @param {string} align - "right" pads on the left; any other value pads on the right.
 * @returns {string} The padded text.
 */
function padCell(str, targetWidth, align) {
  const missing = Math.max(0, targetWidth - displayWidth(str));
  const gap = " ".repeat(missing);
  return align === "right" ? gap + str : str + gap;
}
1816
/**
 * Splits a fixed-width bar into a filled part and an empty track for a ratio in [0, 1].
 *
 * The ratio is clamped to [0, 1]; a ratio that is not a number is treated as 0 so the
 * two parts always add up to `width` characters.
 *
 * @param {number} ratio - Fraction of the bar to fill.
 * @param {number} [width=8] - Total bar length in characters.
 * @returns {{ fill: string, track: string }} The filled segment and the remaining track.
 */
function sparkBar(ratio, width = 8) {
  const bounded = Math.max(0, Math.min(1, ratio));
  // Math.min/Math.max propagate NaN; fall back to an empty fill instead of an empty bar.
  const clamped = Number.isNaN(bounded) ? 0 : bounded;
  const fillLen = Math.round(clamped * width);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(width - fillLen);
  return { fill, track };
}
1823
/**
 * Draws one horizontal rule of the table, dimmed.
 *
 * @param {number[]} widths - Column content widths; each column gets one space of padding per side.
 * @param {string} position - "top", "bottom" or "merge"; any other value draws a header
 *   separator with cross junctions. "bottom" and "merge" are drawn without column junctions.
 * @returns {string} The rule, wrapped by `dim`.
 */
function drawTableLine(widths, position) {
  const innerWidth = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
  // Unbroken rule spanning the full inner width.
  const flat = (left, right) => dim(`${left}${"\u2500".repeat(innerWidth)}${right}`);
  // Rule with a junction character between columns.
  const jointed = (left, joint, right) => {
    const runs = widths.map((w) => "\u2500".repeat(w + 2));
    return dim(`${left}${runs.join(joint)}${right}`);
  };
  switch (position) {
    case "bottom":
      return flat("\u2514", "\u2518");
    case "merge":
      return flat("\u251C", "\u2524");
    case "top":
      return jointed("\u250C", "\u252C", "\u2510");
    default:
      return jointed("\u251C", "\u253C", "\u2524");
  }
}
1837
/**
 * Draws one table row: each cell padded to its column width with one space on each side,
 * separated and enclosed by dimmed vertical bars.
 *
 * @param {string[]} cells - Cell texts (may contain ANSI colour sequences).
 * @param {number[]} widths - Column widths, index-aligned with `cells`.
 * @param {string[]} aligns - Column alignments passed through to `padCell`.
 * @returns {string} The rendered row.
 */
function drawTableRow(cells, widths, aligns) {
  const bar = dim("\u2502");
  const padded = cells.map((cell, col) => {
    const body = padCell(cell, widths[col], aligns[col]);
    return " " + body + " ";
  });
  return bar + padded.join(bar) + bar;
}
1843
/**
 * Draws a row whose content spans every column: a leading space, the content, then
 * spaces up to the table's inner width, enclosed by dimmed vertical bars.
 * Content wider than the table is emitted without padding (it is not truncated).
 *
 * @param {string} content - Text to show (may contain ANSI colour sequences).
 * @param {number[]} widths - Column widths of the table being spanned.
 * @returns {string} The rendered row.
 */
function drawSpanRow(content, widths) {
  const innerWidth = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
  // One column is consumed by the leading space.
  const trailing = Math.max(0, innerWidth - displayWidth(content) - 1);
  const edge = dim("\u2502");
  return edge + " " + content + " ".repeat(trailing) + edge;
}
1849
/**
 * Collects, per table column, every provider's value plus the best and worst value.
 *
 * Columns are keyed "latency", "cost" and "tokens" (the latter two both produced when
 * "cost" is among the scorers), followed by one entry per remaining scorer name.
 * Lower is better for latency, cost and tokens; higher is better for all other scorers.
 * Providers flagged `allErrors` get `undefined` as their value and are ignored for best/worst.
 *
 * @param {Array<object>} providerData - Entries exposing `providerId`, `allErrors`,
 *   `latencyMs`, `avgDetails.costUsd`, `avgDetails.totalTokens` and `avgScores`.
 * @param {string[]} scorerNames - Names of the scorers that were run.
 * @returns {Map<string, { values: Map<string, number|undefined>, best: number|undefined, worst: number|undefined }>}
 */
function computeColumnStats(providerData, scorerNames) {
  const healthy = providerData.filter((entry) => !entry.allErrors);

  // Builds one column summary; `pick` reads the column's value from a provider entry.
  const summarize = (pick, lowerIsBetter) => {
    const values = /* @__PURE__ */ new Map();
    for (const entry of providerData) {
      values.set(entry.providerId, entry.allErrors ? void 0 : pick(entry));
    }
    const present = healthy.map((entry) => pick(entry)).filter((v) => v !== void 0);
    const lowest = present.length > 0 ? Math.min(...present) : void 0;
    const highest = present.length > 0 ? Math.max(...present) : void 0;
    return lowerIsBetter
      ? { values, best: lowest, worst: highest }
      : { values, best: highest, worst: lowest };
  };

  const stats = /* @__PURE__ */ new Map();
  if (scorerNames.includes("latency")) {
    stats.set("latency", summarize((entry) => entry.latencyMs, true));
  }
  if (scorerNames.includes("cost")) {
    stats.set("cost", summarize((entry) => entry.avgDetails.costUsd, true));
    stats.set("tokens", summarize((entry) => entry.avgDetails.totalTokens, true));
  }
  for (const scorer of scorerNames) {
    const handledAbove = scorer === "latency" || scorer === "cost";
    if (!handledAbove) {
      stats.set(scorer, summarize((entry) => entry.avgScores[scorer], false));
    }
  }
  return stats;
}
1899
/**
 * Colours a cell's text according to how its value ranks within the column.
 *
 * Returns a dimmed em dash when `value` is undefined. The text is returned uncoloured
 * when fewer than two providers are compared, when the column has no best/worst, or
 * when best and worst are equal. Otherwise the best value is bright green and bold,
 * the worst is red, and anything in between is yellow.
 *
 * @param {string} text - Already formatted cell text.
 * @param {number|undefined} value - Raw value the text represents.
 * @param {{ best?: number, worst?: number }} colStats - Column summary.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} The text, possibly wrapped in colour codes.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === void 0) return dim("\u2014");
  if (providerCount < 2) return text;
  const { best, worst } = colStats;
  const unranked = best === void 0 || worst === void 0 || best === worst;
  if (unranked) return text;
  switch (value) {
    case best:
      return `${brightGreen}${boldCode}${text}${reset}`;
    case worst:
      return `${red}${text}${reset}`;
    default:
      return `${yellow}${text}${reset}`;
  }
}
1642
- function consoleReporter(results) {
1908
/**
 * Assigns 🥇/🥈/🥉 to providers by the number of columns in which they hold the best value.
 *
 * Providers are ordered by win count (descending), then by id; providers with equal win
 * counts share a rank. Every provider gets an empty string when fewer than two providers
 * are compared or when nobody won any column. A provider with no defined value in any
 * column (for example, one whose runs all failed) never receives a medal.
 *
 * @param {Map<string, { values: Map<string, number|undefined>, best?: number }>} columnStats
 *   Per-column summaries.
 * @param {string[]} providerIds - Providers to rank.
 * @returns {Map<string, string>} Provider id → medal emoji or "". Entries are inserted in ranking order.
 */
function computeMedals(columnStats, providerIds) {
  const medals = /* @__PURE__ */ new Map();
  const withoutMedals = () => {
    for (const id of providerIds) medals.set(id, "");
    return medals;
  };
  if (providerIds.length < 2) return withoutMedals();

  const wins = /* @__PURE__ */ new Map();
  for (const id of providerIds) wins.set(id, 0);
  // Providers that produced at least one defined value in some column.
  const measured = /* @__PURE__ */ new Set();
  for (const [, colStats] of columnStats) {
    for (const [providerId, value] of colStats.values) {
      if (value === void 0) continue;
      measured.add(providerId);
      if (colStats.best !== void 0 && value === colStats.best) {
        wins.set(providerId, (wins.get(providerId) ?? 0) + 1);
      }
    }
  }

  const totalWins = [...wins.values()].reduce((a, b) => a + b, 0);
  if (totalWins === 0) return withoutMedals();

  const sorted = [...wins.entries()].sort(
    (a, b) => b[1] - a[1] || a[0].localeCompare(b[0])
  );
  const medalList = ["\u{1F947}", "\u{1F948}", "\u{1F949}"];
  let rank = 0;
  for (let i = 0; i < sorted.length; i++) {
    // The rank only advances when the win count drops, so ties share a medal.
    if (i > 0 && sorted[i][1] < sorted[i - 1][1]) {
      rank = i;
    }
    const [providerId] = sorted[i];
    const eligible = measured.has(providerId) && rank < medalList.length;
    medals.set(providerId, eligible ? medalList[rank] : "");
  }
  return medals;
}
1942
+ function consoleReporter(results, options) {
1943
+ const showSparklines = options?.sparklines ?? true;
1643
1944
  if (results.length === 0) {
1644
1945
  console.log("\nNo results to display.\n");
1645
1946
  return;
@@ -1649,78 +1950,155 @@ function consoleReporter(results) {
1649
1950
  const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
1650
1951
  const hasCost = scorerNames.includes("cost");
1651
1952
  const hasErrors = results.some((r) => r.error);
1953
+ const multi = providers.length >= 2;
1652
1954
  const runsPerCell = Math.max(...results.map((r) => r.run));
1653
- const runLabel = runsPerCell > 1 ? ` (${runsPerCell} runs each)` : "";
1955
+ const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
1654
1956
  console.log("");
1655
- console.log(` ${bold(`\u2B21 Agent Duelist Results${runLabel}`)}`);
1656
- console.log(` ${dim("\u2500".repeat(70))}`);
1957
+ console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
1958
+ console.log(` ${dim("\u2501".repeat(72))}`);
1657
1959
  console.log("");
1658
1960
  for (const task of tasks) {
1659
1961
  console.log(` ${bold(`Task: ${task}`)}`);
1660
- const cols = [{ label: "Provider", width: 22, align: "left" }];
1661
- for (const name of scorerNames) {
1662
- if (name === "latency") cols.push({ label: "Latency", width: 10, align: "right" });
1663
- else if (name === "cost") {
1664
- cols.push({ label: "Cost", width: 12, align: "right" });
1665
- cols.push({ label: "Tokens", width: 9, align: "right" });
1666
- } else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
1667
- else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
1668
- else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
1669
- else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
1670
- else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
1671
- else cols.push({ label: name, width: 10, align: "right" });
1672
- }
1673
- if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
1674
- const totalWidth = cols.reduce((sum, c) => sum + c.width + 2, 0);
1675
- console.log(` ${dim(cols.map((c) => pad(c.label, c.width + 2, c.align)).join(""))}`);
1676
- console.log(` ${dim("\u2500".repeat(totalWidth))}`);
1677
- for (const provider of providers) {
1678
- const taskResults = results.filter(
1679
- (r) => r.taskName === task && r.providerId === provider
1680
- );
1962
+ console.log("");
1963
+ const providerData = providers.map((providerId) => {
1964
+ const taskResults = results.filter((r) => r.taskName === task && r.providerId === providerId);
1681
1965
  const errorResults2 = taskResults.filter((r) => r.error);
1682
1966
  const successResults = taskResults.filter((r) => !r.error);
1683
- if (successResults.length === 0 && errorResults2.length > 0) {
1684
- const cells2 = [pad(provider, 24, "left")];
1685
- for (const name of scorerNames) {
1686
- if (name === "cost") {
1687
- cells2.push(pad("\u2014", 14, "right"));
1688
- cells2.push(pad("\u2014", 11, "right"));
1689
- } else cells2.push(pad("\u2014", cols.find((c) => c.label !== "Provider").width + 2, "right"));
1690
- }
1691
- if (hasErrors) cells2.push(` ${red}FAIL${reset}`);
1692
- console.log(` ${cells2.join("")}`);
1693
- continue;
1967
+ if (successResults.length === 0) {
1968
+ return {
1969
+ providerId,
1970
+ avgScores: {},
1971
+ avgDetails: { costUsd: void 0, totalTokens: void 0 },
1972
+ latencyMs: void 0,
1973
+ allErrors: errorResults2.length > 0,
1974
+ errorCount: errorResults2.length
1975
+ };
1976
+ }
1977
+ return {
1978
+ providerId,
1979
+ avgScores: averageScores(successResults),
1980
+ avgDetails: averageDetails(successResults),
1981
+ latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
1982
+ allErrors: false,
1983
+ errorCount: errorResults2.length
1984
+ };
1985
+ });
1986
+ const columnStats = computeColumnStats(providerData, scorerNames);
1987
+ const medals = computeMedals(columnStats, providers);
1988
+ const maxProviderLen = Math.max(...providers.map((id) => id.length));
1989
+ const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
1990
+ const cols = [
1991
+ { label: "Provider", width: providerWidth, align: "left" }
1992
+ ];
1993
+ for (const name of scorerNames) {
1994
+ if (name === "latency") {
1995
+ cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
1996
+ } else if (name === "cost") {
1997
+ cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
1998
+ cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
1999
+ } else {
2000
+ const label = name === "correctness" ? "Match" : name === "schema-correctness" ? "Schema" : name === "fuzzy-similarity" ? "Fuzzy" : name === "llm-judge-correctness" ? "Judge" : name === "tool-usage" ? "Tool" : name;
2001
+ cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
1694
2002
  }
1695
- const avgScores = averageScores(successResults);
1696
- const avgDetails = averageDetails(successResults);
1697
- const latencyMs = average(successResults.map((r) => r.raw.latencyMs));
1698
- const cells = [pad(provider, 24, "left")];
1699
- for (const name of scorerNames) {
1700
- if (name === "latency") {
1701
- cells.push(pad(latencyMs !== void 0 ? `${Math.round(latencyMs)}ms` : "\u2014", 12, "right"));
1702
- } else if (name === "cost") {
1703
- cells.push(pad(formatCost(avgDetails.costUsd), 14, "right"));
1704
- cells.push(pad(avgDetails.totalTokens !== void 0 ? `${avgDetails.totalTokens}` : "\u2014", 11, "right"));
1705
- } else {
1706
- const val = avgScores[name];
1707
- if (val === void 0) cells.push(pad("\u2014", 10, "right"));
1708
- else cells.push(pad(colorScore(val), 10 + colorLen(colorScore(val)), "right"));
2003
+ }
2004
+ if (hasErrors) {
2005
+ cols.push({ label: "Status", width: 8, align: "left" });
2006
+ }
2007
+ const widths = cols.map((c) => c.width);
2008
+ const aligns = cols.map((c) => c.align);
2009
+ console.log(` ${drawTableLine(widths, "top")}`);
2010
+ const headerCells = cols.map((c) => bold(c.label));
2011
+ console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
2012
+ console.log(` ${drawTableLine(widths, "header")}`);
2013
+ for (const pd of providerData) {
2014
+ const medal = medals.get(pd.providerId) ?? "";
2015
+ const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
2016
+ const cells = [providerCell];
2017
+ if (pd.allErrors) {
2018
+ for (const col of cols.slice(1)) {
2019
+ if (col.label === "Status") {
2020
+ cells.push(`${red}FAIL${reset}`);
2021
+ } else {
2022
+ cells.push(dim("\u2014"));
2023
+ }
2024
+ }
2025
+ } else {
2026
+ for (const col of cols.slice(1)) {
2027
+ if (col.label === "Status") {
2028
+ cells.push(
2029
+ pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
2030
+ );
2031
+ continue;
2032
+ }
2033
+ const statsKey = col.statsKey;
2034
+ const colStats = columnStats.get(statsKey);
2035
+ if (statsKey === "latency") {
2036
+ const ms = pd.latencyMs;
2037
+ if (ms === void 0) {
2038
+ cells.push(dim("\u2014"));
2039
+ } else {
2040
+ const text = `${Math.round(ms)}ms`;
2041
+ cells.push(colStats ? colorByRank(text, ms, colStats, providers.length) : text);
2042
+ }
2043
+ } else if (statsKey === "cost") {
2044
+ const cost = pd.avgDetails.costUsd;
2045
+ if (cost === void 0) {
2046
+ cells.push(dim("\u2014"));
2047
+ } else {
2048
+ const text = formatCost(cost);
2049
+ cells.push(colStats ? colorByRank(text, cost, colStats, providers.length) : text);
2050
+ }
2051
+ } else if (statsKey === "tokens") {
2052
+ const tokens = pd.avgDetails.totalTokens;
2053
+ if (tokens === void 0) {
2054
+ cells.push(dim("\u2014"));
2055
+ } else {
2056
+ const text = `${tokens}`;
2057
+ cells.push(colStats ? colorByRank(text, tokens, colStats, providers.length) : text);
2058
+ }
2059
+ } else {
2060
+ const val = pd.avgScores[statsKey];
2061
+ if (val === void 0) {
2062
+ cells.push(dim("\u2014"));
2063
+ } else {
2064
+ const pctStr = `${Math.round(val * 100)}%`.padStart(4);
2065
+ let coloredPct;
2066
+ if (multi && colStats) {
2067
+ coloredPct = colorByRank(pctStr, val, colStats, providers.length);
2068
+ } else {
2069
+ if (val >= 0.8) coloredPct = `${green}${pctStr}${reset}`;
2070
+ else if (val >= 0.5) coloredPct = `${yellow}${pctStr}${reset}`;
2071
+ else coloredPct = `${red}${pctStr}${reset}`;
2072
+ }
2073
+ if (showSparklines) {
2074
+ const { fill, track } = sparkBar(val);
2075
+ const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
2076
+ cells.push(`${coloredPct} ${barColor}${fill}${reset}${dim(track)}`);
2077
+ } else {
2078
+ cells.push(coloredPct);
2079
+ }
2080
+ }
2081
+ }
1709
2082
  }
1710
2083
  }
1711
- if (hasErrors) {
1712
- const failCount = errorResults2.length;
1713
- cells.push(failCount > 0 ? ` ${yellow}${failCount} err${reset}` : ` ${green}OK${reset}`);
2084
+ console.log(` ${drawTableRow(cells, widths, aligns)}`);
2085
+ }
2086
+ if (multi && providerData.some((p) => !p.allErrors)) {
2087
+ const winnerId = [...medals.entries()].find(([, m]) => m === "\u{1F947}")?.[0];
2088
+ if (winnerId) {
2089
+ console.log(` ${drawTableLine(widths, "merge")}`);
2090
+ const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
2091
+ console.log(` ${drawSpanRow(winnerText, widths)}`);
1714
2092
  }
1715
- console.log(` ${cells.join("")}`);
1716
2093
  }
2094
+ console.log(` ${drawTableLine(widths, "bottom")}`);
1717
2095
  console.log("");
1718
2096
  }
1719
2097
  printSummary(results, providers);
1720
2098
  const errorResults = results.filter((r) => r.error);
1721
2099
  if (errorResults.length > 0) {
1722
2100
  console.log(` ${bold("Errors")}`);
1723
- console.log(` ${dim("\u2500".repeat(70))}`);
2101
+ console.log(` ${dim("\u2501".repeat(72))}`);
1724
2102
  const seen = /* @__PURE__ */ new Set();
1725
2103
  for (const r of errorResults) {
1726
2104
  const key = `${r.providerId}::${r.error}`;
@@ -1728,7 +2106,7 @@ function consoleReporter(results) {
1728
2106
  seen.add(key);
1729
2107
  const count = errorResults.filter((e) => e.providerId === r.providerId && e.error === r.error).length;
1730
2108
  const suffix = count > 1 ? ` (\xD7${count})` : "";
1731
- console.log(` ${red}\u2717${reset} ${r.providerId}: ${r.error}${suffix}`);
2109
+ console.log(` ${red}\u2716${reset} ${r.providerId}: ${r.error}${suffix}`);
1732
2110
  const hint = apiKeyHint(r.providerId, r.error ?? "");
1733
2111
  if (hint) console.log(` ${dim(hint)}`);
1734
2112
  }
@@ -1742,15 +2120,20 @@ function consoleReporter(results) {
1742
2120
  function printSummary(results, providers) {
1743
2121
  const successResults = results.filter((r) => !r.error);
1744
2122
  if (successResults.length === 0) return;
1745
- console.log(` ${dim("\u2500".repeat(70))}`);
1746
2123
  console.log(` ${bold("Summary")}`);
2124
+ console.log(` ${dim("\u2501".repeat(72))}`);
1747
2125
  console.log("");
1748
2126
  const single = providers.length === 1;
1749
2127
  const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
1750
2128
  const byCorrectness = rankProviders(successResults, providers, correctnessKey);
1751
2129
  if (byCorrectness) {
1752
- const label = single ? "Avg correctness" : `Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))}`;
1753
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${colorScore(byCorrectness.avg)})`);
2130
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2131
+ const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
2132
+ if (single) {
2133
+ console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
2134
+ } else {
2135
+ console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
2136
+ }
1754
2137
  }
1755
2138
  const byLatency = providers.map((id) => {
1756
2139
  const runs = successResults.filter((r) => r.providerId === id);
@@ -1758,8 +2141,13 @@ function printSummary(results, providers) {
1758
2141
  return { id, avg: avg ?? Infinity };
1759
2142
  }).sort((a, b) => a.avg - b.avg)[0];
1760
2143
  if (byLatency && byLatency.avg !== Infinity) {
1761
- const label = single ? "Avg latency" : `Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}`;
1762
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${Math.round(byLatency.avg)}ms)`);
2144
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2145
+ const msStr = `${Math.round(byLatency.avg)}ms`;
2146
+ if (single) {
2147
+ console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
2148
+ } else {
2149
+ console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
2150
+ }
1763
2151
  }
1764
2152
  const byCost = providers.map((id) => {
1765
2153
  const runs = successResults.filter((r) => r.providerId === id);
@@ -1771,8 +2159,32 @@ function printSummary(results, providers) {
1771
2159
  return { id, avg };
1772
2160
  }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
1773
2161
  if (byCost?.avg !== void 0) {
1774
- const label = single ? "Avg cost" : `Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}`;
1775
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${formatCost(byCost.avg)})`);
2162
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2163
+ const costStr = formatCost(byCost.avg);
2164
+ if (single) {
2165
+ console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
2166
+ } else {
2167
+ console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
2168
+ }
2169
+ }
2170
+ if (!single) {
2171
+ const wins = /* @__PURE__ */ new Map();
2172
+ for (const id of providers) wins.set(id, 0);
2173
+ if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
2174
+ if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
2175
+ if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
2176
+ const maxWins = Math.max(...wins.values());
2177
+ if (maxWins > 0) {
2178
+ const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
2179
+ console.log("");
2180
+ if (topProviders.length === 1) {
2181
+ const [winnerId, winCount] = topProviders[0];
2182
+ console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
2183
+ } else {
2184
+ const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
2185
+ console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
2186
+ }
2187
+ }
1776
2188
  }
1777
2189
  console.log("");
1778
2190
  }
@@ -1834,14 +2246,6 @@ function formatCost(usd) {
1834
2246
  const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
1835
2247
  return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
1836
2248
  }
1837
- function pad(str, width, align) {
1838
- if (align === "right") return str.padStart(width);
1839
- return str.padEnd(width);
1840
- }
1841
- function colorLen(str) {
1842
- const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
1843
- return str.length - stripped.length;
1844
- }
1845
2249
  function apiKeyHint(providerId, error) {
1846
2250
  const lower = error.toLowerCase();
1847
2251
  const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
@@ -1945,7 +2349,7 @@ function defineArena(config) {
1945
2349
  throw new Error("At least one task is required");
1946
2350
  }
1947
2351
  const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
1948
- const scorerFns = resolveScorers(scorerNames, config.judgeModel);
2352
+ const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
1949
2353
  const runs = config.runs ?? 1;
1950
2354
  return {
1951
2355
  config,
@@ -1955,141 +2359,13 @@ function defineArena(config) {
1955
2359
  tasks: config.tasks,
1956
2360
  scorers: scorerFns,
1957
2361
  runs,
2362
+ timeout: config.timeout,
1958
2363
  onResult: options?.onResult
1959
2364
  });
1960
2365
  }
1961
2366
  };
1962
2367
  }
1963
2368
 
1964
- // src/providers/openai.ts
1965
- import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
1966
- import { zodToJsonSchema } from "zod-to-json-schema";
1967
- function openai(model, options) {
1968
- const client = new OpenAI2({
1969
- apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
1970
- baseURL: options?.baseURL
1971
- });
1972
- return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
1973
- }
1974
- function openaiCompatible(options) {
1975
- const apiKey = options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0) ?? "no-key";
1976
- const client = new OpenAI2({
1977
- apiKey,
1978
- baseURL: options.baseURL
1979
- });
1980
- if (options.free) {
1981
- registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
1982
- }
1983
- return makeProvider(options.id, options.name, options.model, client, options.model, options.stripThinking);
1984
- }
1985
- function azureOpenai(model, options) {
1986
- const deployment = options?.deployment ?? model;
1987
- const client = new AzureOpenAI2({
1988
- apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
1989
- endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
1990
- apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
1991
- deployment
1992
- });
1993
- return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
1994
- }
1995
- function makeProvider(id, name, model, client, requestModel, stripThinking) {
1996
- return {
1997
- id,
1998
- name,
1999
- model,
2000
- async run(input) {
2001
- const start = Date.now();
2002
- const params = {
2003
- model: requestModel,
2004
- messages: [{ role: "user", content: input.prompt }]
2005
- };
2006
- if (input.schema) {
2007
- params.response_format = { type: "json_object" };
2008
- params.messages = [
2009
- { role: "system", content: "Respond with valid JSON matching the requested schema." },
2010
- ...params.messages
2011
- ];
2012
- }
2013
- if (input.tools?.length) {
2014
- params.tools = input.tools.map(toolDefToOpenAI);
2015
- params.tool_choice = "auto";
2016
- }
2017
- const response = await client.chat.completions.create(params);
2018
- let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
2019
- let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
2020
- const choice = response.choices[0];
2021
- const toolCallsRaw = choice?.message?.tool_calls;
2022
- const collectedToolCalls = [];
2023
- let finalResponse = response;
2024
- if (toolCallsRaw?.length && input.tools?.length) {
2025
- const toolMessages = [
2026
- ...params.messages,
2027
- choice.message
2028
- ];
2029
- for (const tc of toolCallsRaw) {
2030
- const toolDef = input.tools.find((t) => t.name === tc.function.name);
2031
- let args;
2032
- try {
2033
- args = JSON.parse(tc.function.arguments);
2034
- } catch {
2035
- args = tc.function.arguments;
2036
- }
2037
- let result;
2038
- if (toolDef?.handler) {
2039
- result = await toolDef.handler(args);
2040
- }
2041
- collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
2042
- toolMessages.push({
2043
- role: "tool",
2044
- tool_call_id: tc.id,
2045
- content: JSON.stringify(result ?? {})
2046
- });
2047
- }
2048
- const followUp = await client.chat.completions.create({
2049
- model: requestModel,
2050
- messages: toolMessages
2051
- });
2052
- totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
2053
- totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
2054
- finalResponse = followUp;
2055
- }
2056
- const latencyMs = Date.now() - start;
2057
- const finalChoice = finalResponse.choices[0];
2058
- let rawContent = finalChoice?.message?.content ?? "";
2059
- if (stripThinking) {
2060
- rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
2061
- }
2062
- let output = rawContent;
2063
- if (input.schema) {
2064
- try {
2065
- output = JSON.parse(rawContent);
2066
- } catch {
2067
- }
2068
- }
2069
- return {
2070
- output,
2071
- usage: {
2072
- promptTokens: totalPromptTokens || void 0,
2073
- completionTokens: totalCompletionTokens || void 0
2074
- },
2075
- latencyMs,
2076
- raw: finalResponse,
2077
- toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
2078
- };
2079
- }
2080
- };
2081
- }
2082
- function toolDefToOpenAI(tool) {
2083
- return {
2084
- type: "function",
2085
- function: {
2086
- name: tool.name,
2087
- description: tool.description,
2088
- parameters: zodToJsonSchema(tool.parameters, { target: "openAi" })
2089
- }
2090
- };
2091
- }
2092
-
2093
2369
  // src/providers/anthropic.ts
2094
2370
  import Anthropic from "@anthropic-ai/sdk";
2095
2371
  function anthropic(model, options) {
@@ -2109,7 +2385,7 @@ function anthropic(model, options) {
2109
2385
  max_tokens: maxTokens,
2110
2386
  system: systemMessage,
2111
2387
  messages: [{ role: "user", content: input.prompt }]
2112
- });
2388
+ }, { signal: input.signal });
2113
2389
  const latencyMs = Date.now() - start;
2114
2390
  const textBlock = response.content.find((b) => b.type === "text");
2115
2391
  const rawContent = textBlock?.type === "text" ? textBlock.text : "";
@@ -2144,19 +2420,406 @@ function gemini(model, options) {
2144
2420
  }
2145
2421
  const client = new OpenAI3({
2146
2422
  apiKey,
2147
- baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
2423
+ baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
2424
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
2148
2425
  });
2149
2426
  return makeProvider(`google/${model}`, "Google AI", model, client, model);
2150
2427
  }
2428
+
2429
+ // src/reporter/markdown.ts
2430
var COMMENT_MARKER = "<!-- duelist-ci-report -->";
/**
 * Renders a CI comparison report as a Markdown document.
 *
 * The output always starts with COMMENT_MARKER and a pass/fail heading and
 * always ends with a footer. In between, the comparison table, cost summary,
 * flaky-results list and failure reasons are emitted only when they have
 * content.
 *
 * @param {object} report - Reads `failed`, `comparisons`, `cost`,
 *   `flakyResults` and `failureReasons`.
 * @param {unknown} _current - Unused.
 * @returns {string} The Markdown text, lines joined with "\n".
 */
function markdownReporter(report, _current) {
  const verdict = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${verdict}`, ""];
  const { comparisons, cost, flakyResults, failureReasons } = report;

  if (comparisons.length > 0) {
    out.push(markdownComparisonTable(comparisons), "");
  }

  const hasCostInfo = cost.totalUsd > 0 || cost.budget !== void 0;
  if (hasCostInfo) {
    out.push(markdownCostSummary(cost), "");
  }

  if (flakyResults.length > 0) {
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      "",
      ...flakyResults.map(
        (f) => `- **${f.providerId}** \xD7 ${f.taskName} \u2192 ${f.scorerName} (CV = ${f.current.cv.toFixed(2)})`
      ),
      ""
    );
  }

  if (failureReasons.length > 0) {
    out.push("### Failure Reasons", "", ...failureReasons.map((reason) => `- ${reason}`), "");
  }

  out.push("---", "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
  return out.join("\n");
}
2466
/**
 * Builds the Markdown table listing every baseline/current comparison.
 *
 * One row is produced per comparison, in input order, below a fixed header.
 * A missing baseline or a null delta is shown as an em dash.
 *
 * @param {Array<object>} comparisons - Each entry supplies `providerId`,
 *   `taskName`, `scorerName`, `baseline`, `current` and `delta`.
 * @returns {string} The table rows joined with "\n".
 */
function markdownComparisonTable(comparisons) {
  // Provider, task and scorer names are free-form text. An unescaped "|" would
  // open an extra column and a line break would end the row, so both are
  // neutralised before the value is placed in a cell.
  const cell = (text) => String(text).replace(/[\r\n]+/g, " ").replace(/\|/g, "\\|");
  const lines = [
    "| Provider | Task | Scorer | Baseline | Current | Delta | Status |",
    "|----------|------|--------|----------|---------|-------|--------|"
  ];
  for (const c of comparisons) {
    const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
    const currentStr = formatStats(c.current);
    const deltaStr = c.delta !== null ? formatDelta(c.delta) : "\u2014";
    const status = statusIndicator(c);
    lines.push(`| ${cell(c.providerId)} | ${cell(c.taskName)} | ${cell(c.scorerName)} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
  }
  return lines.join("\n");
}
2479
/**
 * Renders the "Cost Summary" Markdown section.
 *
 * Always prints the total. Adds a budget line when `cost.budget` is defined
 * (the percentage is shown as an infinity sign when the budget is not
 * positive). Adds a per-provider table only when more than one provider has
 * an entry.
 *
 * @param {object} cost - Reads `totalUsd`, `budget`, `overBudget` and the
 *   `perProvider` map of provider id to USD.
 * @returns {string} The section lines joined with "\n".
 */
function markdownCostSummary(cost) {
  const { totalUsd, budget, perProvider } = cost;
  const out = ["### \u{1F4B0} Cost Summary", "", `**Total:** $${totalUsd.toFixed(4)}`];

  if (budget !== void 0) {
    const usedPct = budget > 0 ? (totalUsd / budget * 100).toFixed(0) : "\u221E";
    const light = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    out.push(`**Budget:** $${budget.toFixed(2)} (${usedPct}% used) ${light}`);
  }

  if (perProvider.size > 1) {
    out.push("", "| Provider | Cost |", "|----------|------|");
    perProvider.forEach((usd, provider) => {
      out.push(`| ${provider} | $${usd.toFixed(4)} |`);
    });
  }

  return out.join("\n");
}
2499
/**
 * Formats scorer statistics for display.
 *
 * With more than one sample the mean is followed by "±" and half the width
 * of the 95% interval; otherwise only the mean is shown. All numbers use
 * three decimals.
 *
 * @param {object} stats - Reads `n`, `mean`, `ci95Lower` and `ci95Upper`.
 * @returns {string} The formatted value.
 */
function formatStats(stats) {
  const hasSpread = stats.n > 1;
  if (!hasSpread) {
    return stats.mean.toFixed(3);
  }
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${stats.mean.toFixed(3)} \xB1 ${halfWidth.toFixed(3)}`;
}
2506
/**
 * Formats a score delta with three decimals and an explicit "+" for
 * non-negative values.
 *
 * @param {number} delta - Difference to format.
 * @returns {string} For example "+0.125" or "-0.500".
 */
function formatDelta(delta) {
  const magnitude = delta.toFixed(3);
  return delta >= 0 ? `+${magnitude}` : magnitude;
}
2510
/**
 * Picks the status label shown in the comparison table.
 *
 * Precedence: regressed, then improved, then "new" (baseline is exactly
 * null), otherwise unchanged.
 *
 * @param {object} c - Reads `regressed`, `improved` and `baseline`.
 * @returns {string} Emoji-prefixed status label.
 */
function statusIndicator(c) {
  const rules = [
    [c.regressed, "\u{1F534} regressed"],
    [c.improved, "\u{1F7E2} improved"],
    [c.baseline === null, "\u{1F195} new"]
  ];
  const hit = rules.find(([applies]) => applies);
  return hit ? hit[1] : "\u26AA unchanged";
}
2516
+
2517
+ // src/ci.ts
2518
+ import { readFileSync, writeFileSync, mkdirSync } from "fs";
2519
+ import { dirname } from "path";
2520
// Scorers for which a smaller value counts as the better result.
var LOWER_IS_BETTER = /* @__PURE__ */ new Set(["cost"]);
// A result with more than one sample is flagged flaky above this coefficient of variation.
var FLAKY_CV_THRESHOLD = 0.3;
// Two-tailed 95% Student-t critical values, keyed by degrees of freedom.
var T_CRITICAL_95 = {
  1: 12.706,
  2: 4.303,
  3: 3.182,
  4: 2.776,
  5: 2.571,
  6: 2.447,
  7: 2.365,
  8: 2.306,
  9: 2.262,
  10: 2.228,
  15: 2.131,
  20: 2.086,
  25: 2.06,
  30: 2.042
};
/**
 * Looks up the critical value for `df` degrees of freedom in T_CRITICAL_95.
 *
 * Tabulated values are returned as-is and values between two tabulated
 * entries are linearly interpolated. 1.96 is returned when `df` is not
 * positive, exceeds the largest tabulated entry, or falls in no bracket.
 *
 * @param {number} df - Degrees of freedom.
 * @returns {number} The critical value.
 */
function tCritical(df) {
  const NORMAL_FALLBACK = 1.96;
  if (df <= 0) return NORMAL_FALLBACK;

  const tabulated = T_CRITICAL_95[df];
  if (tabulated !== void 0) return tabulated;

  const knots = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
  if (df > knots[knots.length - 1]) return NORMAL_FALLBACK;

  // Index of the upper knot of the open interval that contains df, if any.
  const upperIdx = knots.findIndex((knot, i) => i > 0 && df > knots[i - 1] && df < knot);
  if (upperIdx === -1) return NORMAL_FALLBACK;

  const low = knots[upperIdx - 1];
  const high = knots[upperIdx];
  const ratio = (df - low) / (high - low);
  return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
}
2552
/**
 * Summarises a list of numeric samples.
 *
 * Computes the mean, the sample standard deviation (n - 1 denominator), the
 * coefficient of variation (0 when the mean is 0) and an interval of
 * `mean ± tCritical(n - 1) * stddev / sqrt(n)`. An empty list yields all
 * zeros; a single sample yields zero spread and an interval collapsed onto
 * the mean.
 *
 * @param {number[]} samples - Observed values.
 * @returns {{mean: number, stddev: number, cv: number, n: number, ci95Lower: number, ci95Upper: number}}
 */
function computeScorerStats(samples) {
  const n = samples.length;
  if (n === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  const mean = samples.reduce((acc, value) => acc + value, 0) / n;
  if (n === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  const squaredDeviations = samples.reduce((acc, value) => acc + (value - mean) ** 2, 0);
  const stddev = Math.sqrt(squaredDeviations / (n - 1));
  const cv = mean === 0 ? 0 : stddev / Math.abs(mean);
  const halfWidth = tCritical(n - 1) * (stddev / Math.sqrt(n));

  return {
    mean,
    stddev,
    cv,
    n,
    ci95Lower: mean - halfWidth,
    ci95Upper: mean + halfWidth
  };
}
2575
/**
 * Builds the map key identifying one provider/task/scorer combination.
 *
 * @param {string} providerId
 * @param {string} taskName
 * @param {string} scorerName
 * @returns {string} The three parts joined with "::".
 */
function groupKey(providerId, taskName, scorerName) {
  const sep = "::";
  return `${providerId}${sep}${taskName}${sep}${scorerName}`;
}
2578
/**
 * Groups score values by provider/task/scorer and summarises each group.
 *
 * Results carrying an `error` are ignored, as are individual scores with a
 * negative value. Keys come from groupKey and the per-group summary from
 * computeScorerStats; groups keep first-seen order.
 *
 * @param {Iterable<object>} results - Each result supplies `providerId`,
 *   `taskName`, `scores` (entries with `name` and `value`) and optionally `error`.
 * @returns {Map<string, object>} Group key to statistics.
 */
function computeStats(results) {
  const samplesByKey = /* @__PURE__ */ new Map();
  for (const run of results) {
    if (run.error) continue;
    for (const { name, value } of run.scores) {
      if (value < 0) continue;
      const key = groupKey(run.providerId, run.taskName, name);
      const bucket = samplesByKey.get(key);
      if (bucket) {
        bucket.push(value);
      } else {
        samplesByKey.set(key, [value]);
      }
    }
  }
  return new Map(
    Array.from(samplesByKey, ([key, samples]) => [key, computeScorerStats(samples)])
  );
}
2595
/**
 * Adds up the estimated USD cost of a set of results.
 *
 * A result contributes only when it has no `error`, has a "cost" score with
 * a non-negative value, and that score's `details.estimatedUsd` is greater
 * than zero.
 *
 * @param {Iterable<object>} results - Run results to aggregate.
 * @param {number} [budget] - Optional spending limit in USD.
 * @returns {{totalUsd: number, perProvider: Map<string, number>, budget: (number|undefined), overBudget: boolean}}
 *   `overBudget` is true only when a budget is given and the total exceeds it.
 */
function computeCostSummary(results, budget) {
  const perProvider = /* @__PURE__ */ new Map();
  let totalUsd = 0;

  for (const run of results) {
    if (run.error) continue;
    const costScore = run.scores.find(({ name }) => name === "cost");
    if (!costScore || costScore.value < 0) continue;
    const usd = costScore.details?.estimatedUsd ?? 0;
    if (usd <= 0) continue;

    totalUsd += usd;
    const spentSoFar = perProvider.get(run.providerId) ?? 0;
    perProvider.set(run.providerId, spentSoFar + usd);
  }

  const overBudget = budget !== void 0 && totalUsd > budget;
  return { totalUsd, perProvider, budget, overBudget };
}
2615
/**
 * Compares current scorer statistics against an optional baseline and
 * produces the CI report.
 *
 * For every entry in `currentStats` a comparison row is built. When the same
 * key exists in the baseline, the mean delta is computed, and if the scorer
 * has a threshold the row is checked for regression and improvement. A row
 * is flaky when it has more than one sample and its coefficient of variation
 * exceeds FLAKY_CV_THRESHOLD. The run fails when any row regressed or the
 * total cost exceeds the budget.
 *
 * @param {Map<string, object>|null|undefined} baselineStats - Baseline stats by group key.
 * @param {Map<string, object>} currentStats - Current stats by group key.
 * @param {Map<string, number>} thresholds - Per-scorer regression threshold.
 * @param {number} [budget] - Optional cost budget in USD.
 * @param {Array<object>} [currentResults] - Raw results used for the cost summary.
 * @returns {{comparisons: object[], cost: object, failed: boolean, flakyResults: object[], failureReasons: string[]}}
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  const comparisons = [];
  const failureReasons = [];
  for (const [key, current] of currentStats) {
    const parts = key.split("::");
    let [providerId, taskName, scorerName] = parts;
    if (parts.length > 3) {
      // The task name is free-form and may itself contain "::". Treat the
      // first segment as the provider and the last as the scorer, and give
      // everything in between back to the task name.
      scorerName = parts[parts.length - 1];
      taskName = parts.slice(1, -1).join("::");
    }
    const baseline = baselineStats?.get(key) ?? null;
    let delta = null;
    let regressed = false;
    let improved = false;
    if (baseline) {
      delta = current.mean - baseline.mean;
      const threshold = thresholds.get(scorerName);
      // Scorers without a configured threshold are reported but never gate the run.
      if (threshold !== void 0) {
        const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
        regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
        improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
      }
    }
    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
    comparisons.push({
      providerId,
      taskName,
      scorerName,
      baseline,
      current,
      delta,
      regressed,
      improved,
      flaky
    });
  }
  const cost = computeCostSummary(currentResults ?? [], budget);
  for (const r of comparisons) {
    if (!r.regressed) continue;
    failureReasons.push(
      `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta2(r.delta)}`
    );
  }
  if (cost.overBudget) {
    failureReasons.push(
      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
    );
  }
  const flakyResults = comparisons.filter((c) => c.flaky);
  const failed = failureReasons.length > 0;
  return { comparisons, cost, failed, flakyResults, failureReasons };
}
2664
/**
 * Decides whether `current` is a regression relative to `baseline`.
 *
 * With a single sample on both sides the plain mean difference is compared
 * against the threshold. Otherwise the 95% interval bounds are used:
 * - lower-is-better: the current lower bound must exceed the baseline upper
 *   bound by more than the threshold;
 * - higher-is-better: the baseline upper bound must exceed the current lower
 *   bound by more than the threshold AND the current mean must be below the
 *   baseline mean. This is a widest-gap test, so it can fire while the two
 *   intervals still overlap.
 *
 * @param {object} baseline - Baseline stats (`n`, `mean`, `ci95Lower`, `ci95Upper`).
 * @param {object} current - Current stats, same shape.
 * @param {number} threshold - Minimum change that counts.
 * @param {boolean} lowerIsBetter - True when smaller values are better.
 * @returns {boolean} True when a regression is detected.
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  const singleSamples = baseline.n === 1 && current.n === 1;
  if (singleSamples) {
    const change = current.mean - baseline.mean;
    return lowerIsBetter ? change > threshold : change < -threshold;
  }
  if (!lowerIsBetter) {
    const widestDrop = baseline.ci95Upper - current.ci95Lower;
    return widestDrop > threshold && current.mean < baseline.mean;
  }
  const gap = current.ci95Lower - baseline.ci95Upper;
  return gap > threshold;
}
2675
/**
 * Decides whether `current` is an improvement relative to `baseline`.
 *
 * With a single sample on both sides the plain mean difference is compared
 * against the threshold. Otherwise the better side's interval must clear the
 * other side's interval by more than the threshold: for lower-is-better the
 * baseline lower bound minus the current upper bound, for higher-is-better
 * the current lower bound minus the baseline upper bound.
 *
 * @param {object} baseline - Baseline stats (`n`, `mean`, `ci95Lower`, `ci95Upper`).
 * @param {object} current - Current stats, same shape.
 * @param {number} threshold - Minimum change that counts.
 * @param {boolean} lowerIsBetter - True when smaller values are better.
 * @returns {boolean} True when an improvement is detected.
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  const singleSamples = baseline.n === 1 && current.n === 1;
  if (singleSamples) {
    const change = current.mean - baseline.mean;
    return lowerIsBetter ? change < -threshold : change > threshold;
  }
  const gap = lowerIsBetter
    ? baseline.ci95Lower - current.ci95Upper
    : current.ci95Lower - baseline.ci95Upper;
  return gap > threshold;
}
2686
/**
 * Formats a score delta with four decimals and an explicit "+" for
 * non-negative values.
 *
 * @param {number} delta - Difference to format.
 * @returns {string} For example "+0.0100" or "-0.2500".
 */
function formatDelta2(delta) {
  const magnitude = delta.toFixed(4);
  return delta >= 0 ? `+${magnitude}` : magnitude;
}
2690
/**
 * Reads a baseline file from disk.
 *
 * Accepts either an object with a `results` array or a bare JSON array of
 * results. Best-effort: any failure (missing file, invalid JSON, unexpected
 * shape) yields null instead of throwing.
 *
 * @param {string} path - Location of the baseline JSON file.
 * @returns {{timestamp: string, results: Array}|null} The baseline, with
 *   `timestamp` set to "unknown" when the file does not provide one.
 */
function loadBaseline(path) {
  try {
    const parsed = JSON.parse(readFileSync(path, "utf-8"));
    const runs = parsed.results ?? parsed;
    return Array.isArray(runs)
      ? { timestamp: parsed.timestamp ?? "unknown", results: runs }
      : null;
  } catch {
    // Unreadable or malformed baseline: treated the same as having none.
    return null;
  }
}
2704
/**
 * Writes results to a baseline file, creating parent directories as needed.
 *
 * The file holds pretty-printed JSON (2-space indent) with the current time
 * as an ISO `timestamp` and the given `results`.
 *
 * @param {string} path - Destination file.
 * @param {Array} results - Results to persist.
 * @returns {void}
 */
function saveBaseline(path, results) {
  const targetDir = dirname(path);
  mkdirSync(targetDir, { recursive: true });
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const serialized = JSON.stringify({ timestamp, results }, null, 2);
  writeFileSync(path, serialized);
}
2712
+
2713
+ // src/github.ts
2714
+ import { readFileSync as readFileSync2 } from "fs";
2715
/**
 * Derives the GitHub pull-request context from environment variables.
 *
 * Requires GITHUB_TOKEN and a GITHUB_REPOSITORY of the form "owner/repo".
 * The PR number is taken, in order, from `pull_request.number` in the event
 * payload at GITHUB_EVENT_PATH, from `issue.number` in that payload when the
 * issue has a `pull_request` field, or from DUELIST_PR_NUMBER parsed as a
 * base-10 integer.
 *
 * @returns {{token: string, owner: string, repo: string, prNumber: number}|null}
 *   The context, or null when any required piece is missing.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath
  } = process.env;
  if (!(token && repository)) return null;

  const [owner, repo] = repository.split("/");
  if (!(owner && repo)) return null;

  let prNumber;
  if (eventPath) {
    try {
      const { pull_request: pullRequest, issue } = JSON.parse(readFileSync2(eventPath, "utf-8"));
      if (pullRequest && typeof pullRequest === "object") {
        prNumber = pullRequest.number;
      }
      const issueIsPr = issue && typeof issue === "object" && issue.pull_request;
      if (!prNumber && issueIsPr) {
        prNumber = issue.number;
      }
    } catch {
      // Unreadable or malformed event payload: fall through to the env override.
    }
  }

  const override = process.env.DUELIST_PR_NUMBER;
  if (!prNumber && override) {
    prNumber = parseInt(override, 10);
  }

  return prNumber ? { token, owner, repo, prNumber } : null;
}
2745
var API_BASE = "https://api.github.com";
/**
 * Searches a pull request's comments for one whose body contains `marker`.
 *
 * Comments are fetched 50 per page, page by page. The search stops at the
 * first match, at the first page holding fewer than 50 comments, or as soon
 * as a response is not OK.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} marker - Text to look for in comment bodies.
 * @returns {Promise<*>} The matching comment's `id`, or null when none is
 *   found or a request fails.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  const headers = {
    Authorization: `Bearer ${ctx.token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };
  for (let page = 1; ; page++) {
    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
    const res = await fetch(url, { headers });
    if (!res.ok) return null;

    const comments = await res.json();
    const match = comments.find((comment) => comment.body?.includes(marker));
    if (match) return match.id;

    // A short (or empty) page is the last one.
    if (comments.length < perPage) return null;
  }
}
2771
/**
 * Creates the PR comment, or updates it when one containing `marker`
 * already exists.
 *
 * An existing comment (looked up via findExistingComment) is PATCHed;
 * otherwise a new comment is POSTed to the pull request. A non-OK response
 * is reported with console.warn and does not throw.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} body - Comment text to publish.
 * @param {string} marker - Text identifying a previously published comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const commentId = await findExistingComment(ctx, marker);
  const repoUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}`;
  const request = commentId
    ? {
        method: "PATCH",
        url: `${repoUrl}/issues/comments/${commentId}`,
        failureLabel: "Failed to update PR comment"
      }
    : {
        method: "POST",
        url: `${repoUrl}/issues/${ctx.prNumber}/comments`,
        failureLabel: "Failed to create PR comment"
      };

  const res = await fetch(request.url, {
    method: request.method,
    headers: {
      Authorization: `Bearer ${ctx.token}`,
      Accept: "application/vnd.github+json",
      "Content-Type": "application/json",
      "X-GitHub-Api-Version": "2022-11-28"
    },
    body: JSON.stringify({ body })
  });
  if (res.ok) return;

  const text = await res.text();
  console.warn(`${request.failureLabel}: ${res.status} ${text}`);
}
2151
2807
  export {
2152
2808
  anthropic,
2153
2809
  azureOpenai,
2810
+ compareResults,
2811
+ computeStats,
2154
2812
  consoleReporter,
2155
2813
  defineArena,
2814
+ detectGitHubContext,
2156
2815
  gemini,
2157
2816
  jsonReporter,
2817
+ loadBaseline,
2818
+ markdownReporter,
2158
2819
  openai,
2159
2820
  openaiCompatible,
2160
- registerPricing
2821
+ registerPricing,
2822
+ saveBaseline,
2823
+ upsertPrComment
2161
2824
  };
2162
2825
  //# sourceMappingURL=index.js.map