agent-duelist 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,13 +32,20 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  anthropic: () => anthropic,
34
34
  azureOpenai: () => azureOpenai,
35
+ compareResults: () => compareResults,
36
+ computeStats: () => computeStats,
35
37
  consoleReporter: () => consoleReporter,
36
38
  defineArena: () => defineArena,
39
+ detectGitHubContext: () => detectGitHubContext,
37
40
  gemini: () => gemini,
38
41
  jsonReporter: () => jsonReporter,
42
+ loadBaseline: () => loadBaseline,
43
+ markdownReporter: () => markdownReporter,
39
44
  openai: () => openai,
40
45
  openaiCompatible: () => openaiCompatible,
41
- registerPricing: () => registerPricing
46
+ registerPricing: () => registerPricing,
47
+ saveBaseline: () => saveBaseline,
48
+ upsertPrComment: () => upsertPrComment
42
49
  });
43
50
  module.exports = __toCommonJS(index_exports);
44
51
 
@@ -1479,7 +1486,142 @@ function jaccardSimilarity(a, b) {
1479
1486
  }
1480
1487
 
1481
1488
  // src/scorers/llm-judge.ts
1489
+ var import_openai2 = __toESM(require("openai"), 1);
1490
+
1491
+ // src/providers/openai.ts
1482
1492
  var import_openai = __toESM(require("openai"), 1);
1493
+ var import_zod_to_json_schema = require("zod-to-json-schema");
1494
+ var REQUEST_TIMEOUT_MS = 6e4;
1495
+ function openai(model, options) {
1496
+ const client = new import_openai.default({
1497
+ apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
1498
+ baseURL: options?.baseURL,
1499
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
1500
+ });
1501
+ return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
1502
+ }
1503
+ function openaiCompatible(options) {
1504
+ const apiKey = options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0) ?? "no-key";
1505
+ const client = new import_openai.default({
1506
+ apiKey,
1507
+ baseURL: options.baseURL,
1508
+ timeout: options.timeoutMs ?? REQUEST_TIMEOUT_MS
1509
+ });
1510
+ if (options.free) {
1511
+ registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
1512
+ }
1513
+ return makeProvider(options.id, options.name, options.model, client, options.model, options.stripThinking);
1514
+ }
1515
+ function azureOpenai(model, options) {
1516
+ const deployment = options?.deployment ?? model;
1517
+ const client = new import_openai.AzureOpenAI({
1518
+ apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
1519
+ endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
1520
+ apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
1521
+ deployment,
1522
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
1523
+ });
1524
+ return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
1525
+ }
1526
+ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1527
+ return {
1528
+ id,
1529
+ name,
1530
+ model,
1531
+ async run(input) {
1532
+ const start = Date.now();
1533
+ const params = {
1534
+ model: requestModel,
1535
+ messages: [{ role: "user", content: input.prompt }]
1536
+ };
1537
+ if (input.schema) {
1538
+ params.response_format = { type: "json_object" };
1539
+ params.messages = [
1540
+ { role: "system", content: "Respond with valid JSON matching the requested schema." },
1541
+ ...params.messages
1542
+ ];
1543
+ }
1544
+ if (input.tools?.length) {
1545
+ params.tools = input.tools.map(toolDefToOpenAI);
1546
+ params.tool_choice = "auto";
1547
+ }
1548
+ const response = await client.chat.completions.create(params, { signal: input.signal });
1549
+ let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
1550
+ let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
1551
+ const choice = response.choices[0];
1552
+ const toolCallsRaw = choice?.message?.tool_calls;
1553
+ const collectedToolCalls = [];
1554
+ let finalResponse = response;
1555
+ if (toolCallsRaw?.length && input.tools?.length) {
1556
+ const toolMessages = [
1557
+ ...params.messages,
1558
+ choice.message
1559
+ ];
1560
+ for (const tc of toolCallsRaw) {
1561
+ const toolDef = input.tools.find((t) => t.name === tc.function.name);
1562
+ let args;
1563
+ try {
1564
+ args = JSON.parse(tc.function.arguments);
1565
+ } catch {
1566
+ args = tc.function.arguments;
1567
+ }
1568
+ let result;
1569
+ if (toolDef?.handler) {
1570
+ result = await toolDef.handler(args);
1571
+ }
1572
+ collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
1573
+ toolMessages.push({
1574
+ role: "tool",
1575
+ tool_call_id: tc.id,
1576
+ content: JSON.stringify(result ?? {})
1577
+ });
1578
+ }
1579
+ const followUp = await client.chat.completions.create({
1580
+ model: requestModel,
1581
+ messages: toolMessages
1582
+ }, { signal: input.signal });
1583
+ totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
1584
+ totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
1585
+ finalResponse = followUp;
1586
+ }
1587
+ const latencyMs = Date.now() - start;
1588
+ const finalChoice = finalResponse.choices[0];
1589
+ let rawContent = finalChoice?.message?.content ?? "";
1590
+ if (stripThinking) {
1591
+ rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
1592
+ }
1593
+ let output = rawContent;
1594
+ if (input.schema) {
1595
+ try {
1596
+ output = JSON.parse(rawContent);
1597
+ } catch {
1598
+ }
1599
+ }
1600
+ return {
1601
+ output,
1602
+ usage: {
1603
+ promptTokens: totalPromptTokens || void 0,
1604
+ completionTokens: totalCompletionTokens || void 0
1605
+ },
1606
+ latencyMs,
1607
+ raw: finalResponse,
1608
+ toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
1609
+ };
1610
+ }
1611
+ };
1612
+ }
1613
+ function toolDefToOpenAI(tool) {
1614
+ return {
1615
+ type: "function",
1616
+ function: {
1617
+ name: tool.name,
1618
+ description: tool.description,
1619
+ parameters: (0, import_zod_to_json_schema.zodToJsonSchema)(tool.parameters, { target: "openAi" })
1620
+ }
1621
+ };
1622
+ }
1623
+
1624
+ // src/scorers/llm-judge.ts
1483
1625
  var JUDGE_PROMPT = `You are a strict scoring judge. Evaluate the actual output against the expected output on three criteria. Score each from 0.0 to 1.0 using the full range (not just 0, 0.5, 1).
1484
1626
 
1485
1627
  Criteria:
@@ -1495,40 +1637,42 @@ conciseness: <number>
1495
1637
  Task: {task}
1496
1638
  Expected: {expected}
1497
1639
  Actual: {actual}`;
1498
- function resolveJudgeClient(configModel) {
1499
- const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-4o-mini";
1640
+ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1641
+ const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-5-mini";
1500
1642
  if (model.startsWith("gemini") && process.env.GOOGLE_API_KEY) {
1501
1643
  return {
1502
- client: new import_openai.default({
1644
+ client: new import_openai2.default({
1503
1645
  apiKey: process.env.GOOGLE_API_KEY,
1504
- baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
1646
+ baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
1647
+ timeout: timeoutMs
1505
1648
  }),
1506
1649
  model
1507
1650
  };
1508
1651
  }
1509
1652
  if (!process.env.OPENAI_API_KEY && process.env.AZURE_OPENAI_API_KEY) {
1510
1653
  return {
1511
- client: new import_openai.AzureOpenAI({
1654
+ client: new import_openai2.AzureOpenAI({
1512
1655
  apiKey: process.env.AZURE_OPENAI_API_KEY,
1513
1656
  endpoint: process.env.AZURE_OPENAI_ENDPOINT,
1514
1657
  apiVersion: process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
1515
- deployment: model
1658
+ deployment: model,
1659
+ timeout: timeoutMs
1516
1660
  }),
1517
1661
  model
1518
1662
  };
1519
1663
  }
1520
1664
  const apiKey = process.env.OPENAI_API_KEY;
1521
1665
  if (!apiKey) return void 0;
1522
- return { client: new import_openai.default({ apiKey }), model };
1666
+ return { client: new import_openai2.default({ apiKey, timeout: timeoutMs }), model };
1523
1667
  }
1524
- function createLlmJudgeScorer(judgeModel) {
1668
+ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1525
1669
  let cached = void 0;
1526
1670
  return async ({ task, result }) => {
1527
1671
  if (task.expected === void 0) {
1528
1672
  return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
1529
1673
  }
1530
1674
  if (cached === void 0) {
1531
- cached = resolveJudgeClient(judgeModel) ?? null;
1675
+ cached = resolveJudgeClient(judgeModel, timeoutMs) ?? null;
1532
1676
  }
1533
1677
  if (!cached) {
1534
1678
  return {
@@ -1601,10 +1745,10 @@ var staticScorers = {
1601
1745
  "fuzzy-similarity": fuzzySimilarityScorer,
1602
1746
  "tool-usage": toolUsageScorer
1603
1747
  };
1604
- function resolveScorers(names, judgeModel) {
1748
+ function resolveScorers(names, judgeModel, timeoutMs) {
1605
1749
  return names.map((name) => {
1606
1750
  if (name === "llm-judge-correctness") {
1607
- return createLlmJudgeScorer(judgeModel);
1751
+ return createLlmJudgeScorer(judgeModel, timeoutMs);
1608
1752
  }
1609
1753
  const scorer = staticScorers[name];
1610
1754
  if (!scorer) {
@@ -1615,19 +1759,41 @@ function resolveScorers(names, judgeModel) {
1615
1759
  }
1616
1760
 
1617
1761
  // src/runner.ts
1762
+ var DEFAULT_TIMEOUT_MS = 6e4;
1763
+ function withTimeout(run, ms) {
1764
+ return new Promise((resolve, reject) => {
1765
+ const controller = new AbortController();
1766
+ const timer = setTimeout(() => {
1767
+ controller.abort();
1768
+ reject(new Error(`Request timed out after ${ms}ms`));
1769
+ }, ms);
1770
+ run(controller.signal).then(
1771
+ (v) => {
1772
+ clearTimeout(timer);
1773
+ resolve(v);
1774
+ },
1775
+ (e) => {
1776
+ clearTimeout(timer);
1777
+ reject(e);
1778
+ }
1779
+ );
1780
+ });
1781
+ }
1618
1782
  async function runBenchmarks(options) {
1619
1783
  const { providers, tasks, scorers, runs, onResult } = options;
1784
+ const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
1620
1785
  const results = [];
1621
1786
  for (const task of tasks) {
1622
1787
  for (const provider of providers) {
1623
1788
  for (let run = 1; run <= runs; run++) {
1624
1789
  let result;
1625
1790
  try {
1626
- const taskResult = await provider.run({
1791
+ const taskResult = await withTimeout((signal) => provider.run({
1627
1792
  prompt: task.prompt,
1628
1793
  schema: task.schema,
1629
- tools: task.tools
1630
- });
1794
+ tools: task.tools,
1795
+ signal
1796
+ }), timeout);
1631
1797
  const scores = await Promise.all(
1632
1798
  scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
1633
1799
  );
@@ -1670,20 +1836,162 @@ var green = "\x1B[32m";
1670
1836
  var red = "\x1B[31m";
1671
1837
  var yellow = "\x1B[33m";
1672
1838
  var cyan = "\x1B[36m";
1839
+ var brightGreen = "\x1B[92m";
1840
+ var brightWhite = "\x1B[97m";
1673
1841
  function bold(s) {
1674
1842
  return `${boldCode}${s}${reset}`;
1675
1843
  }
1676
1844
  function dim(s) {
1677
1845
  return `${dimCode}${s}${reset}`;
1678
1846
  }
1679
- function colorScore(value) {
1680
- const pct = Math.round(value * 100);
1681
- const str = `${pct}%`;
1682
- if (value >= 0.8) return `${green}${str}${reset}`;
1683
- if (value >= 0.5) return `${yellow}${str}${reset}`;
1684
- return `${red}${str}${reset}`;
1847
+ function stripAnsi(s) {
1848
+ return s.replace(/\x1b\[[0-9;]*m/g, "");
1849
+ }
1850
+ function displayWidth(s) {
1851
+ const stripped = stripAnsi(s);
1852
+ let width = 0;
1853
+ for (const ch of stripped) {
1854
+ const code = ch.codePointAt(0) ?? 0;
1855
+ if (code >= 126976) width += 2;
1856
+ else if (code >= 9728 && code <= 10175) width += 2;
1857
+ else width += 1;
1858
+ }
1859
+ return width;
1860
+ }
1861
+ function padCell(str, targetWidth, align) {
1862
+ const dw = displayWidth(str);
1863
+ const padding = Math.max(0, targetWidth - dw);
1864
+ if (align === "right") return " ".repeat(padding) + str;
1865
+ return str + " ".repeat(padding);
1866
+ }
1867
+ function sparkBar(ratio, width = 8) {
1868
+ const clamped = Math.max(0, Math.min(1, ratio));
1869
+ const fillLen = Math.round(clamped * width);
1870
+ const fill = "\u2593".repeat(fillLen);
1871
+ const track = "\u2591".repeat(width - fillLen);
1872
+ return { fill, track };
1873
+ }
1874
+ function drawTableLine(widths, position) {
1875
+ const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
1876
+ if (position === "bottom") {
1877
+ return dim(`\u2514${"\u2500".repeat(totalInner)}\u2518`);
1878
+ }
1879
+ if (position === "merge") {
1880
+ return dim(`\u251C${"\u2500".repeat(totalInner)}\u2524`);
1881
+ }
1882
+ const segments = widths.map((w) => "\u2500".repeat(w + 2));
1883
+ if (position === "top") {
1884
+ return dim(`\u250C${segments.join("\u252C")}\u2510`);
1885
+ }
1886
+ return dim(`\u251C${segments.join("\u253C")}\u2524`);
1887
+ }
1888
+ function drawTableRow(cells, widths, aligns) {
1889
+ const parts = cells.map(
1890
+ (cell, i) => " " + padCell(cell, widths[i], aligns[i]) + " "
1891
+ );
1892
+ return dim("\u2502") + parts.join(dim("\u2502")) + dim("\u2502");
1893
+ }
1894
+ function drawSpanRow(content, widths) {
1895
+ const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
1896
+ const dw = displayWidth(content);
1897
+ const padding = Math.max(0, totalInner - dw - 1);
1898
+ return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
1899
+ }
1900
+ function computeColumnStats(providerData, scorerNames) {
1901
+ const stats = /* @__PURE__ */ new Map();
1902
+ const valid = providerData.filter((p) => !p.allErrors);
1903
+ if (scorerNames.includes("latency")) {
1904
+ const values = /* @__PURE__ */ new Map();
1905
+ for (const p of providerData) {
1906
+ values.set(p.providerId, p.allErrors ? void 0 : p.latencyMs);
1907
+ }
1908
+ const nums = valid.map((p) => p.latencyMs).filter((v) => v !== void 0);
1909
+ stats.set("latency", {
1910
+ values,
1911
+ best: nums.length > 0 ? Math.min(...nums) : void 0,
1912
+ worst: nums.length > 0 ? Math.max(...nums) : void 0
1913
+ });
1914
+ }
1915
+ if (scorerNames.includes("cost")) {
1916
+ const costValues = /* @__PURE__ */ new Map();
1917
+ const tokenValues = /* @__PURE__ */ new Map();
1918
+ for (const p of providerData) {
1919
+ costValues.set(p.providerId, p.allErrors ? void 0 : p.avgDetails.costUsd);
1920
+ tokenValues.set(p.providerId, p.allErrors ? void 0 : p.avgDetails.totalTokens);
1921
+ }
1922
+ const costNums = valid.map((p) => p.avgDetails.costUsd).filter((v) => v !== void 0);
1923
+ const tokenNums = valid.map((p) => p.avgDetails.totalTokens).filter((v) => v !== void 0);
1924
+ stats.set("cost", {
1925
+ values: costValues,
1926
+ best: costNums.length > 0 ? Math.min(...costNums) : void 0,
1927
+ worst: costNums.length > 0 ? Math.max(...costNums) : void 0
1928
+ });
1929
+ stats.set("tokens", {
1930
+ values: tokenValues,
1931
+ best: tokenNums.length > 0 ? Math.min(...tokenNums) : void 0,
1932
+ worst: tokenNums.length > 0 ? Math.max(...tokenNums) : void 0
1933
+ });
1934
+ }
1935
+ for (const name of scorerNames) {
1936
+ if (name === "latency" || name === "cost") continue;
1937
+ const values = /* @__PURE__ */ new Map();
1938
+ for (const p of providerData) {
1939
+ values.set(p.providerId, p.allErrors ? void 0 : p.avgScores[name]);
1940
+ }
1941
+ const nums = valid.map((p) => p.avgScores[name]).filter((v) => v !== void 0);
1942
+ stats.set(name, {
1943
+ values,
1944
+ best: nums.length > 0 ? Math.max(...nums) : void 0,
1945
+ worst: nums.length > 0 ? Math.min(...nums) : void 0
1946
+ });
1947
+ }
1948
+ return stats;
1949
+ }
1950
+ function colorByRank(text, value, colStats, providerCount) {
1951
+ if (value === void 0) return dim("\u2014");
1952
+ if (providerCount < 2) return text;
1953
+ if (colStats.best === void 0 || colStats.worst === void 0) return text;
1954
+ if (colStats.best === colStats.worst) return text;
1955
+ if (value === colStats.best) return `${brightGreen}${boldCode}${text}${reset}`;
1956
+ if (value === colStats.worst) return `${red}${text}${reset}`;
1957
+ return `${yellow}${text}${reset}`;
1685
1958
  }
1686
- function consoleReporter(results) {
1959
+ function computeMedals(columnStats, providerIds) {
1960
+ const medals = /* @__PURE__ */ new Map();
1961
+ if (providerIds.length < 2) {
1962
+ for (const id of providerIds) medals.set(id, "");
1963
+ return medals;
1964
+ }
1965
+ const wins = /* @__PURE__ */ new Map();
1966
+ for (const id of providerIds) wins.set(id, 0);
1967
+ for (const [, colStats] of columnStats) {
1968
+ if (colStats.best === void 0) continue;
1969
+ for (const [providerId, value] of colStats.values) {
1970
+ if (value !== void 0 && value === colStats.best) {
1971
+ wins.set(providerId, (wins.get(providerId) ?? 0) + 1);
1972
+ }
1973
+ }
1974
+ }
1975
+ const totalWins = [...wins.values()].reduce((a, b) => a + b, 0);
1976
+ if (totalWins === 0) {
1977
+ for (const id of providerIds) medals.set(id, "");
1978
+ return medals;
1979
+ }
1980
+ const sorted = [...wins.entries()].sort(
1981
+ (a, b) => b[1] - a[1] || a[0].localeCompare(b[0])
1982
+ );
1983
+ const medalList = ["\u{1F947}", "\u{1F948}", "\u{1F949}"];
1984
+ let rank = 0;
1985
+ for (let i = 0; i < sorted.length; i++) {
1986
+ if (i > 0 && sorted[i][1] < sorted[i - 1][1]) {
1987
+ rank = i;
1988
+ }
1989
+ medals.set(sorted[i][0], rank < medalList.length ? medalList[rank] : "");
1990
+ }
1991
+ return medals;
1992
+ }
1993
+ function consoleReporter(results, options) {
1994
+ const showSparklines = options?.sparklines ?? true;
1687
1995
  if (results.length === 0) {
1688
1996
  console.log("\nNo results to display.\n");
1689
1997
  return;
@@ -1693,78 +2001,155 @@ function consoleReporter(results) {
1693
2001
  const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
1694
2002
  const hasCost = scorerNames.includes("cost");
1695
2003
  const hasErrors = results.some((r) => r.error);
2004
+ const multi = providers.length >= 2;
1696
2005
  const runsPerCell = Math.max(...results.map((r) => r.run));
1697
- const runLabel = runsPerCell > 1 ? ` (${runsPerCell} runs each)` : "";
2006
+ const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
1698
2007
  console.log("");
1699
- console.log(` ${bold(`\u2B21 Agent Duelist Results${runLabel}`)}`);
1700
- console.log(` ${dim("\u2500".repeat(70))}`);
2008
+ console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
2009
+ console.log(` ${dim("\u2501".repeat(72))}`);
1701
2010
  console.log("");
1702
2011
  for (const task of tasks) {
1703
2012
  console.log(` ${bold(`Task: ${task}`)}`);
1704
- const cols = [{ label: "Provider", width: 22, align: "left" }];
1705
- for (const name of scorerNames) {
1706
- if (name === "latency") cols.push({ label: "Latency", width: 10, align: "right" });
1707
- else if (name === "cost") {
1708
- cols.push({ label: "Cost", width: 12, align: "right" });
1709
- cols.push({ label: "Tokens", width: 9, align: "right" });
1710
- } else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
1711
- else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
1712
- else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
1713
- else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
1714
- else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
1715
- else cols.push({ label: name, width: 10, align: "right" });
1716
- }
1717
- if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
1718
- const totalWidth = cols.reduce((sum, c) => sum + c.width + 2, 0);
1719
- console.log(` ${dim(cols.map((c) => pad(c.label, c.width + 2, c.align)).join(""))}`);
1720
- console.log(` ${dim("\u2500".repeat(totalWidth))}`);
1721
- for (const provider of providers) {
1722
- const taskResults = results.filter(
1723
- (r) => r.taskName === task && r.providerId === provider
1724
- );
2013
+ console.log("");
2014
+ const providerData = providers.map((providerId) => {
2015
+ const taskResults = results.filter((r) => r.taskName === task && r.providerId === providerId);
1725
2016
  const errorResults2 = taskResults.filter((r) => r.error);
1726
2017
  const successResults = taskResults.filter((r) => !r.error);
1727
- if (successResults.length === 0 && errorResults2.length > 0) {
1728
- const cells2 = [pad(provider, 24, "left")];
1729
- for (const name of scorerNames) {
1730
- if (name === "cost") {
1731
- cells2.push(pad("\u2014", 14, "right"));
1732
- cells2.push(pad("\u2014", 11, "right"));
1733
- } else cells2.push(pad("\u2014", cols.find((c) => c.label !== "Provider").width + 2, "right"));
1734
- }
1735
- if (hasErrors) cells2.push(` ${red}FAIL${reset}`);
1736
- console.log(` ${cells2.join("")}`);
1737
- continue;
2018
+ if (successResults.length === 0) {
2019
+ return {
2020
+ providerId,
2021
+ avgScores: {},
2022
+ avgDetails: { costUsd: void 0, totalTokens: void 0 },
2023
+ latencyMs: void 0,
2024
+ allErrors: errorResults2.length > 0,
2025
+ errorCount: errorResults2.length
2026
+ };
2027
+ }
2028
+ return {
2029
+ providerId,
2030
+ avgScores: averageScores(successResults),
2031
+ avgDetails: averageDetails(successResults),
2032
+ latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
2033
+ allErrors: false,
2034
+ errorCount: errorResults2.length
2035
+ };
2036
+ });
2037
+ const columnStats = computeColumnStats(providerData, scorerNames);
2038
+ const medals = computeMedals(columnStats, providers);
2039
+ const maxProviderLen = Math.max(...providers.map((id) => id.length));
2040
+ const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
2041
+ const cols = [
2042
+ { label: "Provider", width: providerWidth, align: "left" }
2043
+ ];
2044
+ for (const name of scorerNames) {
2045
+ if (name === "latency") {
2046
+ cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
2047
+ } else if (name === "cost") {
2048
+ cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
2049
+ cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
2050
+ } else {
2051
+ const label = name === "correctness" ? "Match" : name === "schema-correctness" ? "Schema" : name === "fuzzy-similarity" ? "Fuzzy" : name === "llm-judge-correctness" ? "Judge" : name === "tool-usage" ? "Tool" : name;
2052
+ cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
1738
2053
  }
1739
- const avgScores = averageScores(successResults);
1740
- const avgDetails = averageDetails(successResults);
1741
- const latencyMs = average(successResults.map((r) => r.raw.latencyMs));
1742
- const cells = [pad(provider, 24, "left")];
1743
- for (const name of scorerNames) {
1744
- if (name === "latency") {
1745
- cells.push(pad(latencyMs !== void 0 ? `${Math.round(latencyMs)}ms` : "\u2014", 12, "right"));
1746
- } else if (name === "cost") {
1747
- cells.push(pad(formatCost(avgDetails.costUsd), 14, "right"));
1748
- cells.push(pad(avgDetails.totalTokens !== void 0 ? `${avgDetails.totalTokens}` : "\u2014", 11, "right"));
1749
- } else {
1750
- const val = avgScores[name];
1751
- if (val === void 0) cells.push(pad("\u2014", 10, "right"));
1752
- else cells.push(pad(colorScore(val), 10 + colorLen(colorScore(val)), "right"));
2054
+ }
2055
+ if (hasErrors) {
2056
+ cols.push({ label: "Status", width: 8, align: "left" });
2057
+ }
2058
+ const widths = cols.map((c) => c.width);
2059
+ const aligns = cols.map((c) => c.align);
2060
+ console.log(` ${drawTableLine(widths, "top")}`);
2061
+ const headerCells = cols.map((c) => bold(c.label));
2062
+ console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
2063
+ console.log(` ${drawTableLine(widths, "header")}`);
2064
+ for (const pd of providerData) {
2065
+ const medal = medals.get(pd.providerId) ?? "";
2066
+ const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
2067
+ const cells = [providerCell];
2068
+ if (pd.allErrors) {
2069
+ for (const col of cols.slice(1)) {
2070
+ if (col.label === "Status") {
2071
+ cells.push(`${red}FAIL${reset}`);
2072
+ } else {
2073
+ cells.push(dim("\u2014"));
2074
+ }
2075
+ }
2076
+ } else {
2077
+ for (const col of cols.slice(1)) {
2078
+ if (col.label === "Status") {
2079
+ cells.push(
2080
+ pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
2081
+ );
2082
+ continue;
2083
+ }
2084
+ const statsKey = col.statsKey;
2085
+ const colStats = columnStats.get(statsKey);
2086
+ if (statsKey === "latency") {
2087
+ const ms = pd.latencyMs;
2088
+ if (ms === void 0) {
2089
+ cells.push(dim("\u2014"));
2090
+ } else {
2091
+ const text = `${Math.round(ms)}ms`;
2092
+ cells.push(colStats ? colorByRank(text, ms, colStats, providers.length) : text);
2093
+ }
2094
+ } else if (statsKey === "cost") {
2095
+ const cost = pd.avgDetails.costUsd;
2096
+ if (cost === void 0) {
2097
+ cells.push(dim("\u2014"));
2098
+ } else {
2099
+ const text = formatCost(cost);
2100
+ cells.push(colStats ? colorByRank(text, cost, colStats, providers.length) : text);
2101
+ }
2102
+ } else if (statsKey === "tokens") {
2103
+ const tokens = pd.avgDetails.totalTokens;
2104
+ if (tokens === void 0) {
2105
+ cells.push(dim("\u2014"));
2106
+ } else {
2107
+ const text = `${tokens}`;
2108
+ cells.push(colStats ? colorByRank(text, tokens, colStats, providers.length) : text);
2109
+ }
2110
+ } else {
2111
+ const val = pd.avgScores[statsKey];
2112
+ if (val === void 0) {
2113
+ cells.push(dim("\u2014"));
2114
+ } else {
2115
+ const pctStr = `${Math.round(val * 100)}%`.padStart(4);
2116
+ let coloredPct;
2117
+ if (multi && colStats) {
2118
+ coloredPct = colorByRank(pctStr, val, colStats, providers.length);
2119
+ } else {
2120
+ if (val >= 0.8) coloredPct = `${green}${pctStr}${reset}`;
2121
+ else if (val >= 0.5) coloredPct = `${yellow}${pctStr}${reset}`;
2122
+ else coloredPct = `${red}${pctStr}${reset}`;
2123
+ }
2124
+ if (showSparklines) {
2125
+ const { fill, track } = sparkBar(val);
2126
+ const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
2127
+ cells.push(`${coloredPct} ${barColor}${fill}${reset}${dim(track)}`);
2128
+ } else {
2129
+ cells.push(coloredPct);
2130
+ }
2131
+ }
2132
+ }
1753
2133
  }
1754
2134
  }
1755
- if (hasErrors) {
1756
- const failCount = errorResults2.length;
1757
- cells.push(failCount > 0 ? ` ${yellow}${failCount} err${reset}` : ` ${green}OK${reset}`);
2135
+ console.log(` ${drawTableRow(cells, widths, aligns)}`);
2136
+ }
2137
+ if (multi && providerData.some((p) => !p.allErrors)) {
2138
+ const winnerId = [...medals.entries()].find(([, m]) => m === "\u{1F947}")?.[0];
2139
+ if (winnerId) {
2140
+ console.log(` ${drawTableLine(widths, "merge")}`);
2141
+ const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
2142
+ console.log(` ${drawSpanRow(winnerText, widths)}`);
1758
2143
  }
1759
- console.log(` ${cells.join("")}`);
1760
2144
  }
2145
+ console.log(` ${drawTableLine(widths, "bottom")}`);
1761
2146
  console.log("");
1762
2147
  }
1763
2148
  printSummary(results, providers);
1764
2149
  const errorResults = results.filter((r) => r.error);
1765
2150
  if (errorResults.length > 0) {
1766
2151
  console.log(` ${bold("Errors")}`);
1767
- console.log(` ${dim("\u2500".repeat(70))}`);
2152
+ console.log(` ${dim("\u2501".repeat(72))}`);
1768
2153
  const seen = /* @__PURE__ */ new Set();
1769
2154
  for (const r of errorResults) {
1770
2155
  const key = `${r.providerId}::${r.error}`;
@@ -1772,7 +2157,7 @@ function consoleReporter(results) {
1772
2157
  seen.add(key);
1773
2158
  const count = errorResults.filter((e) => e.providerId === r.providerId && e.error === r.error).length;
1774
2159
  const suffix = count > 1 ? ` (\xD7${count})` : "";
1775
- console.log(` ${red}\u2717${reset} ${r.providerId}: ${r.error}${suffix}`);
2160
+ console.log(` ${red}\u2716${reset} ${r.providerId}: ${r.error}${suffix}`);
1776
2161
  const hint = apiKeyHint(r.providerId, r.error ?? "");
1777
2162
  if (hint) console.log(` ${dim(hint)}`);
1778
2163
  }
@@ -1786,15 +2171,20 @@ function consoleReporter(results) {
1786
2171
  function printSummary(results, providers) {
1787
2172
  const successResults = results.filter((r) => !r.error);
1788
2173
  if (successResults.length === 0) return;
1789
- console.log(` ${dim("\u2500".repeat(70))}`);
1790
2174
  console.log(` ${bold("Summary")}`);
2175
+ console.log(` ${dim("\u2501".repeat(72))}`);
1791
2176
  console.log("");
1792
2177
  const single = providers.length === 1;
1793
2178
  const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
1794
2179
  const byCorrectness = rankProviders(successResults, providers, correctnessKey);
1795
2180
  if (byCorrectness) {
1796
- const label = single ? "Avg correctness" : `Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))}`;
1797
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${colorScore(byCorrectness.avg)})`);
2181
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2182
+ const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
2183
+ if (single) {
2184
+ console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
2185
+ } else {
2186
+ console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
2187
+ }
1798
2188
  }
1799
2189
  const byLatency = providers.map((id) => {
1800
2190
  const runs = successResults.filter((r) => r.providerId === id);
@@ -1802,8 +2192,13 @@ function printSummary(results, providers) {
1802
2192
  return { id, avg: avg ?? Infinity };
1803
2193
  }).sort((a, b) => a.avg - b.avg)[0];
1804
2194
  if (byLatency && byLatency.avg !== Infinity) {
1805
- const label = single ? "Avg latency" : `Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}`;
1806
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${Math.round(byLatency.avg)}ms)`);
2195
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2196
+ const msStr = `${Math.round(byLatency.avg)}ms`;
2197
+ if (single) {
2198
+ console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
2199
+ } else {
2200
+ console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
2201
+ }
1807
2202
  }
1808
2203
  const byCost = providers.map((id) => {
1809
2204
  const runs = successResults.filter((r) => r.providerId === id);
@@ -1815,8 +2210,32 @@ function printSummary(results, providers) {
1815
2210
  return { id, avg };
1816
2211
  }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
1817
2212
  if (byCost?.avg !== void 0) {
1818
- const label = single ? "Avg cost" : `Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}`;
1819
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${formatCost(byCost.avg)})`);
2213
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2214
+ const costStr = formatCost(byCost.avg);
2215
+ if (single) {
2216
+ console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
2217
+ } else {
2218
+ console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
2219
+ }
2220
+ }
2221
+ if (!single) {
2222
+ const wins = /* @__PURE__ */ new Map();
2223
+ for (const id of providers) wins.set(id, 0);
2224
+ if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
2225
+ if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
2226
+ if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
2227
+ const maxWins = Math.max(...wins.values());
2228
+ if (maxWins > 0) {
2229
+ const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
2230
+ console.log("");
2231
+ if (topProviders.length === 1) {
2232
+ const [winnerId, winCount] = topProviders[0];
2233
+ console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
2234
+ } else {
2235
+ const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
2236
+ console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
2237
+ }
2238
+ }
1820
2239
  }
1821
2240
  console.log("");
1822
2241
  }
@@ -1878,14 +2297,6 @@ function formatCost(usd) {
1878
2297
  const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
1879
2298
  return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
1880
2299
  }
1881
- function pad(str, width, align) {
1882
- if (align === "right") return str.padStart(width);
1883
- return str.padEnd(width);
1884
- }
1885
- function colorLen(str) {
1886
- const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
1887
- return str.length - stripped.length;
1888
- }
1889
2300
  function apiKeyHint(providerId, error) {
1890
2301
  const lower = error.toLowerCase();
1891
2302
  const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
@@ -1989,7 +2400,7 @@ function defineArena(config) {
1989
2400
  throw new Error("At least one task is required");
1990
2401
  }
1991
2402
  const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
1992
- const scorerFns = resolveScorers(scorerNames, config.judgeModel);
2403
+ const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
1993
2404
  const runs = config.runs ?? 1;
1994
2405
  return {
1995
2406
  config,
@@ -1999,141 +2410,13 @@ function defineArena(config) {
1999
2410
  tasks: config.tasks,
2000
2411
  scorers: scorerFns,
2001
2412
  runs,
2413
+ timeout: config.timeout,
2002
2414
  onResult: options?.onResult
2003
2415
  });
2004
2416
  }
2005
2417
  };
2006
2418
  }
2007
2419
 
2008
- // src/providers/openai.ts
2009
- var import_openai2 = __toESM(require("openai"), 1);
2010
- var import_zod_to_json_schema = require("zod-to-json-schema");
2011
- function openai(model, options) {
2012
- const client = new import_openai2.default({
2013
- apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
2014
- baseURL: options?.baseURL
2015
- });
2016
- return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
2017
- }
2018
- function openaiCompatible(options) {
2019
- const apiKey = options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0) ?? "no-key";
2020
- const client = new import_openai2.default({
2021
- apiKey,
2022
- baseURL: options.baseURL
2023
- });
2024
- if (options.free) {
2025
- registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
2026
- }
2027
- return makeProvider(options.id, options.name, options.model, client, options.model, options.stripThinking);
2028
- }
2029
- function azureOpenai(model, options) {
2030
- const deployment = options?.deployment ?? model;
2031
- const client = new import_openai2.AzureOpenAI({
2032
- apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
2033
- endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
2034
- apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
2035
- deployment
2036
- });
2037
- return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
2038
- }
2039
- function makeProvider(id, name, model, client, requestModel, stripThinking) {
2040
- return {
2041
- id,
2042
- name,
2043
- model,
2044
- async run(input) {
2045
- const start = Date.now();
2046
- const params = {
2047
- model: requestModel,
2048
- messages: [{ role: "user", content: input.prompt }]
2049
- };
2050
- if (input.schema) {
2051
- params.response_format = { type: "json_object" };
2052
- params.messages = [
2053
- { role: "system", content: "Respond with valid JSON matching the requested schema." },
2054
- ...params.messages
2055
- ];
2056
- }
2057
- if (input.tools?.length) {
2058
- params.tools = input.tools.map(toolDefToOpenAI);
2059
- params.tool_choice = "auto";
2060
- }
2061
- const response = await client.chat.completions.create(params);
2062
- let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
2063
- let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
2064
- const choice = response.choices[0];
2065
- const toolCallsRaw = choice?.message?.tool_calls;
2066
- const collectedToolCalls = [];
2067
- let finalResponse = response;
2068
- if (toolCallsRaw?.length && input.tools?.length) {
2069
- const toolMessages = [
2070
- ...params.messages,
2071
- choice.message
2072
- ];
2073
- for (const tc of toolCallsRaw) {
2074
- const toolDef = input.tools.find((t) => t.name === tc.function.name);
2075
- let args;
2076
- try {
2077
- args = JSON.parse(tc.function.arguments);
2078
- } catch {
2079
- args = tc.function.arguments;
2080
- }
2081
- let result;
2082
- if (toolDef?.handler) {
2083
- result = await toolDef.handler(args);
2084
- }
2085
- collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
2086
- toolMessages.push({
2087
- role: "tool",
2088
- tool_call_id: tc.id,
2089
- content: JSON.stringify(result ?? {})
2090
- });
2091
- }
2092
- const followUp = await client.chat.completions.create({
2093
- model: requestModel,
2094
- messages: toolMessages
2095
- });
2096
- totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
2097
- totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
2098
- finalResponse = followUp;
2099
- }
2100
- const latencyMs = Date.now() - start;
2101
- const finalChoice = finalResponse.choices[0];
2102
- let rawContent = finalChoice?.message?.content ?? "";
2103
- if (stripThinking) {
2104
- rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
2105
- }
2106
- let output = rawContent;
2107
- if (input.schema) {
2108
- try {
2109
- output = JSON.parse(rawContent);
2110
- } catch {
2111
- }
2112
- }
2113
- return {
2114
- output,
2115
- usage: {
2116
- promptTokens: totalPromptTokens || void 0,
2117
- completionTokens: totalCompletionTokens || void 0
2118
- },
2119
- latencyMs,
2120
- raw: finalResponse,
2121
- toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
2122
- };
2123
- }
2124
- };
2125
- }
2126
- function toolDefToOpenAI(tool) {
2127
- return {
2128
- type: "function",
2129
- function: {
2130
- name: tool.name,
2131
- description: tool.description,
2132
- parameters: (0, import_zod_to_json_schema.zodToJsonSchema)(tool.parameters, { target: "openAi" })
2133
- }
2134
- };
2135
- }
2136
-
2137
2420
  // src/providers/anthropic.ts
2138
2421
  var import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
2139
2422
  function anthropic(model, options) {
@@ -2153,7 +2436,7 @@ function anthropic(model, options) {
2153
2436
  max_tokens: maxTokens,
2154
2437
  system: systemMessage,
2155
2438
  messages: [{ role: "user", content: input.prompt }]
2156
- });
2439
+ }, { signal: input.signal });
2157
2440
  const latencyMs = Date.now() - start;
2158
2441
  const textBlock = response.content.find((b) => b.type === "text");
2159
2442
  const rawContent = textBlock?.type === "text" ? textBlock.text : "";
@@ -2178,7 +2461,7 @@ function anthropic(model, options) {
2178
2461
  }
2179
2462
 
2180
2463
  // src/providers/gemini.ts
2181
- var import_openai3 = __toESM(require("openai"), 1);
2464
+ var import_openai4 = __toESM(require("openai"), 1);
2182
2465
  function gemini(model, options) {
2183
2466
  const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
2184
2467
  if (!apiKey) {
@@ -2186,22 +2469,409 @@ function gemini(model, options) {
2186
2469
  `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
2187
2470
  );
2188
2471
  }
2189
- const client = new import_openai3.default({
2472
+ const client = new import_openai4.default({
2190
2473
  apiKey,
2191
- baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
2474
+ baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
2475
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
2192
2476
  });
2193
2477
  return makeProvider(`google/${model}`, "Google AI", model, client, model);
2194
2478
  }
2479
+
2480
+ // src/reporter/markdown.ts
2481
// Hidden HTML marker placed on the first line of every generated report.
var COMMENT_MARKER = "<!-- duelist-ci-report -->";

/**
 * Render a CI comparison report as a markdown document.
 *
 * Sections are emitted in a fixed order (heading, comparison table, cost
 * summary, flaky results, failure reasons, footer); each optional section is
 * skipped when the report holds no data for it.
 *
 * @param {object} report - Object with `failed`, `comparisons`, `cost`,
 *   `flakyResults` and `failureReasons`.
 * @param {*} _current - Accepted but not read by this function.
 * @returns {string} Markdown text whose first line is `COMMENT_MARKER`.
 */
function markdownReporter(report, _current) {
  const { comparisons, cost, flakyResults, failureReasons } = report;
  const verdict = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${verdict}`, ""];

  if (comparisons.length > 0) {
    out.push(markdownComparisonTable(comparisons), "");
  }

  // The cost section is shown when money was spent or a budget was configured.
  if (cost.totalUsd > 0 || cost.budget !== void 0) {
    out.push(markdownCostSummary(cost), "");
  }

  if (flakyResults.length > 0) {
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      ""
    );
    for (const flaky of flakyResults) {
      out.push(`- **${flaky.providerId}** \xD7 ${flaky.taskName} \u2192 ${flaky.scorerName} (CV = ${flaky.current.cv.toFixed(2)})`);
    }
    out.push("");
  }

  if (failureReasons.length > 0) {
    out.push("### Failure Reasons", "");
    for (const why of failureReasons) {
      out.push(`- ${why}`);
    }
    out.push("");
  }

  out.push("---", "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
  return out.join("\n");
}
2517
/**
 * Make a free-text value safe to place inside a markdown table cell.
 *
 * A literal `|` would be read as a column separator and a line break would end
 * the row, so pipes are backslash-escaped and line breaks become spaces.
 *
 * @param {*} value - Value to render; converted with `String()`.
 * @returns {string} Text that stays inside a single table cell.
 */
function escapeTableCell(value) {
  return String(value).replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
}

/**
 * Render one markdown table row per comparison entry.
 *
 * Provider, task and scorer names are escaped because they are free-form
 * strings; the numeric columns come from the formatting helpers and are
 * emitted as-is. A missing baseline or delta is shown as an em dash.
 *
 * @param {Array<object>} comparisons - Entries with `providerId`, `taskName`,
 *   `scorerName`, `baseline`, `current` and `delta`.
 * @returns {string} Header, separator and data rows joined with newlines.
 */
function markdownComparisonTable(comparisons) {
  const lines = [];
  lines.push("| Provider | Task | Scorer | Baseline | Current | Delta | Status |");
  lines.push("|----------|------|--------|----------|---------|-------|--------|");
  for (const c of comparisons) {
    const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
    const currentStr = formatStats(c.current);
    const deltaStr = c.delta !== null ? formatDelta(c.delta) : "\u2014";
    const status = statusIndicator(c);
    const provider = escapeTableCell(c.providerId);
    const task = escapeTableCell(c.taskName);
    const scorer = escapeTableCell(c.scorerName);
    lines.push(`| ${provider} | ${task} | ${scorer} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
  }
  return lines.join("\n");
}
2530
/**
 * Render the cost section of the markdown report.
 *
 * Always prints the total. Adds a budget line when a budget is set, and a
 * per-provider table when more than one provider has a recorded cost.
 *
 * @param {object} cost - Object with `totalUsd`, `budget`, `overBudget` and a
 *   `perProvider` Map of provider id to USD.
 * @returns {string} Markdown lines joined with newlines.
 */
function markdownCostSummary(cost) {
  const { totalUsd, budget, perProvider } = cost;
  const section = ["### \u{1F4B0} Cost Summary", "", `**Total:** $${totalUsd.toFixed(4)}`];

  if (budget !== void 0) {
    // A non-positive budget cannot yield a percentage, so show infinity instead.
    const usedPct = budget > 0 ? (totalUsd / budget * 100).toFixed(0) : "\u221E";
    const light = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    section.push(`**Budget:** $${budget.toFixed(2)} (${usedPct}% used) ${light}`);
  }

  if (perProvider.size > 1) {
    section.push("", "| Provider | Cost |", "|----------|------|");
    perProvider.forEach((usd, provider) => {
      section.push(`| ${provider} | $${usd.toFixed(4)} |`);
    });
  }

  return section.join("\n");
}
2550
/**
 * Format summary statistics for a table cell.
 *
 * With more than one sample the mean is followed by "±" and half the width of
 * the 95% interval; otherwise only the mean is shown. Values use 3 decimals.
 *
 * @param {object} stats - Object with `mean`, `n`, `ci95Lower`, `ci95Upper`.
 * @returns {string} Formatted mean, optionally with a margin.
 */
function formatStats(stats) {
  const meanText = stats.mean.toFixed(3);
  if (!(stats.n > 1)) {
    return meanText;
  }
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${meanText} \xB1 ${halfWidth.toFixed(3)}`;
}
2557
/**
 * Format a score difference with an explicit sign and 3 decimals.
 *
 * @param {number} delta - Difference to display.
 * @returns {string} For example "+0.500" or "-0.250"; zero renders with "+".
 */
function formatDelta(delta) {
  const digits = delta.toFixed(3);
  return delta >= 0 ? `+${digits}` : digits;
}
2561
/**
 * Pick the status label for a comparison row.
 *
 * Precedence: regressed, then improved, then "new" (baseline is `null`),
 * otherwise unchanged.
 *
 * @param {object} c - Comparison with `regressed`, `improved`, `baseline`.
 * @returns {string} Emoji-prefixed status text.
 */
function statusIndicator(c) {
  return c.regressed
    ? "\u{1F534} regressed"
    : c.improved
      ? "\u{1F7E2} improved"
      : c.baseline === null
        ? "\u{1F195} new"
        : "\u26AA unchanged";
}
2567
+
2568
+ // src/ci.ts
2569
+ var import_node_fs = require("fs");
2570
+ var import_node_path = require("path");
2571
// Scorer names for which a smaller value is the better outcome.
var LOWER_IS_BETTER = /* @__PURE__ */ new Set(["cost"]);

// A result is reported as flaky when its coefficient of variation exceeds this.
var FLAKY_CV_THRESHOLD = 0.3;

// Student-t critical values used for the 95% interval, keyed by degrees of
// freedom. Only some df values are tabulated; the rest are derived by lookup code.
var T_CRITICAL_95 = {
  1: 12.706,
  2: 4.303,
  3: 3.182,
  4: 2.776,
  5: 2.571,
  6: 2.447,
  7: 2.365,
  8: 2.306,
  9: 2.262,
  10: 2.228,
  15: 2.131,
  20: 2.086,
  25: 2.06,
  30: 2.042
};
2589
/**
 * Look up the 95% critical value for the given degrees of freedom.
 *
 * Tabulated df values are returned directly. A df that falls strictly between
 * two tabulated entries is linearly interpolated. Anything else (df <= 0,
 * df above the largest tabulated key, or no bracketing pair) yields 1.96.
 *
 * @param {number} df - Degrees of freedom.
 * @returns {number} Critical value.
 */
function tCritical(df) {
  const fallback = 1.96;
  if (df <= 0) {
    return fallback;
  }

  const exact = T_CRITICAL_95[df];
  if (exact !== void 0) {
    return exact;
  }

  const tabulated = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
  if (df > tabulated[tabulated.length - 1]) {
    return fallback;
  }

  // Index of the first tabulated key that, with its predecessor, brackets df.
  const upperIdx = tabulated.findIndex((key, idx) => idx > 0 && tabulated[idx - 1] < df && df < key);
  if (upperIdx === -1) {
    return fallback;
  }

  const lo = tabulated[upperIdx - 1];
  const hi = tabulated[upperIdx];
  const fraction = (df - lo) / (hi - lo);
  return T_CRITICAL_95[lo] + fraction * (T_CRITICAL_95[hi] - T_CRITICAL_95[lo]);
}
2603
/**
 * Summarise a list of numeric samples.
 *
 * Returns the mean, the sample standard deviation (n - 1 denominator), the
 * coefficient of variation (0 when the mean is 0), the sample count and a 95%
 * interval built as mean ± tCritical(n - 1) * stddev / sqrt(n). With no
 * samples every field is 0; with one sample the interval collapses to the mean.
 *
 * @param {number[]} samples - Observed values.
 * @returns {{mean: number, stddev: number, cv: number, n: number, ci95Lower: number, ci95Upper: number}}
 */
function computeScorerStats(samples) {
  const n = samples.length;
  if (n === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  let total = 0;
  for (const value of samples) {
    total += value;
  }
  const mean = total / n;
  if (n === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  let squaredDeviations = 0;
  for (const value of samples) {
    squaredDeviations += (value - mean) ** 2;
  }
  const stddev = Math.sqrt(squaredDeviations / (n - 1));
  const cv = mean === 0 ? 0 : stddev / Math.abs(mean);
  const halfWidth = tCritical(n - 1) * (stddev / Math.sqrt(n));

  return {
    mean,
    stddev,
    cv,
    n,
    ci95Lower: mean - halfWidth,
    ci95Upper: mean + halfWidth
  };
}
2626
/**
 * Build the composite map key for one provider/task/scorer combination.
 *
 * @param {string} providerId
 * @param {string} taskName
 * @param {string} scorerName
 * @returns {string} The three parts joined with "::".
 */
function groupKey(providerId, taskName, scorerName) {
  return "".concat(providerId, "::", taskName, "::", scorerName);
}
2629
/**
 * Aggregate benchmark results into per-(provider, task, scorer) statistics.
 *
 * Results with a truthy `error` are ignored, as are individual scores whose
 * value is negative. The remaining values are bucketed by `groupKey` and each
 * bucket is summarised with `computeScorerStats`.
 *
 * @param {Array<object>} results - Entries with `providerId`, `taskName`,
 *   `scores` (objects with `name` and `value`) and optionally `error`.
 * @returns {Map<string, object>} Statistics keyed by composite group key.
 */
function computeStats(results) {
  const samplesByKey = /* @__PURE__ */ new Map();
  for (const result of results) {
    if (result.error) continue;
    for (const { name, value } of result.scores) {
      if (value < 0) continue;
      const key = groupKey(result.providerId, result.taskName, name);
      const bucket = samplesByKey.get(key);
      if (bucket) {
        bucket.push(value);
      } else {
        samplesByKey.set(key, [value]);
      }
    }
  }

  const stats = /* @__PURE__ */ new Map();
  samplesByKey.forEach((samples, key) => {
    stats.set(key, computeScorerStats(samples));
  });
  return stats;
}
2646
/**
 * Total the estimated USD cost of a run, overall and per provider.
 *
 * A result contributes only when it has no error, carries a "cost" score with
 * a non-negative value, and that score's `details.estimatedUsd` is above zero.
 *
 * @param {Array<object>} results - Benchmark results to total.
 * @param {number} [budget] - Optional spending limit in USD.
 * @returns {{totalUsd: number, perProvider: Map<string, number>, budget: (number|undefined), overBudget: boolean}}
 */
function computeCostSummary(results, budget) {
  const perProvider = /* @__PURE__ */ new Map();
  let totalUsd = 0;

  for (const result of results) {
    if (result.error) continue;

    const costScore = result.scores.find((score) => score.name === "cost");
    if (!costScore || costScore.value < 0) continue;

    const spent = costScore.details?.estimatedUsd ?? 0;
    if (spent <= 0) continue;

    totalUsd += spent;
    const soFar = perProvider.get(result.providerId) ?? 0;
    perProvider.set(result.providerId, soFar + spent);
  }

  const overBudget = budget !== void 0 && totalUsd > budget;
  return { totalUsd, perProvider, budget, overBudget };
}
2666
/**
 * Compare current statistics against an optional baseline and build the CI report.
 *
 * Every current entry yields one comparison. When a baseline entry exists the
 * mean delta is recorded, and regression/improvement are evaluated only if a
 * threshold is registered for that scorer. An entry is flaky when it has more
 * than one sample and its CV exceeds `FLAKY_CV_THRESHOLD`. The report fails
 * when any comparison regressed or the cost summary is over budget.
 *
 * @param {Map<string, object>|null|undefined} baselineStats - Baseline statistics by group key.
 * @param {Map<string, object>} currentStats - Current statistics by group key.
 * @param {Map<string, number>} thresholds - Per-scorer thresholds.
 * @param {number} [budget] - Optional budget passed to the cost summary.
 * @param {Array<object>} [currentResults] - Raw results used for the cost summary.
 * @returns {{comparisons: object[], cost: object, failed: boolean, flakyResults: object[], failureReasons: string[]}}
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  const comparisons = Array.from(currentStats, ([key, current]) => {
    const [providerId, taskName, scorerName] = key.split("::");
    const baseline = baselineStats?.get(key) ?? null;

    const delta = baseline ? current.mean - baseline.mean : null;
    // Thresholds are consulted only when there is a baseline to compare with.
    const threshold = baseline ? thresholds.get(scorerName) : void 0;

    let regressed = false;
    let improved = false;
    if (baseline && threshold !== void 0) {
      const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
      regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
      improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
    }

    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
    return { providerId, taskName, scorerName, baseline, current, delta, regressed, improved, flaky };
  });

  const cost = computeCostSummary(currentResults ?? [], budget);

  const failureReasons = comparisons
    .filter((entry) => entry.regressed)
    .map((entry) => `${entry.providerId} \xD7 ${entry.taskName}: ${entry.scorerName} regressed by ${formatDelta2(entry.delta)}`);
  if (cost.overBudget) {
    failureReasons.push(`Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`);
  }

  return {
    comparisons,
    cost,
    failed: failureReasons.length > 0,
    flakyResults: comparisons.filter((entry) => entry.flaky),
    failureReasons
  };
}
2715
/**
 * Decide whether the current statistics are worse than the baseline.
 *
 * With a single sample on each side the raw mean shift is compared with the
 * threshold. Otherwise the 95% interval bounds are used:
 * - lower-is-better: current lower bound exceeds baseline upper bound by more
 *   than the threshold;
 * - higher-is-better: baseline upper bound exceeds current lower bound by more
 *   than the threshold and the current mean is below the baseline mean.
 *
 * NOTE(review): the higher-is-better interval test is not the mirror image of
 * the lower-is-better one (it does not require the intervals to be disjoint) —
 * confirm this asymmetry is intended.
 *
 * @param {object} baseline - Stats with `n`, `mean`, `ci95Lower`, `ci95Upper`.
 * @param {object} current - Stats with the same fields.
 * @param {number} threshold - Minimum change that counts.
 * @param {boolean} lowerIsBetter - Whether smaller values are better.
 * @returns {boolean} True when a regression is detected.
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  const singleShot = baseline.n === 1 && current.n === 1;
  if (singleShot) {
    const shift = current.mean - baseline.mean;
    return lowerIsBetter ? shift > threshold : shift < -threshold;
  }

  if (!lowerIsBetter) {
    const worstCaseDrop = baseline.ci95Upper - current.ci95Lower;
    return worstCaseDrop > threshold && current.mean < baseline.mean;
  }

  return current.ci95Lower - baseline.ci95Upper > threshold;
}
2726
/**
 * Decide whether the current statistics are better than the baseline.
 *
 * With a single sample on each side the raw mean shift is compared with the
 * threshold. Otherwise the side expected to be numerically larger must have a
 * 95% lower bound that exceeds the other side's upper bound by more than the
 * threshold.
 *
 * @param {object} baseline - Stats with `n`, `mean`, `ci95Lower`, `ci95Upper`.
 * @param {object} current - Stats with the same fields.
 * @param {number} threshold - Minimum change that counts.
 * @param {boolean} lowerIsBetter - Whether smaller values are better.
 * @returns {boolean} True when an improvement is detected.
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  if (baseline.n === 1 && current.n === 1) {
    const shift = current.mean - baseline.mean;
    return lowerIsBetter ? shift < -threshold : shift > threshold;
  }

  // For lower-is-better scorers an improvement means the baseline sits above.
  const [larger, smaller] = lowerIsBetter ? [baseline, current] : [current, baseline];
  return larger.ci95Lower - smaller.ci95Upper > threshold;
}
2737
/**
 * Format a score difference with an explicit sign and 4 decimals.
 *
 * @param {number} delta - Difference to display.
 * @returns {string} For example "+0.5000" or "-0.2500"; zero renders with "+".
 */
function formatDelta2(delta) {
  const digits = delta.toFixed(4);
  return delta >= 0 ? `+${digits}` : digits;
}
2741
/**
 * Read a baseline file from disk.
 *
 * The file may hold either an object with a `results` array (and optionally a
 * `timestamp`) or a bare array of results. Any read or parse failure, or a
 * payload whose results are not an array, produces `null` instead of an error.
 *
 * @param {string} path - Location of the baseline JSON file.
 * @returns {{timestamp: string, results: Array}|null} Parsed baseline or `null`.
 */
function loadBaseline(path) {
  try {
    const parsed = JSON.parse((0, import_node_fs.readFileSync)(path, "utf-8"));
    const results = parsed.results ?? parsed;
    return Array.isArray(results)
      ? { timestamp: parsed.timestamp ?? "unknown", results }
      : null;
  } catch {
    // Best-effort: a missing or malformed baseline is reported as "none".
    return null;
  }
}
2755
/**
 * Write results to disk as a baseline file.
 *
 * Creates the parent directory (recursively) if needed, then writes a
 * pretty-printed JSON object holding the current ISO timestamp and the results.
 *
 * @param {string} path - Destination file path.
 * @param {Array} results - Results to store.
 * @returns {void}
 */
function saveBaseline(path, results) {
  const parentDir = (0, import_node_path.dirname)(path);
  (0, import_node_fs.mkdirSync)(parentDir, { recursive: true });

  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const payload = JSON.stringify({ timestamp, results }, null, 2);
  (0, import_node_fs.writeFileSync)(path, payload);
}
2763
+
2764
+ // src/github.ts
2765
+ var import_node_fs2 = require("fs");
2766
/**
 * Work out which GitHub pull request the current process is running for.
 *
 * Reads `GITHUB_TOKEN` and `GITHUB_REPOSITORY` ("owner/repo") from the
 * environment. The PR number is taken from the event payload file named by
 * `GITHUB_EVENT_PATH` (either `pull_request.number`, or `issue.number` when the
 * issue carries a `pull_request` field), falling back to `DUELIST_PR_NUMBER`.
 *
 * @returns {{token: string, owner: string, repo: string, prNumber: number}|null}
 *   The context, or `null` when any required piece is missing.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath
  } = process.env;
  if (!token || !repository) return null;

  const [owner, repo] = repository.split("/");
  if (!owner || !repo) return null;

  // Extract a PR number from the event payload; unreadable payloads yield nothing.
  const prNumberFromEvent = (file) => {
    try {
      const event = JSON.parse((0, import_node_fs2.readFileSync)(file, "utf-8"));
      const pr = event.pull_request;
      if (pr && typeof pr === "object" && pr.number) return pr.number;
      const issue = event.issue;
      if (issue && typeof issue === "object" && issue.pull_request) return issue.number;
    } catch {
    }
    return void 0;
  };

  let prNumber = eventPath ? prNumberFromEvent(eventPath) : void 0;
  if (!prNumber && process.env.DUELIST_PR_NUMBER) {
    prNumber = parseInt(process.env.DUELIST_PR_NUMBER, 10);
  }
  if (!prNumber) return null;

  return { token, owner, repo, prNumber };
}
2796
// Base URL for GitHub REST API requests.
var API_BASE = "https://api.github.com";

/**
 * Find the id of the first PR comment whose body contains the given marker.
 *
 * Walks the issue-comments listing 50 per page until a match is found, an
 * empty or short page is reached, or a request returns a non-OK status.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} marker - Text to look for in each comment body.
 * @returns {Promise<number|null>} Matching comment id, or `null` if none was
 *   found or a request failed.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  const headers = {
    Authorization: `Bearer ${ctx.token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };

  for (let page = 1; ; page++) {
    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
    const res = await fetch(url, { headers });
    if (!res.ok) return null;

    const comments = await res.json();
    if (comments.length === 0) return null;

    const match = comments.find((comment) => comment.body?.includes(marker));
    if (match) return match.id;

    // A short page means there is nothing further to fetch.
    if (comments.length < perPage) return null;
  }
}
2822
/**
 * Create the PR comment, or update it in place if one with the marker exists.
 *
 * Looks up an existing comment via `findExistingComment`; if found it is
 * PATCHed, otherwise a new comment is POSTed to the pull request. A non-OK
 * response is logged with `console.warn` and does not throw.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} body - Comment body to publish.
 * @param {string} marker - Marker used to locate a previous comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const existingId = await findExistingComment(ctx, marker);
  const isUpdate = Boolean(existingId);

  const url = isUpdate
    ? `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/comments/${existingId}`
    : `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;

  const res = await fetch(url, {
    method: isUpdate ? "PATCH" : "POST",
    headers: {
      Authorization: `Bearer ${ctx.token}`,
      Accept: "application/vnd.github+json",
      "Content-Type": "application/json",
      "X-GitHub-Api-Version": "2022-11-28"
    },
    body: JSON.stringify({ body })
  });
  if (res.ok) return;

  const text = await res.text();
  console.warn(
    isUpdate
      ? `Failed to update PR comment: ${res.status} ${text}`
      : `Failed to create PR comment: ${res.status} ${text}`
  );
}
2195
2858
  // Annotate the CommonJS export names for ESM import in node:
2196
2859
  0 && (module.exports = {
2197
2860
  anthropic,
2198
2861
  azureOpenai,
2862
+ compareResults,
2863
+ computeStats,
2199
2864
  consoleReporter,
2200
2865
  defineArena,
2866
+ detectGitHubContext,
2201
2867
  gemini,
2202
2868
  jsonReporter,
2869
+ loadBaseline,
2870
+ markdownReporter,
2203
2871
  openai,
2204
2872
  openaiCompatible,
2205
- registerPricing
2873
+ registerPricing,
2874
+ saveBaseline,
2875
+ upsertPrComment
2206
2876
  });
2207
2877
  //# sourceMappingURL=index.cjs.map