agent-duelist 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,13 +32,21 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  anthropic: () => anthropic,
34
34
  azureOpenai: () => azureOpenai,
35
+ compareResults: () => compareResults,
36
+ computeStats: () => computeStats,
35
37
  consoleReporter: () => consoleReporter,
36
38
  defineArena: () => defineArena,
39
+ detectGitHubContext: () => detectGitHubContext,
37
40
  gemini: () => gemini,
41
+ htmlReporter: () => htmlReporter,
38
42
  jsonReporter: () => jsonReporter,
43
+ loadBaseline: () => loadBaseline,
44
+ markdownReporter: () => markdownReporter,
39
45
  openai: () => openai,
40
46
  openaiCompatible: () => openaiCompatible,
41
- registerPricing: () => registerPricing
47
+ registerPricing: () => registerPricing,
48
+ saveBaseline: () => saveBaseline,
49
+ upsertPrComment: () => upsertPrComment
42
50
  });
43
51
  module.exports = __toCommonJS(index_exports);
44
52
 
@@ -1454,11 +1462,13 @@ var fuzzySimilarityScorer = ({ task, result }) => {
1454
1462
  }
1455
1463
  const a = stringify(task.expected);
1456
1464
  const b = stringify(result.output);
1457
- const similarity = jaccardSimilarity(tokenize(a), tokenize(b));
1465
+ const setA = tokenize(a);
1466
+ const setB = tokenize(b);
1467
+ const similarity = jaccardSimilarity(setA, setB);
1458
1468
  return {
1459
1469
  name: "fuzzy-similarity",
1460
1470
  value: Math.round(similarity * 100) / 100,
1461
- details: { method: "jaccard", expectedTokens: tokenize(a).size, actualTokens: tokenize(b).size }
1471
+ details: { method: "jaccard", expectedTokens: setA.size, actualTokens: setB.size }
1462
1472
  };
1463
1473
  };
1464
1474
  function stringify(value) {
@@ -1479,7 +1489,163 @@ function jaccardSimilarity(a, b) {
1479
1489
  }
1480
1490
 
1481
1491
  // src/scorers/llm-judge.ts
1492
+ var import_openai2 = __toESM(require("openai"), 1);
1493
+
1494
+ // src/providers/openai.ts
1482
1495
  var import_openai = __toESM(require("openai"), 1);
1496
+ var import_zod_to_json_schema = require("zod-to-json-schema");
1497
+
1498
+ // src/providers/shared.ts
1499
+ var SCHEMA_SYSTEM_MESSAGE = "Respond with valid JSON matching the requested schema.";
1500
+ function parseSchemaOutput(rawContent, hasSchema) {
1501
+ if (!hasSchema) return rawContent;
1502
+ try {
1503
+ return JSON.parse(rawContent);
1504
+ } catch {
1505
+ return rawContent;
1506
+ }
1507
+ }
1508
+
1509
+ // src/providers/openai.ts
1510
+ var REQUEST_TIMEOUT_MS = 6e4;
1511
+ function openai(model, options) {
1512
+ const client = new import_openai.default({
1513
+ apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
1514
+ baseURL: options?.baseURL,
1515
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
1516
+ });
1517
+ return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
1518
+ }
1519
+ function openaiCompatible(options) {
1520
+ const apiKey = options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0) ?? "no-key";
1521
+ const client = new import_openai.default({
1522
+ apiKey,
1523
+ baseURL: options.baseURL,
1524
+ timeout: options.timeoutMs ?? REQUEST_TIMEOUT_MS
1525
+ });
1526
+ if (options.free) {
1527
+ registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
1528
+ }
1529
+ return makeProvider(options.id, options.name, options.model, client, options.model, options.stripThinking);
1530
+ }
1531
+ function azureOpenai(model, options) {
1532
+ const deployment = options?.deployment ?? model;
1533
+ const client = new import_openai.AzureOpenAI({
1534
+ apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
1535
+ endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
1536
+ apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
1537
+ deployment,
1538
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
1539
+ });
1540
+ return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
1541
+ }
1542
+ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1543
+ return {
1544
+ id,
1545
+ name,
1546
+ model,
1547
+ async run(input) {
1548
+ const start = Date.now();
1549
+ const params = {
1550
+ model: requestModel,
1551
+ messages: [{ role: "user", content: input.prompt }]
1552
+ };
1553
+ if (input.schema) {
1554
+ params.response_format = { type: "json_object" };
1555
+ params.messages = [
1556
+ { role: "system", content: SCHEMA_SYSTEM_MESSAGE },
1557
+ ...params.messages
1558
+ ];
1559
+ }
1560
+ if (input.tools?.length) {
1561
+ params.tools = input.tools.map(toolDefToOpenAI);
1562
+ params.tool_choice = "auto";
1563
+ }
1564
+ const response = await client.chat.completions.create(params, { signal: input.signal });
1565
+ let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
1566
+ let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
1567
+ const choice = response.choices[0];
1568
+ const toolCallsRaw = choice?.message?.tool_calls;
1569
+ const collectedToolCalls = [];
1570
+ let finalResponse = response;
1571
+ if (toolCallsRaw?.length && input.tools?.length) {
1572
+ const toolMessages = [
1573
+ ...params.messages,
1574
+ choice.message
1575
+ ];
1576
+ for (const tc of toolCallsRaw) {
1577
+ const toolDef = input.tools.find((t) => t.name === tc.function.name);
1578
+ let args;
1579
+ try {
1580
+ args = JSON.parse(tc.function.arguments);
1581
+ } catch {
1582
+ args = tc.function.arguments;
1583
+ }
1584
+ let result;
1585
+ if (toolDef?.handler) {
1586
+ result = await toolDef.handler(args);
1587
+ }
1588
+ collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
1589
+ toolMessages.push({
1590
+ role: "tool",
1591
+ tool_call_id: tc.id,
1592
+ content: JSON.stringify(result ?? {})
1593
+ });
1594
+ }
1595
+ const followUp = await client.chat.completions.create({
1596
+ model: requestModel,
1597
+ messages: toolMessages
1598
+ }, { signal: input.signal });
1599
+ totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
1600
+ totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
1601
+ finalResponse = followUp;
1602
+ }
1603
+ const latencyMs = Date.now() - start;
1604
+ const finalChoice = finalResponse.choices[0];
1605
+ let rawContent = finalChoice?.message?.content ?? "";
1606
+ if (stripThinking) {
1607
+ rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
1608
+ }
1609
+ const output = parseSchemaOutput(rawContent, !!input.schema);
1610
+ return {
1611
+ output,
1612
+ usage: {
1613
+ promptTokens: totalPromptTokens || void 0,
1614
+ completionTokens: totalCompletionTokens || void 0
1615
+ },
1616
+ latencyMs,
1617
+ raw: finalResponse,
1618
+ toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
1619
+ };
1620
+ }
1621
+ };
1622
+ }
1623
+ function gemini(model, options) {
1624
+ const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
1625
+ if (!apiKey) {
1626
+ throw new Error(
1627
+ `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
1628
+ );
1629
+ }
1630
+ const client = new import_openai.default({
1631
+ apiKey,
1632
+ baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
1633
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
1634
+ });
1635
+ return makeProvider(`google/${model}`, "Google AI", model, client, model);
1636
+ }
1637
+ function toolDefToOpenAI(tool) {
1638
+ return {
1639
+ type: "function",
1640
+ function: {
1641
+ name: tool.name,
1642
+ description: tool.description,
1643
+ parameters: (0, import_zod_to_json_schema.zodToJsonSchema)(tool.parameters, { target: "openAi" })
1644
+ }
1645
+ };
1646
+ }
1647
+
1648
+ // src/scorers/llm-judge.ts
1483
1649
  var JUDGE_PROMPT = `You are a strict scoring judge. Evaluate the actual output against the expected output on three criteria. Score each from 0.0 to 1.0 using the full range (not just 0, 0.5, 1).
1484
1650
 
1485
1651
  Criteria:
@@ -1495,40 +1661,42 @@ conciseness: <number>
1495
1661
  Task: {task}
1496
1662
  Expected: {expected}
1497
1663
  Actual: {actual}`;
1498
- function resolveJudgeClient(configModel) {
1499
- const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-4o-mini";
1664
+ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1665
+ const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-5-mini";
1500
1666
  if (model.startsWith("gemini") && process.env.GOOGLE_API_KEY) {
1501
1667
  return {
1502
- client: new import_openai.default({
1668
+ client: new import_openai2.default({
1503
1669
  apiKey: process.env.GOOGLE_API_KEY,
1504
- baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
1670
+ baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
1671
+ timeout: timeoutMs
1505
1672
  }),
1506
1673
  model
1507
1674
  };
1508
1675
  }
1509
1676
  if (!process.env.OPENAI_API_KEY && process.env.AZURE_OPENAI_API_KEY) {
1510
1677
  return {
1511
- client: new import_openai.AzureOpenAI({
1678
+ client: new import_openai2.AzureOpenAI({
1512
1679
  apiKey: process.env.AZURE_OPENAI_API_KEY,
1513
1680
  endpoint: process.env.AZURE_OPENAI_ENDPOINT,
1514
1681
  apiVersion: process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
1515
- deployment: model
1682
+ deployment: model,
1683
+ timeout: timeoutMs
1516
1684
  }),
1517
1685
  model
1518
1686
  };
1519
1687
  }
1520
1688
  const apiKey = process.env.OPENAI_API_KEY;
1521
1689
  if (!apiKey) return void 0;
1522
- return { client: new import_openai.default({ apiKey }), model };
1690
+ return { client: new import_openai2.default({ apiKey, timeout: timeoutMs }), model };
1523
1691
  }
1524
- function createLlmJudgeScorer(judgeModel) {
1692
+ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1525
1693
  let cached = void 0;
1526
1694
  return async ({ task, result }) => {
1527
1695
  if (task.expected === void 0) {
1528
1696
  return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
1529
1697
  }
1530
1698
  if (cached === void 0) {
1531
- cached = resolveJudgeClient(judgeModel) ?? null;
1699
+ cached = resolveJudgeClient(judgeModel, timeoutMs) ?? null;
1532
1700
  }
1533
1701
  if (!cached) {
1534
1702
  return {
@@ -1543,8 +1711,7 @@ function createLlmJudgeScorer(judgeModel) {
1543
1711
  const response = await client.chat.completions.create({
1544
1712
  model,
1545
1713
  messages: [{ role: "user", content: prompt }],
1546
- temperature: 0,
1547
- max_tokens: 2048
1714
+ max_completion_tokens: 2048
1548
1715
  });
1549
1716
  const content = response.choices[0]?.message?.content?.trim() ?? "";
1550
1717
  const parsed = {};
@@ -1601,10 +1768,10 @@ var staticScorers = {
1601
1768
  "fuzzy-similarity": fuzzySimilarityScorer,
1602
1769
  "tool-usage": toolUsageScorer
1603
1770
  };
1604
- function resolveScorers(names, judgeModel) {
1771
+ function resolveScorers(names, judgeModel, timeoutMs) {
1605
1772
  return names.map((name) => {
1606
1773
  if (name === "llm-judge-correctness") {
1607
- return createLlmJudgeScorer(judgeModel);
1774
+ return createLlmJudgeScorer(judgeModel, timeoutMs);
1608
1775
  }
1609
1776
  const scorer = staticScorers[name];
1610
1777
  if (!scorer) {
@@ -1615,219 +1782,156 @@ function resolveScorers(names, judgeModel) {
1615
1782
  }
1616
1783
 
1617
1784
  // src/runner.ts
1785
+ var DEFAULT_TIMEOUT_MS = 6e4;
1786
+ function withTimeout(run, ms) {
1787
+ return new Promise((resolve, reject) => {
1788
+ const controller = new AbortController();
1789
+ const timer = setTimeout(() => {
1790
+ controller.abort();
1791
+ reject(new Error(`Request timed out after ${ms}ms`));
1792
+ }, ms);
1793
+ run(controller.signal).then(
1794
+ (v) => {
1795
+ clearTimeout(timer);
1796
+ resolve(v);
1797
+ },
1798
+ (e) => {
1799
+ clearTimeout(timer);
1800
+ reject(e);
1801
+ }
1802
+ );
1803
+ });
1804
+ }
1618
1805
  async function runBenchmarks(options) {
1619
1806
  const { providers, tasks, scorers, runs, onResult } = options;
1807
+ const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
1620
1808
  const results = [];
1621
1809
  for (const task of tasks) {
1622
- for (const provider of providers) {
1623
- for (let run = 1; run <= runs; run++) {
1624
- let result;
1625
- try {
1626
- const taskResult = await provider.run({
1627
- prompt: task.prompt,
1628
- schema: task.schema,
1629
- tools: task.tools
1630
- });
1631
- const scores = await Promise.all(
1632
- scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
1633
- );
1634
- result = {
1635
- providerId: provider.id,
1636
- taskName: task.name,
1637
- run,
1638
- scores,
1639
- raw: {
1640
- output: taskResult.output,
1641
- latencyMs: taskResult.latencyMs,
1642
- usage: taskResult.usage,
1643
- toolCalls: taskResult.toolCalls
1644
- }
1645
- };
1646
- } catch (err) {
1647
- const message = err instanceof Error ? err.message : String(err);
1648
- result = {
1649
- providerId: provider.id,
1650
- taskName: task.name,
1651
- run,
1652
- scores: [],
1653
- error: message,
1654
- raw: { output: "", latencyMs: 0 }
1655
- };
1656
- }
1657
- results.push(result);
1658
- onResult?.(result);
1659
- }
1810
+ for (let run = 1; run <= runs; run++) {
1811
+ const runResults = await Promise.all(
1812
+ providers.map(async (provider) => {
1813
+ let result;
1814
+ try {
1815
+ const taskResult = await withTimeout((signal) => provider.run({
1816
+ prompt: task.prompt,
1817
+ schema: task.schema,
1818
+ tools: task.tools,
1819
+ signal
1820
+ }), timeout);
1821
+ const scores = await Promise.all(
1822
+ scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
1823
+ );
1824
+ result = {
1825
+ providerId: provider.id,
1826
+ taskName: task.name,
1827
+ run,
1828
+ scores,
1829
+ raw: {
1830
+ output: taskResult.output,
1831
+ latencyMs: taskResult.latencyMs,
1832
+ usage: taskResult.usage,
1833
+ toolCalls: taskResult.toolCalls
1834
+ }
1835
+ };
1836
+ } catch (err) {
1837
+ const message = err instanceof Error ? err.message : String(err);
1838
+ result = {
1839
+ providerId: provider.id,
1840
+ taskName: task.name,
1841
+ run,
1842
+ scores: [],
1843
+ error: message,
1844
+ raw: { output: "", latencyMs: 0 }
1845
+ };
1846
+ }
1847
+ onResult?.(result);
1848
+ return result;
1849
+ })
1850
+ );
1851
+ results.push(...runResults);
1660
1852
  }
1661
1853
  }
1662
1854
  return results;
1663
1855
  }
1664
1856
 
1665
- // src/reporter/console.ts
1666
- var reset = "\x1B[0m";
1667
- var boldCode = "\x1B[1m";
1668
- var dimCode = "\x1B[2m";
1669
- var green = "\x1B[32m";
1670
- var red = "\x1B[31m";
1671
- var yellow = "\x1B[33m";
1672
- var cyan = "\x1B[36m";
1673
- function bold(s) {
1674
- return `${boldCode}${s}${reset}`;
1675
- }
1676
- function dim(s) {
1677
- return `${dimCode}${s}${reset}`;
1857
+ // src/utils/format.ts
1858
+ var MAX_FRACTION_DIGITS = 100;
1859
+ function formatCost(usd) {
1860
+ if (usd === void 0) return "\u2014";
1861
+ if (usd === 0) return "$0.00";
1862
+ if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
1863
+ const digits = Math.min(
1864
+ MAX_FRACTION_DIGITS,
1865
+ Math.max(4, -Math.floor(Math.log10(Math.abs(usd))) + 1)
1866
+ );
1867
+ return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
1678
1868
  }
1679
- function colorScore(value) {
1680
- const pct = Math.round(value * 100);
1681
- const str = `${pct}%`;
1682
- if (value >= 0.8) return `${green}${str}${reset}`;
1683
- if (value >= 0.5) return `${yellow}${str}${reset}`;
1684
- return `${red}${str}${reset}`;
1869
+ function formatDelta(delta, precision = 4) {
1870
+ const sign = delta >= 0 ? "+" : "";
1871
+ return `${sign}${delta.toFixed(precision)}`;
1685
1872
  }
1686
- function consoleReporter(results) {
1687
- if (results.length === 0) {
1688
- console.log("\nNo results to display.\n");
1689
- return;
1690
- }
1691
- const tasks = [...new Set(results.map((r) => r.taskName))];
1692
- const providers = [...new Set(results.map((r) => r.providerId))];
1693
- const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
1694
- const hasCost = scorerNames.includes("cost");
1695
- const hasErrors = results.some((r) => r.error);
1696
- const runsPerCell = Math.max(...results.map((r) => r.run));
1697
- const runLabel = runsPerCell > 1 ? ` (${runsPerCell} runs each)` : "";
1698
- console.log("");
1699
- console.log(` ${bold(`\u2B21 Agent Duelist Results${runLabel}`)}`);
1700
- console.log(` ${dim("\u2500".repeat(70))}`);
1701
- console.log("");
1702
- for (const task of tasks) {
1703
- console.log(` ${bold(`Task: ${task}`)}`);
1704
- const cols = [{ label: "Provider", width: 22, align: "left" }];
1705
- for (const name of scorerNames) {
1706
- if (name === "latency") cols.push({ label: "Latency", width: 10, align: "right" });
1707
- else if (name === "cost") {
1708
- cols.push({ label: "Cost", width: 12, align: "right" });
1709
- cols.push({ label: "Tokens", width: 9, align: "right" });
1710
- } else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
1711
- else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
1712
- else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
1713
- else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
1714
- else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
1715
- else cols.push({ label: name, width: 10, align: "right" });
1716
- }
1717
- if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
1718
- const totalWidth = cols.reduce((sum, c) => sum + c.width + 2, 0);
1719
- console.log(` ${dim(cols.map((c) => pad(c.label, c.width + 2, c.align)).join(""))}`);
1720
- console.log(` ${dim("\u2500".repeat(totalWidth))}`);
1721
- for (const provider of providers) {
1722
- const taskResults = results.filter(
1723
- (r) => r.taskName === task && r.providerId === provider
1724
- );
1725
- const errorResults2 = taskResults.filter((r) => r.error);
1726
- const successResults = taskResults.filter((r) => !r.error);
1727
- if (successResults.length === 0 && errorResults2.length > 0) {
1728
- const cells2 = [pad(provider, 24, "left")];
1729
- for (const name of scorerNames) {
1730
- if (name === "cost") {
1731
- cells2.push(pad("\u2014", 14, "right"));
1732
- cells2.push(pad("\u2014", 11, "right"));
1733
- } else cells2.push(pad("\u2014", cols.find((c) => c.label !== "Provider").width + 2, "right"));
1734
- }
1735
- if (hasErrors) cells2.push(` ${red}FAIL${reset}`);
1736
- console.log(` ${cells2.join("")}`);
1737
- continue;
1738
- }
1739
- const avgScores = averageScores(successResults);
1740
- const avgDetails = averageDetails(successResults);
1741
- const latencyMs = average(successResults.map((r) => r.raw.latencyMs));
1742
- const cells = [pad(provider, 24, "left")];
1743
- for (const name of scorerNames) {
1744
- if (name === "latency") {
1745
- cells.push(pad(latencyMs !== void 0 ? `${Math.round(latencyMs)}ms` : "\u2014", 12, "right"));
1746
- } else if (name === "cost") {
1747
- cells.push(pad(formatCost(avgDetails.costUsd), 14, "right"));
1748
- cells.push(pad(avgDetails.totalTokens !== void 0 ? `${avgDetails.totalTokens}` : "\u2014", 11, "right"));
1749
- } else {
1750
- const val = avgScores[name];
1751
- if (val === void 0) cells.push(pad("\u2014", 10, "right"));
1752
- else cells.push(pad(colorScore(val), 10 + colorLen(colorScore(val)), "right"));
1753
- }
1754
- }
1755
- if (hasErrors) {
1756
- const failCount = errorResults2.length;
1757
- cells.push(failCount > 0 ? ` ${yellow}${failCount} err${reset}` : ` ${green}OK${reset}`);
1758
- }
1759
- console.log(` ${cells.join("")}`);
1873
+
1874
+ // src/reporter/shared.ts
1875
+ function groupResults(results) {
1876
+ const taskSet = /* @__PURE__ */ new Set();
1877
+ const providerSet = /* @__PURE__ */ new Set();
1878
+ const scorerSet = /* @__PURE__ */ new Set();
1879
+ const grouped = /* @__PURE__ */ new Map();
1880
+ const byProvider = /* @__PURE__ */ new Map();
1881
+ let hasErrors = false;
1882
+ let maxRun = 0;
1883
+ for (const r of results) {
1884
+ taskSet.add(r.taskName);
1885
+ providerSet.add(r.providerId);
1886
+ for (const s of r.scores) scorerSet.add(s.name);
1887
+ if (r.error) hasErrors = true;
1888
+ if (r.run > maxRun) maxRun = r.run;
1889
+ const key = `${r.taskName}::${r.providerId}`;
1890
+ let group = grouped.get(key);
1891
+ if (!group) {
1892
+ group = [];
1893
+ grouped.set(key, group);
1760
1894
  }
1761
- console.log("");
1762
- }
1763
- printSummary(results, providers);
1764
- const errorResults = results.filter((r) => r.error);
1765
- if (errorResults.length > 0) {
1766
- console.log(` ${bold("Errors")}`);
1767
- console.log(` ${dim("\u2500".repeat(70))}`);
1768
- const seen = /* @__PURE__ */ new Set();
1769
- for (const r of errorResults) {
1770
- const key = `${r.providerId}::${r.error}`;
1771
- if (seen.has(key)) continue;
1772
- seen.add(key);
1773
- const count = errorResults.filter((e) => e.providerId === r.providerId && e.error === r.error).length;
1774
- const suffix = count > 1 ? ` (\xD7${count})` : "";
1775
- console.log(` ${red}\u2717${reset} ${r.providerId}: ${r.error}${suffix}`);
1776
- const hint = apiKeyHint(r.providerId, r.error ?? "");
1777
- if (hint) console.log(` ${dim(hint)}`);
1895
+ group.push(r);
1896
+ let provGroup = byProvider.get(r.providerId);
1897
+ if (!provGroup) {
1898
+ provGroup = [];
1899
+ byProvider.set(r.providerId, provGroup);
1778
1900
  }
1779
- console.log("");
1780
- }
1781
- if (hasCost) {
1782
- console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
1783
- console.log("");
1901
+ provGroup.push(r);
1784
1902
  }
1903
+ return {
1904
+ tasks: [...taskSet],
1905
+ providers: [...providerSet],
1906
+ scorerNames: [...scorerSet],
1907
+ grouped,
1908
+ byProvider,
1909
+ hasErrors,
1910
+ maxRun
1911
+ };
1785
1912
  }
1786
- function printSummary(results, providers) {
1787
- const successResults = results.filter((r) => !r.error);
1788
- if (successResults.length === 0) return;
1789
- console.log(` ${dim("\u2500".repeat(70))}`);
1790
- console.log(` ${bold("Summary")}`);
1791
- console.log("");
1792
- const single = providers.length === 1;
1793
- const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
1794
- const byCorrectness = rankProviders(successResults, providers, correctnessKey);
1795
- if (byCorrectness) {
1796
- const label = single ? "Avg correctness" : `Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))}`;
1797
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${colorScore(byCorrectness.avg)})`);
1798
- }
1799
- const byLatency = providers.map((id) => {
1800
- const runs = successResults.filter((r) => r.providerId === id);
1801
- const avg = average(runs.map((r) => r.raw.latencyMs));
1802
- return { id, avg: avg ?? Infinity };
1803
- }).sort((a, b) => a.avg - b.avg)[0];
1804
- if (byLatency && byLatency.avg !== Infinity) {
1805
- const label = single ? "Avg latency" : `Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}`;
1806
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${Math.round(byLatency.avg)}ms)`);
1807
- }
1808
- const byCost = providers.map((id) => {
1809
- const runs = successResults.filter((r) => r.providerId === id);
1810
- const costs = runs.map((r) => {
1811
- const s = r.scores.find((s2) => s2.name === "cost");
1812
- return s && s.value >= 0 ? s.value : void 0;
1813
- }).filter((c) => c !== void 0);
1814
- const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
1815
- return { id, avg };
1816
- }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
1817
- if (byCost?.avg !== void 0) {
1818
- const label = single ? "Avg cost" : `Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}`;
1819
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${formatCost(byCost.avg)})`);
1913
+ function aggregateProviderTask(providerId, grouped, task) {
1914
+ const taskResults = grouped.get(`${task}::${providerId}`) ?? [];
1915
+ const errorResults = taskResults.filter((r) => r.error);
1916
+ const successResults = taskResults.filter((r) => !r.error);
1917
+ if (successResults.length === 0) {
1918
+ return {
1919
+ providerId,
1920
+ avgScores: {},
1921
+ avgDetails: { costUsd: void 0, totalTokens: void 0 },
1922
+ latencyMs: void 0,
1923
+ allErrors: errorResults.length > 0,
1924
+ errorCount: errorResults.length
1925
+ };
1820
1926
  }
1821
- console.log("");
1822
- }
1823
- function rankProviders(results, providers, scorerName) {
1824
- const ranked = providers.map((id) => {
1825
- const runs = results.filter((r) => r.providerId === id);
1826
- const scores = runs.flatMap((r) => r.scores.filter((s) => s.name === scorerName && s.value >= 0)).map((s) => s.value);
1827
- const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
1828
- return { id, avg };
1829
- }).filter((p) => p.avg !== void 0).sort((a, b) => b.avg - a.avg);
1830
- return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
1927
+ return {
1928
+ providerId,
1929
+ avgScores: averageScores(successResults),
1930
+ avgDetails: averageDetails(successResults),
1931
+ latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
1932
+ allErrors: false,
1933
+ errorCount: errorResults.length
1934
+ };
1831
1935
  }
1832
1936
  function averageScores(results) {
1833
1937
  const sums = {};
@@ -1871,38 +1975,89 @@ function average(nums) {
1871
1975
  if (nums.length === 0) return void 0;
1872
1976
  return nums.reduce((a, b) => a + b, 0) / nums.length;
1873
1977
  }
1874
- function formatCost(usd) {
1875
- if (usd === void 0) return "\u2014";
1876
- if (usd === 0) return "$0.00";
1877
- if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
1878
- const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
1879
- return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
1978
+ function computeColumnStats(providerData, scorerNames) {
1979
+ const stats = /* @__PURE__ */ new Map();
1980
+ const valid = providerData.filter((p) => !p.allErrors);
1981
+ if (scorerNames.includes("latency")) {
1982
+ const values = /* @__PURE__ */ new Map();
1983
+ for (const p of providerData) {
1984
+ values.set(p.providerId, p.allErrors ? void 0 : p.latencyMs);
1985
+ }
1986
+ const nums = valid.map((p) => p.latencyMs).filter((v) => v !== void 0);
1987
+ stats.set("latency", {
1988
+ values,
1989
+ best: nums.length > 0 ? Math.min(...nums) : void 0,
1990
+ worst: nums.length > 0 ? Math.max(...nums) : void 0
1991
+ });
1992
+ }
1993
+ if (scorerNames.includes("cost")) {
1994
+ const costValues = /* @__PURE__ */ new Map();
1995
+ const tokenValues = /* @__PURE__ */ new Map();
1996
+ for (const p of providerData) {
1997
+ costValues.set(p.providerId, p.allErrors ? void 0 : p.avgDetails.costUsd);
1998
+ tokenValues.set(p.providerId, p.allErrors ? void 0 : p.avgDetails.totalTokens);
1999
+ }
2000
+ const costNums = valid.map((p) => p.avgDetails.costUsd).filter((v) => v !== void 0);
2001
+ const tokenNums = valid.map((p) => p.avgDetails.totalTokens).filter((v) => v !== void 0);
2002
+ stats.set("cost", {
2003
+ values: costValues,
2004
+ best: costNums.length > 0 ? Math.min(...costNums) : void 0,
2005
+ worst: costNums.length > 0 ? Math.max(...costNums) : void 0
2006
+ });
2007
+ stats.set("tokens", {
2008
+ values: tokenValues,
2009
+ best: tokenNums.length > 0 ? Math.min(...tokenNums) : void 0,
2010
+ worst: tokenNums.length > 0 ? Math.max(...tokenNums) : void 0
2011
+ });
2012
+ }
2013
+ for (const name of scorerNames) {
2014
+ if (name === "latency" || name === "cost") continue;
2015
+ const values = /* @__PURE__ */ new Map();
2016
+ for (const p of providerData) {
2017
+ values.set(p.providerId, p.allErrors ? void 0 : p.avgScores[name]);
2018
+ }
2019
+ const nums = valid.map((p) => p.avgScores[name]).filter((v) => v !== void 0);
2020
+ stats.set(name, {
2021
+ values,
2022
+ best: nums.length > 0 ? Math.max(...nums) : void 0,
2023
+ worst: nums.length > 0 ? Math.min(...nums) : void 0
2024
+ });
2025
+ }
2026
+ return stats;
1880
2027
  }
1881
- function pad(str, width, align) {
1882
- if (align === "right") return str.padStart(width);
1883
- return str.padEnd(width);
1884
- }
1885
- function colorLen(str) {
1886
- const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
1887
- return str.length - stripped.length;
1888
- }
1889
- function apiKeyHint(providerId, error) {
1890
- const lower = error.toLowerCase();
1891
- const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
1892
- if (!isAuthError) return void 0;
1893
- const prefix = providerId.split("/")[0];
1894
- switch (prefix) {
1895
- case "openai":
1896
- return "Set: export OPENAI_API_KEY=sk-...";
1897
- case "azure":
1898
- return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
1899
- case "anthropic":
1900
- return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
1901
- case "google":
1902
- return "Set: export GOOGLE_API_KEY=...";
1903
- default:
1904
- return `Check the API key for ${providerId}`;
1905
- }
2028
+ function computeMedals(columnStats, providerIds) {
2029
+ const medals = /* @__PURE__ */ new Map();
2030
+ if (providerIds.length < 2) {
2031
+ for (const id of providerIds) medals.set(id, "none");
2032
+ return medals;
2033
+ }
2034
+ const wins = /* @__PURE__ */ new Map();
2035
+ for (const id of providerIds) wins.set(id, 0);
2036
+ for (const [, colStats] of columnStats) {
2037
+ if (colStats.best === void 0) continue;
2038
+ const bestProviders = [...colStats.values.entries()].filter(([, v]) => v !== void 0 && v === colStats.best);
2039
+ if (bestProviders.length === 1) {
2040
+ wins.set(bestProviders[0][0], (wins.get(bestProviders[0][0]) ?? 0) + 1);
2041
+ }
2042
+ }
2043
+ const totalWins = [...wins.values()].reduce((a, b) => a + b, 0);
2044
+ if (totalWins === 0) {
2045
+ for (const id of providerIds) medals.set(id, "none");
2046
+ return medals;
2047
+ }
2048
+ const sorted = [...wins.entries()].sort(
2049
+ (a, b) => b[1] - a[1] || a[0].localeCompare(b[0])
2050
+ );
2051
+ const medalList = ["gold", "silver", "bronze"];
2052
+ let rank = 0;
2053
+ for (let i = 0; i < sorted.length; i++) {
2054
+ if (i > 0 && sorted[i][1] < sorted[i - 1][1]) {
2055
+ rank = i;
2056
+ }
2057
+ const hasWins = sorted[i][1] > 0;
2058
+ medals.set(sorted[i][0], hasWins && rank < medalList.length ? medalList[rank] : "none");
2059
+ }
2060
+ return medals;
1906
2061
  }
1907
2062
  function providerLabel(providerId) {
1908
2063
  const prefix = providerId.split("/")[0];
@@ -1955,6 +2110,369 @@ function providerLabel(providerId) {
1955
2110
  return `(${prefix})`;
1956
2111
  }
1957
2112
  }
2113
/**
 * Suggests how to fix an authentication failure for a provider.
 *
 * @param {string} providerId Provider id of the form "<vendor>/<model>".
 * @param {string} error Error message reported for the provider.
 * @returns {string|undefined} A hint, or undefined when the message does not
 *   look like an authentication problem.
 */
function apiKeyHint(providerId, error) {
  const message = error.toLowerCase();
  const authMarkers = [
    "api key",
    "401",
    "unauthorized",
    "authentication",
    "incorrect api key",
    "apikey"
  ];
  if (!authMarkers.some((marker) => message.includes(marker))) {
    return undefined;
  }
  const [vendor] = providerId.split("/");
  if (vendor === "openai") {
    return "Set: export OPENAI_API_KEY=sk-...";
  }
  if (vendor === "azure") {
    return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
  }
  if (vendor === "anthropic") {
    return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
  }
  if (vendor === "google") {
    return "Set: export GOOGLE_API_KEY=...";
  }
  return `Check the API key for ${providerId}`;
}
2131
/**
 * Finds the provider with the highest average for one scorer.
 *
 * Only scores named `scorerName` with a non-negative value are averaged.
 * Providers without any such score are ignored; on equal averages the
 * provider listed first in `providers` wins.
 *
 * @param {Map<string, Array<{scores: Array<{name: string, value: number}>}>>} successByProvider
 * @param {string[]} providers
 * @param {string} scorerName
 * @returns {{id: string, avg: number}|undefined}
 */
function rankProviders(successByProvider, providers, scorerName) {
  let leader;
  for (const id of providers) {
    const runs = successByProvider.get(id) ?? [];
    let total = 0;
    let count = 0;
    for (const run of runs) {
      for (const score of run.scores) {
        if (score.name === scorerName && score.value >= 0) {
          total += score.value;
          count += 1;
        }
      }
    }
    if (count === 0) continue;
    const avg = total / count;
    if (leader === undefined || avg > leader.avg) {
      leader = { id, avg };
    }
  }
  return leader;
}
2140
/**
 * Maps a built-in scorer name to its short column heading.
 *
 * @param {string} name Scorer name.
 * @returns {string} The short label, or `name` itself when no label is defined.
 */
function scorerLabel(name) {
  const shortLabels = new Map([
    ["correctness", "Match"],
    ["schema-correctness", "Schema"],
    ["fuzzy-similarity", "Fuzzy"],
    ["llm-judge-correctness", "Judge"],
    ["tool-usage", "Tool"]
  ]);
  return shortLabels.get(name) ?? name;
}
2156
/**
 * Returns the emoji for a medal name.
 *
 * @param {string} medal "gold", "silver", "bronze" or "none".
 * @returns {string|undefined} The medal emoji, "" for "none", and undefined
 *   for any other input.
 */
function medalEmoji(medal) {
  const emojiByMedal = new Map([
    ["gold", "\u{1F947}"],
    ["silver", "\u{1F948}"],
    ["bronze", "\u{1F949}"],
    ["none", ""]
  ]);
  return emojiByMedal.get(medal);
}
2168
+
2169
+ // src/reporter/console.ts
2170
// ANSI SGR escape sequences used to style console output.
// `reset` (SGR 0) clears every attribute set by the codes below.
+ var reset = "\x1B[0m";
2171
// Text attributes: bold (SGR 1) and dim/faint (SGR 2).
+ var boldCode = "\x1B[1m";
2172
+ var dimCode = "\x1B[2m";
2173
// Standard foreground colours (SGR 31-36).
+ var green = "\x1B[32m";
2174
+ var red = "\x1B[31m";
2175
+ var yellow = "\x1B[33m";
2176
+ var cyan = "\x1B[36m";
2177
// Bright foreground colours (SGR 92 and 97).
+ var brightGreen = "\x1B[92m";
2178
+ var brightWhite = "\x1B[97m";
2179
/**
 * Wraps `s` in the bold escape sequence followed by a reset.
 *
 * @param {string} s Text to emphasise.
 * @returns {string}
 */
function bold(s) {
  return boldCode.concat(s, reset);
}
2182
/**
 * Wraps `s` in the dim escape sequence followed by a reset.
 *
 * @param {string} s Text to de-emphasise.
 * @returns {string}
 */
function dim(s) {
  return dimCode.concat(s, reset);
}
2185
/**
 * Removes ANSI SGR sequences (ESC `[` ... `m`) from a string.
 *
 * @param {string} s Possibly styled text.
 * @returns {string} The text without colour/attribute escape sequences.
 */
function stripAnsi(s) {
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  return s.replace(sgrSequence, "");
}
2188
/**
 * Estimates how many terminal columns a string occupies.
 *
 * ANSI styling is ignored. Code points at or above U+1F000, and those in
 * U+2600..U+27BF, are counted as two columns; everything else as one.
 *
 * @param {string} s Text, possibly containing ANSI escape sequences.
 * @returns {number}
 */
function displayWidth(s) {
  return [...stripAnsi(s)].reduce((columns, ch) => {
    const codePoint = ch.codePointAt(0) ?? 0;
    const isWide = codePoint >= 0x1f000 || (codePoint >= 0x2600 && codePoint <= 0x27bf);
    return columns + (isWide ? 2 : 1);
  }, 0);
}
2199
/**
 * Pads a cell with spaces up to `targetWidth` display columns.
 *
 * @param {string} str Cell content (may contain ANSI styling).
 * @param {number} targetWidth Desired width in terminal columns.
 * @param {string} align "right" pads on the left; any other value pads on the right.
 * @returns {string} The padded cell; content wider than the target is returned unpadded.
 */
function padCell(str, targetWidth, align) {
  const missing = Math.max(0, targetWidth - displayWidth(str));
  const gap = " ".repeat(missing);
  return align === "right" ? gap + str : str + gap;
}
2205
/**
 * Builds the two halves of a fixed-width progress bar.
 *
 * @param {number} ratio Fill fraction; values outside [0, 1] are clamped.
 *   A ratio that is not a number (NaN, undefined) is drawn as an empty bar,
 *   so the bar always spans `width` characters and table columns stay aligned.
 * @param {number} [width=8] Total bar length in characters.
 * @returns {{fill: string, track: string}} Filled part and remaining track.
 */
function sparkBar(ratio, width = 8) {
  const numeric = Number(ratio);
  // NaN would otherwise flow into repeat() and produce two empty strings.
  const safeRatio = Number.isNaN(numeric) ? 0 : numeric;
  const clamped = Math.max(0, Math.min(1, safeRatio));
  const fillLen = Math.round(clamped * width);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(width - fillLen);
  return { fill, track };
}
2212
/**
 * Draws one horizontal rule of the box-drawing table, dimmed.
 *
 * @param {number[]} widths Content width of each column (excluding the
 *   one-space padding on either side).
 * @param {string} position "top", "bottom", "merge" (rule without column
 *   junctions, used above a spanning row) or anything else for a rule with
 *   cross junctions between columns.
 * @returns {string}
 */
function drawTableLine(widths, position) {
  const rule = (length) => "\u2500".repeat(length);
  if (position === "bottom" || position === "merge") {
    // Padded column widths plus one character per inner separator.
    const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
    const [left, right] = position === "bottom" ? ["\u2514", "\u2518"] : ["\u251C", "\u2524"];
    return dim(`${left}${rule(totalInner)}${right}`);
  }
  const segments = widths.map((w) => rule(w + 2));
  return position === "top"
    ? dim(`\u250C${segments.join("\u252C")}\u2510`)
    : dim(`\u251C${segments.join("\u253C")}\u2524`);
}
2226
/**
 * Draws one table row: each cell padded to its column width, surrounded by a
 * space on both sides and separated by dimmed vertical bars.
 *
 * @param {string[]} cells Cell contents (may contain ANSI styling).
 * @param {number[]} widths Column widths, index-aligned with `cells`.
 * @param {string[]} aligns Column alignments, index-aligned with `cells`.
 * @returns {string}
 */
function drawTableRow(cells, widths, aligns) {
  const separator = dim("\u2502");
  const padded = cells.map(
    (cell, column) => ` ${padCell(cell, widths[column], aligns[column])} `
  );
  return separator + padded.join(separator) + separator;
}
2232
/**
 * Draws a row whose single cell spans every column of the table.
 *
 * @param {string} content Text to show (may contain ANSI styling).
 * @param {number[]} widths Column widths of the table being spanned.
 * @returns {string}
 */
function drawSpanRow(content, widths) {
  const innerWidth = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
  // One column is taken by the leading space before the content.
  const filler = " ".repeat(Math.max(0, innerWidth - displayWidth(content) - 1));
  const edge = dim("\u2502");
  return `${edge} ${content}${filler}${edge}`;
}
2238
/**
 * Colours a cell according to where its value sits within the column.
 *
 * @param {string} text Already formatted cell text.
 * @param {number|undefined} value Raw value behind `text`.
 * @param {{best: (number|undefined), worst: (number|undefined)}} colStats
 * @param {number} providerCount Number of providers being compared.
 * @returns {string} A dimmed dash when `value` is undefined; `text` unchanged
 *   when no ranking is possible (fewer than two providers, missing stats, or
 *   best equal to worst); otherwise bright green/bold for the best value, red
 *   for the worst and yellow for anything in between.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === undefined) return dim("\u2014");
  const rankable =
    !(providerCount < 2) &&
    colStats.best !== undefined &&
    colStats.worst !== undefined &&
    colStats.best !== colStats.worst;
  if (!rankable) return text;
  if (value === colStats.best) return `${brightGreen}${boldCode}${text}${reset}`;
  const tint = value === colStats.worst ? red : yellow;
  return `${tint}${text}${reset}`;
}
2247
/**
 * Prints benchmark results to stdout.
 *
 * Output consists of a banner, one box-drawn table per task (one row per
 * provider, one column per scorer, plus a "Status" column when any run
 * failed), the cross-task summary, a de-duplicated error list and, when the
 * cost scorer is active, a note about the pricing source.
 *
 * @param {Array<object>} results Benchmark results as consumed by `groupResults`.
 * @param {{sparklines?: boolean}} [options] `sparklines` (default true) adds a
 *   small bar next to every percentage score.
 * @returns {void}
 */
function consoleReporter(results, options) {
  const showSparklines = options?.sparklines ?? true;
  if (results.length === 0) {
    console.log("\nNo results to display.\n");
    return;
  }
  const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runLabel = maxRun > 1 ? ` ${dim(`(${maxRun} runs each)`)}` : "";
  // Absolute colour thresholds used for score bars and single-provider runs.
  const thresholdColor = (score) => score >= 0.8 ? green : score >= 0.5 ? yellow : red;

  // The column layout depends only on providers and scorers, so build it once
  // instead of once per task.
  const maxProviderLen = Math.max(...providers.map((id) => id.length));
  const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
  const cols = [
    { label: "Provider", width: providerWidth, align: "left" }
  ];
  for (const name of scorerNames) {
    if (name === "latency") {
      cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
    } else if (name === "cost") {
      // The cost scorer contributes two columns: the price and the token count.
      cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
      cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
    } else {
      cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
    }
  }
  if (hasErrors) {
    cols.push({ label: "Status", width: 8, align: "left" });
  }
  const widths = cols.map((c) => c.width);
  const aligns = cols.map((c) => c.align);
  const headerCells = cols.map((c) => bold(c.label));

  console.log("");
  console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");

  for (const task of tasks) {
    console.log(` ${bold(`Task: ${task}`)}`);
    console.log("");
    const providerData = providers.map(
      (providerId) => aggregateProviderTask(providerId, grouped, task)
    );
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    // Rank-colours a metric when column statistics exist, else leaves it plain.
    const ranked = (text, value, colStats) =>
      colStats ? colorByRank(text, value, colStats, providers.length) : text;

    console.log(` ${drawTableLine(widths, "top")}`);
    console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
    console.log(` ${drawTableLine(widths, "header")}`);

    for (const pd of providerData) {
      const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
      const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
      const cells = [providerCell];
      for (const col of cols.slice(1)) {
        if (pd.allErrors) {
          // Every run failed: no metrics to show, only the failure status.
          cells.push(col.label === "Status" ? `${red}FAIL${reset}` : dim("\u2014"));
          continue;
        }
        if (col.label === "Status") {
          cells.push(
            pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
          );
          continue;
        }
        const { statsKey } = col;
        const colStats = columnStats.get(statsKey);
        if (statsKey === "latency") {
          const ms = pd.latencyMs;
          cells.push(ms === undefined ? dim("\u2014") : ranked(`${Math.round(ms)}ms`, ms, colStats));
        } else if (statsKey === "cost") {
          const cost = pd.avgDetails.costUsd;
          cells.push(cost === undefined ? dim("\u2014") : ranked(formatCost(cost), cost, colStats));
        } else if (statsKey === "tokens") {
          const tokens = pd.avgDetails.totalTokens;
          cells.push(tokens === undefined ? dim("\u2014") : ranked(`${tokens}`, tokens, colStats));
        } else {
          const val = pd.avgScores[statsKey];
          if (val === undefined) {
            cells.push(dim("\u2014"));
            continue;
          }
          const pctStr = `${Math.round(val * 100)}%`.padStart(4);
          // With several providers colour by rank; otherwise by absolute score.
          const coloredPct = multi && colStats
            ? colorByRank(pctStr, val, colStats, providers.length)
            : `${thresholdColor(val)}${pctStr}${reset}`;
          if (showSparklines) {
            const { fill, track } = sparkBar(val);
            cells.push(`${coloredPct} ${thresholdColor(val)}${fill}${reset}${dim(track)}`);
          } else {
            cells.push(coloredPct);
          }
        }
      }
      console.log(` ${drawTableRow(cells, widths, aligns)}`);
    }

    if (multi && providerData.some((p) => !p.allErrors)) {
      const winnerId = [...medals.entries()].find(([, m]) => m === "gold")?.[0];
      if (winnerId) {
        console.log(` ${drawTableLine(widths, "merge")}`);
        const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
        console.log(` ${drawSpanRow(winnerText, widths)}`);
      }
    }
    console.log(` ${drawTableLine(widths, "bottom")}`);
    console.log("");
  }

  printSummary(results, providers, byProvider);

  // Reuse the shared de-duplication helper: one pass over the failures rather
  // than re-counting matching errors for every printed row.
  const uniqueErrors = dedupeErrors(results.filter((r) => r.error));
  if (uniqueErrors.length > 0) {
    console.log(` ${bold("Errors")}`);
    console.log(` ${dim("\u2501".repeat(72))}`);
    for (const { providerId, error, count, hint } of uniqueErrors) {
      const suffix = count > 1 ? ` (\xD7${count})` : "";
      console.log(` ${red}\u2716${reset} ${providerId}: ${error}${suffix}`);
      if (hint) console.log(` ${dim(hint)}`);
    }
    console.log("");
  }

  if (hasCost) {
    console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
    console.log("");
  }
}
2401
/**
 * Prints the cross-task summary: best provider for correctness, latency and
 * cost, followed by an overall winner when several providers compete.
 *
 * Only successful runs are considered; nothing is printed when every run
 * failed. With a single provider the lines show that provider's averages
 * instead of a ranking. The overall line reports wins out of the number of
 * categories that actually produced a winner, so a run without cost data is
 * judged out of two rather than three.
 *
 * @param {Array<object>} results All benchmark results.
 * @param {string[]} providers Provider ids in display order.
 * @param {Map<string, Array<object>>} byProvider Results grouped by provider id.
 * @returns {void}
 */
function printSummary(results, providers, byProvider) {
  const successResults = results.filter((r) => !r.error);
  if (successResults.length === 0) return;
  const successByProvider = /* @__PURE__ */ new Map();
  for (const id of providers) {
    successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
  }
  console.log(` ${bold("Summary")}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");
  const single = providers.length === 1;
  // A lone provider gets a neutral marker; a gold medal only makes sense in a contest.
  const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";

  // Prefer the LLM judge when it produced at least one usable score.
  const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
  if (byCorrectness) {
    const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
    if (single) {
      console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
    } else {
      console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
    }
  }

  // Providers without a measurable latency sort last via Infinity.
  const byLatency = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const avg = average(runs.map((r) => r.raw.latencyMs));
    return { id, avg: avg ?? Infinity };
  }).sort((a, b) => a.avg - b.avg)[0];
  const hasLatencyWinner = byLatency !== undefined && byLatency.avg !== Infinity;
  if (hasLatencyWinner) {
    const msStr = `${Math.round(byLatency.avg)}ms`;
    if (single) {
      console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
    } else {
      console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
    }
  }

  // Negative cost scores are excluded from the average.
  const byCost = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const costs = runs.map((r) => {
      const s = r.scores.find((s2) => s2.name === "cost");
      return s && s.value >= 0 ? s.value : void 0;
    }).filter((c) => c !== void 0);
    const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
    return { id, avg };
  }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
  const hasCostWinner = byCost?.avg !== void 0;
  if (hasCostWinner) {
    const costStr = formatCost(byCost.avg);
    if (single) {
      console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
    } else {
      console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
    }
  }

  if (!single) {
    const wins = /* @__PURE__ */ new Map();
    for (const id of providers) wins.set(id, 0);
    if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
    if (hasLatencyWinner) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
    if (hasCostWinner) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
    // Only categories that produced a winner count towards the denominator.
    const categoryCount = [Boolean(byCorrectness), hasLatencyWinner, hasCostWinner].filter(Boolean).length;
    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
      console.log("");
      if (topProviders.length === 1) {
        const [winnerId, winCount] = topProviders[0];
        console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/${categoryCount} categories)`)}`);
      } else {
        const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
        console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/${categoryCount})`)}`);
      }
    }
  }
  console.log("");
}
1958
2476
 
1959
2477
  // src/reporter/json.ts
1960
2478
  function jsonReporter(results) {
@@ -1989,7 +2507,7 @@ function defineArena(config) {
1989
2507
  throw new Error("At least one task is required");
1990
2508
  }
1991
2509
  const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
1992
- const scorerFns = resolveScorers(scorerNames, config.judgeModel);
2510
+ const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
1993
2511
  const runs = config.runs ?? 1;
1994
2512
  return {
1995
2513
  config,
@@ -1999,141 +2517,13 @@ function defineArena(config) {
1999
2517
  tasks: config.tasks,
2000
2518
  scorers: scorerFns,
2001
2519
  runs,
2520
+ timeout: config.timeout,
2002
2521
  onResult: options?.onResult
2003
2522
  });
2004
2523
  }
2005
2524
  };
2006
2525
  }
2007
2526
 
2008
- // src/providers/openai.ts
2009
- var import_openai2 = __toESM(require("openai"), 1);
2010
- var import_zod_to_json_schema = require("zod-to-json-schema");
2011
- function openai(model, options) {
2012
- const client = new import_openai2.default({
2013
- apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
2014
- baseURL: options?.baseURL
2015
- });
2016
- return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
2017
- }
2018
- function openaiCompatible(options) {
2019
- const apiKey = options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0) ?? "no-key";
2020
- const client = new import_openai2.default({
2021
- apiKey,
2022
- baseURL: options.baseURL
2023
- });
2024
- if (options.free) {
2025
- registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
2026
- }
2027
- return makeProvider(options.id, options.name, options.model, client, options.model, options.stripThinking);
2028
- }
2029
- function azureOpenai(model, options) {
2030
- const deployment = options?.deployment ?? model;
2031
- const client = new import_openai2.AzureOpenAI({
2032
- apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
2033
- endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
2034
- apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
2035
- deployment
2036
- });
2037
- return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
2038
- }
2039
- function makeProvider(id, name, model, client, requestModel, stripThinking) {
2040
- return {
2041
- id,
2042
- name,
2043
- model,
2044
- async run(input) {
2045
- const start = Date.now();
2046
- const params = {
2047
- model: requestModel,
2048
- messages: [{ role: "user", content: input.prompt }]
2049
- };
2050
- if (input.schema) {
2051
- params.response_format = { type: "json_object" };
2052
- params.messages = [
2053
- { role: "system", content: "Respond with valid JSON matching the requested schema." },
2054
- ...params.messages
2055
- ];
2056
- }
2057
- if (input.tools?.length) {
2058
- params.tools = input.tools.map(toolDefToOpenAI);
2059
- params.tool_choice = "auto";
2060
- }
2061
- const response = await client.chat.completions.create(params);
2062
- let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
2063
- let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
2064
- const choice = response.choices[0];
2065
- const toolCallsRaw = choice?.message?.tool_calls;
2066
- const collectedToolCalls = [];
2067
- let finalResponse = response;
2068
- if (toolCallsRaw?.length && input.tools?.length) {
2069
- const toolMessages = [
2070
- ...params.messages,
2071
- choice.message
2072
- ];
2073
- for (const tc of toolCallsRaw) {
2074
- const toolDef = input.tools.find((t) => t.name === tc.function.name);
2075
- let args;
2076
- try {
2077
- args = JSON.parse(tc.function.arguments);
2078
- } catch {
2079
- args = tc.function.arguments;
2080
- }
2081
- let result;
2082
- if (toolDef?.handler) {
2083
- result = await toolDef.handler(args);
2084
- }
2085
- collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
2086
- toolMessages.push({
2087
- role: "tool",
2088
- tool_call_id: tc.id,
2089
- content: JSON.stringify(result ?? {})
2090
- });
2091
- }
2092
- const followUp = await client.chat.completions.create({
2093
- model: requestModel,
2094
- messages: toolMessages
2095
- });
2096
- totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
2097
- totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
2098
- finalResponse = followUp;
2099
- }
2100
- const latencyMs = Date.now() - start;
2101
- const finalChoice = finalResponse.choices[0];
2102
- let rawContent = finalChoice?.message?.content ?? "";
2103
- if (stripThinking) {
2104
- rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
2105
- }
2106
- let output = rawContent;
2107
- if (input.schema) {
2108
- try {
2109
- output = JSON.parse(rawContent);
2110
- } catch {
2111
- }
2112
- }
2113
- return {
2114
- output,
2115
- usage: {
2116
- promptTokens: totalPromptTokens || void 0,
2117
- completionTokens: totalCompletionTokens || void 0
2118
- },
2119
- latencyMs,
2120
- raw: finalResponse,
2121
- toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
2122
- };
2123
- }
2124
- };
2125
- }
2126
- function toolDefToOpenAI(tool) {
2127
- return {
2128
- type: "function",
2129
- function: {
2130
- name: tool.name,
2131
- description: tool.description,
2132
- parameters: (0, import_zod_to_json_schema.zodToJsonSchema)(tool.parameters, { target: "openAi" })
2133
- }
2134
- };
2135
- }
2136
-
2137
2527
  // src/providers/anthropic.ts
2138
2528
  var import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
2139
2529
  function anthropic(model, options) {
@@ -2147,23 +2537,17 @@ function anthropic(model, options) {
2147
2537
  model,
2148
2538
  async run(input) {
2149
2539
  const start = Date.now();
2150
- const systemMessage = input.schema ? "Respond with valid JSON matching the requested schema." : void 0;
2540
+ const systemMessage = input.schema ? SCHEMA_SYSTEM_MESSAGE : void 0;
2151
2541
  const response = await client.messages.create({
2152
2542
  model,
2153
2543
  max_tokens: maxTokens,
2154
2544
  system: systemMessage,
2155
2545
  messages: [{ role: "user", content: input.prompt }]
2156
- });
2546
+ }, { signal: input.signal });
2157
2547
  const latencyMs = Date.now() - start;
2158
2548
  const textBlock = response.content.find((b) => b.type === "text");
2159
2549
  const rawContent = textBlock?.type === "text" ? textBlock.text : "";
2160
- let output = rawContent;
2161
- if (input.schema) {
2162
- try {
2163
- output = JSON.parse(rawContent);
2164
- } catch {
2165
- }
2166
- }
2550
+ const output = parseSchemaOutput(rawContent, !!input.schema);
2167
2551
  return {
2168
2552
  output,
2169
2553
  usage: {
@@ -2177,31 +2561,1025 @@ function anthropic(model, options) {
2177
2561
  };
2178
2562
  }
2179
2563
 
2180
- // src/providers/gemini.ts
2181
- var import_openai3 = __toESM(require("openai"), 1);
2182
- function gemini(model, options) {
2183
- const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
2184
- if (!apiKey) {
2185
- throw new Error(
2186
- `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
2187
- );
2564
+ // src/reporter/markdown.ts
2565
// Hidden HTML comment emitted as the first line of the Markdown report built
// by markdownReporter. NOTE(review): presumably used to find and update an
// existing PR comment — confirm against upsertPrComment.
+ var COMMENT_MARKER = "<!-- duelist-ci-report -->";
2566
/**
 * Renders a CI report as Markdown.
 *
 * The document starts with the hidden comment marker and a pass/fail heading,
 * then adds, each only when applicable: the baseline comparison table, the
 * cost summary, the list of flaky results and the list of failure reasons.
 * It always ends with a horizontal rule and an attribution line.
 *
 * @param {object} report CI report with `failed`, `comparisons`, `cost`,
 *   `flakyResults` and `failureReasons`.
 * @param {*} _current Unused.
 * @returns {string} Markdown text.
 */
function markdownReporter(report, _current) {
  const status = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${status}`, ""];

  if (report.comparisons.length > 0) {
    out.push(markdownComparisonTable(report.comparisons), "");
  }

  const { cost } = report;
  if (cost.totalUsd > 0 || cost.budget !== undefined) {
    out.push(markdownCostSummary(cost), "");
  }

  if (report.flakyResults.length > 0) {
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      ""
    );
    for (const flaky of report.flakyResults) {
      out.push(`- **${flaky.providerId}** \xD7 ${flaky.taskName} \u2192 ${flaky.scorerName} (CV = ${flaky.current.cv.toFixed(2)})`);
    }
    out.push("");
  }

  if (report.failureReasons.length > 0) {
    out.push("### Failure Reasons", "");
    for (const reason of report.failureReasons) {
      out.push(`- ${reason}`);
    }
    out.push("");
  }

  out.push("---", "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
  return out.join("\n");
}
2601
/**
 * Renders baseline-vs-current comparisons as a Markdown table.
 *
 * Provider, task and scorer names are escaped before being placed in a cell:
 * a literal `|` would otherwise start a new column and a line break would end
 * the row, corrupting the table.
 *
 * @param {Array<object>} comparisons Entries with `providerId`, `taskName`,
 *   `scorerName`, `baseline` (stats or null), `current` (stats) and `delta`
 *   (number or null).
 * @returns {string} The table: header, separator and one row per comparison.
 */
function markdownComparisonTable(comparisons) {
  // Backslashes are doubled first so an existing "\|" cannot cancel the pipe escape.
  const cell = (value) => String(value).replace(/\\/g, "\\\\").replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const lines = [];
  lines.push("| Provider | Task | Scorer | Baseline | Current | Delta | Status |");
  lines.push("|----------|------|--------|----------|---------|-------|--------|");
  for (const c of comparisons) {
    const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
    const currentStr = formatStats(c.current);
    const deltaStr = c.delta !== null ? formatDelta(c.delta, 3) : "\u2014";
    const status = statusIndicator(c);
    lines.push(`| ${cell(c.providerId)} | ${cell(c.taskName)} | ${cell(c.scorerName)} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
  }
  return lines.join("\n");
}
2614
/**
 * Renders the cost section of the Markdown report.
 *
 * @param {{totalUsd: number, budget?: number, overBudget?: boolean, perProvider: Map<string, number>}} cost
 * @returns {string} Heading and total; a budget line when a budget is set
 *   (usage shown as "∞" for a non-positive budget); and a per-provider table
 *   when more than one provider incurred cost.
 */
function markdownCostSummary(cost) {
  const { totalUsd, budget, perProvider } = cost;
  const out = ["### \u{1F4B0} Cost Summary", "", `**Total:** $${totalUsd.toFixed(4)}`];

  if (budget !== undefined) {
    const usedPct = budget > 0 ? (totalUsd / budget * 100).toFixed(0) : "\u221E";
    const light = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    out.push(`**Budget:** $${budget.toFixed(2)} (${usedPct}% used) ${light}`);
  }

  if (perProvider.size > 1) {
    out.push("", "| Provider | Cost |", "|----------|------|");
    perProvider.forEach((usd, provider) => {
      out.push(`| ${provider} | $${usd.toFixed(4)} |`);
    });
  }
  return out.join("\n");
}
2634
/**
 * Formats summary statistics for a table cell.
 *
 * @param {{n: number, mean: number, ci95Lower?: number, ci95Upper?: number}} stats
 * @returns {string} The mean to three decimals, followed by "± half the
 *   95% interval width" when there is more than one sample.
 */
function formatStats(stats) {
  const mean = stats.mean.toFixed(3);
  if (!(stats.n > 1)) {
    return mean;
  }
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${mean} \xB1 ${halfWidth.toFixed(3)}`;
}
2641
/**
 * Returns the status label for one comparison row.
 *
 * Precedence: regressed, then improved, then "new" when there is no baseline,
 * otherwise unchanged. A missing baseline is detected with a nullish check so
 * that an undefined baseline is labelled "new" rather than "unchanged" while
 * its baseline cell shows a dash.
 *
 * @param {{regressed?: boolean, improved?: boolean, baseline?: (object|null)}} c
 * @returns {string}
 */
function statusIndicator(c) {
  if (c.regressed) return "\u{1F534} regressed";
  if (c.improved) return "\u{1F7E2} improved";
  // `== null` deliberately matches both null and undefined.
  if (c.baseline == null) return "\u{1F195} new";
  return "\u26AA unchanged";
}
2647
+
2648
+ // src/reporter/html.ts
2649
/**
 * Escapes the five HTML-significant characters (& < > " ') so a string can be
 * placed in element content or a quoted attribute value.
 *
 * @param {string} s Raw text.
 * @returns {string} HTML-escaped text.
 */
function esc(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  return s.replace(/[&<>"']/g, (ch) => entities[ch]);
}
2652
// Builds the complete HTML report for a set of benchmark results and returns
// it as one string. An empty result list yields the placeholder page from
// emptyReport(). Otherwise the page contains one section per task, a summary
// of the best provider per category, the de-duplicated errors and a footer.
+ function htmlReporter(results) {
2653
+ if (results.length === 0) {
2654
+ return emptyReport();
2655
+ }
2656
+ const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
2657
+ const hasCost = scorerNames.includes("cost");
2658
+ const multi = providers.length >= 2;
2659
+ const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";
2660
// Per task: aggregate each provider, compute column statistics and medals,
// and pick the gold-medal provider as winner (only when 2+ providers compete).
+ const taskSections = tasks.map((task) => {
2661
+ const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
2662
+ const columnStats = computeColumnStats(providerData, scorerNames);
2663
+ const medals = computeMedals(columnStats, providers);
2664
+ const winnerId = multi ? [...medals.entries()].find(([, m]) => m === "gold")?.[0] : void 0;
2665
+ return { task, providerData, columnStats, medals, winnerId };
2192
2666
  });
2193
- return makeProvider(`google/${model}`, "Google AI", model, client, model);
2667
// Summary rankings are computed over successful runs only.
+ const successResults = results.filter((r) => !r.error);
2668
+ const successByProvider = /* @__PURE__ */ new Map();
2669
+ for (const id of providers) {
2670
+ successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
2671
+ }
2672
// Rank by the LLM judge when it produced at least one non-negative score,
// otherwise by the plain correctness scorer.
+ const correctnessKey = successResults.some(
2673
+ (r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
2674
+ ) ? "llm-judge-correctness" : "correctness";
2675
+ const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
2676
// Lowest average latency wins; a provider with no average sorts last (Infinity).
+ const byLatency = providers.map((id) => {
2677
+ const runs = successByProvider.get(id) ?? [];
2678
+ const avg = average(runs.map((r) => r.raw.latencyMs));
2679
+ return { id, avg: avg ?? Infinity };
2680
+ }).sort((a, b) => a.avg - b.avg)[0];
2681
// Lowest average cost wins; negative cost scores are excluded and providers
// without any cost score are dropped from the ranking.
+ const byCost = providers.map((id) => {
2682
+ const runs = successByProvider.get(id) ?? [];
2683
+ const costs = runs.map((r) => {
2684
+ const s = r.scores.find((s2) => s2.name === "cost");
2685
+ return s && s.value >= 0 ? s.value : void 0;
2686
+ }).filter((c) => c !== void 0);
2687
+ const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
2688
+ return { id, avg };
2689
+ }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
2690
// The overall winner is the provider that won the most categories; it stays
// undefined when the top spot is tied or nobody won a category.
+ let overallWinner;
2691
+ if (multi) {
2692
+ const wins = /* @__PURE__ */ new Map();
2693
+ for (const id of providers) wins.set(id, 0);
2694
+ if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
2695
+ if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
2696
+ if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
2697
+ const maxWins = Math.max(...wins.values());
2698
+ if (maxWins > 0) {
2699
+ const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
2700
+ if (tops.length === 1) overallWinner = tops[0][0];
2701
+ }
2702
+ }
2703
// Collapse repeated (provider, error) pairs into one entry with a count.
+ const errorResults = results.filter((r) => r.error);
2704
+ const deduped = dedupeErrors(errorResults);
2705
// Page assembly. Provider ids are HTML-escaped where interpolated directly;
// the tab bar is rendered only when there is more than one task.
+ return `<!DOCTYPE html>
2706
+ <html lang="en">
2707
+ <head>
2708
+ <meta charset="UTF-8">
2709
+ <meta name="viewport" content="width=device-width, initial-scale=1">
2710
+ <title>Agent Duelist Report</title>
2711
+ <meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${providers.length !== 1 ? "s" : ""}, ${tasks.length} task${tasks.length !== 1 ? "s" : ""}">
2712
+ <meta property="og:title" content="Agent Duelist Report">
2713
+ <meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
2714
+ <meta property="og:type" content="website">
2715
+ ${renderStyle()}
2716
+ </head>
2717
+ <body>
2718
+ <div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
2719
+ <div class="report">
2720
+
2721
+ ${renderHeader(runsLabel, providers.length, tasks.length)}
2722
+
2723
+ ${tasks.length > 1 ? renderTabs(tasks) : ""}
2724
+
2725
+ <main>
2726
+ ${taskSections.map((s, i) => renderTaskSection(
2727
+ s.task,
2728
+ s.providerData,
2729
+ s.columnStats,
2730
+ s.medals,
2731
+ s.winnerId,
2732
+ scorerNames,
2733
+ hasCost,
2734
+ multi,
2735
+ i
2736
+ )).join("\n")}
2737
+ </main>
2738
+
2739
+ ${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}
2740
+
2741
+ ${deduped.length > 0 ? renderErrors(deduped) : ""}
2742
+
2743
+ ${renderFooter()}
2744
+
2745
+ </div>
2746
+ ${renderScript(tasks.length)}
2747
+ </body>
2748
+ </html>`;
2749
+ }
2750
// Returns the placeholder HTML page used when there are no results: the shared
// stylesheet, a header rendered with "0 runs" and zero providers/tasks, a
// "No results to display." message and the shared footer.
+ function emptyReport() {
2751
+ return `<!DOCTYPE html>
2752
+ <html lang="en">
2753
+ <head>
2754
+ <meta charset="UTF-8">
2755
+ <meta name="viewport" content="width=device-width, initial-scale=1">
2756
+ <title>Agent Duelist Report</title>
2757
+ ${renderStyle()}
2758
+ </head>
2759
+ <body>
2760
+ <div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
2761
+ <div class="report">
2762
+ ${renderHeader("0 runs", 0, 0)}
2763
+ <main><p class="empty-msg">No results to display.</p></main>
2764
+ ${renderFooter()}
2765
+ </div>
2766
+ </body>
2767
+ </html>`;
2768
+ }
2769
/**
 * Collapses repeated failures into one entry per (provider, error message).
 *
 * @param {Array<{providerId: string, error?: string}>} errorResults Failed results.
 * @returns {Array<{providerId: string, error: string, count: number, hint: (string|undefined)}>}
 *   Entries in first-seen order, each with the number of occurrences and an
 *   API-key hint when the message looks like an authentication failure.
 */
function dedupeErrors(errorResults) {
  const buckets = /* @__PURE__ */ new Map();
  for (const { providerId, error } of errorResults) {
    const key = `${providerId}::${error}`;
    const bucket = buckets.get(key);
    if (bucket) {
      bucket.count++;
      continue;
    }
    buckets.set(key, {
      providerId,
      error: error ?? "Unknown error",
      count: 1,
      hint: apiKeyHint(providerId, error ?? "")
    });
  }
  return Array.from(buckets.values());
}
2787
/**
 * Returns the report's stylesheet wrapped in a single `<style>` element.
 *
 * The CSS is static: theme variables, the animated background mesh, header,
 * task tabs, results table, rank colours, winner banner, summary cards, error
 * list, footer, empty state and one narrow-screen media query.
 *
 * @returns {string} The `<style>…</style>` markup.
 */
function renderStyle() {
  const stylesheet = `<style>
:root {
--bg: #0f172a;
--bg-deep: #020617;
--panel: rgba(15, 23, 42, 0.85);
--accent: #f59e0b;
--accent-soft: rgba(245, 158, 11, 0.15);
--text: #e2e8f0;
--muted: #94a3b8;
--border: rgba(148, 163, 184, 0.15);
--green: #22c55e;
--red: #ef4444;
--yellow: #eab308;
--radius: 12px;
--mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
--sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
html, body {
font-family: var(--sans);
background: var(--bg);
color: var(--text);
min-height: 100vh;
}
body { padding: 24px; display: flex; justify-content: center; }

/* Animated gradient mesh */
.bg-mesh {
position: fixed; inset: 0; z-index: 0;
overflow: hidden; pointer-events: none;
}
.bg-mesh::before, .bg-mesh::after {
content: ""; position: absolute; border-radius: 50%;
filter: blur(120px); opacity: 0.4;
}
.bg-mesh::before {
width: 600px; height: 600px;
background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
top: -10%; left: -5%;
animation: meshDrift1 18s ease-in-out infinite alternate;
}
.bg-mesh::after {
width: 500px; height: 500px;
background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
bottom: -10%; right: -5%;
animation: meshDrift2 22s ease-in-out infinite alternate;
}
.bg-mesh-extra {
position: absolute; width: 400px; height: 400px;
border-radius: 50%; filter: blur(100px); opacity: 0.3;
background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
top: 50%; left: 60%;
animation: meshDrift3 15s ease-in-out infinite alternate;
}
@keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
@keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
@keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }

/* Report container */
.report {
position: relative; z-index: 1;
width: 100%; max-width: 960px;
}

/* Header */
.report-header {
display: flex; justify-content: space-between; align-items: center;
padding: 20px 0; margin-bottom: 8px;
}
.report-brand {
display: flex; align-items: center; gap: 10px;
text-decoration: none; color: var(--muted);
font-weight: 600; font-size: 14px;
letter-spacing: 0.04em; text-transform: uppercase;
}
.report-brand:hover { color: var(--text); }
.brand-icon {
width: 32px; height: 32px; border-radius: 8px;
background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
border: 1px solid rgba(245,158,11,0.3);
display: flex; align-items: center; justify-content: center;
font-size: 16px;
}
.report-meta {
font-size: 12px; color: var(--muted);
text-align: right; line-height: 1.6;
}

/* Task tabs */
.task-tabs {
display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
}
.task-tab {
padding: 6px 16px; border-radius: 999px;
border: 1px solid var(--border);
background: transparent; color: var(--muted);
font-size: 13px; font-weight: 500; cursor: pointer;
transition: all 150ms ease;
}
.task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
.task-tab.active {
background: var(--accent-soft);
border-color: rgba(245,158,11,0.4);
color: var(--accent);
}

/* Task sections */
.task-section { display: none; }
.task-section.active { display: block; }
.task-name {
font-size: 18px; font-weight: 600;
margin-bottom: 12px; letter-spacing: -0.01em;
}

/* Results table */
.results-table {
width: 100%; border-collapse: collapse;
font-size: 13px; margin-bottom: 16px;
border-radius: var(--radius); overflow: hidden;
border: 1px solid var(--border);
}
.results-table th, .results-table td {
padding: 10px 14px;
text-align: left;
border-bottom: 1px solid var(--border);
}
.results-table th {
background: rgba(0,0,0,0.3);
font-size: 11px; font-weight: 600;
text-transform: uppercase; letter-spacing: 0.05em;
color: var(--muted); cursor: pointer;
user-select: none; white-space: nowrap;
}
.results-table th:hover { color: var(--text); }
.results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
.results-table tbody tr {
background: var(--panel);
transition: background 120ms ease;
}
.results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
.results-table tbody tr:last-child td { border-bottom: none; }

/* Score cell with progress bar */
.score-cell { position: relative; min-width: 90px; }
.score-bar {
position: absolute; left: 0; bottom: 0;
height: 3px; border-radius: 2px;
transition: width 300ms ease;
}
.score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }

/* Color ranking */
.rank-best { color: var(--green); font-weight: 600; }
.rank-worst { color: var(--red); }
.rank-mid { color: var(--yellow); }
.rank-neutral { color: var(--text); }
.rank-error { color: var(--muted); }

/* Winner banner */
.task-winner {
display: flex; align-items: center; gap: 10px;
padding: 12px 18px; margin-bottom: 20px;
border-radius: var(--radius);
background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
border: 1px solid rgba(34,197,94,0.2);
font-size: 14px; font-weight: 500;
}
.task-winner .trophy { font-size: 20px; }
.task-winner .winner-name { color: var(--green); font-weight: 600; }
.task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }

/* Summary cards */
.summary-section { margin-top: 32px; }
.summary-title {
font-size: 16px; font-weight: 600;
margin-bottom: 12px; color: var(--text);
}
.summary-cards {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 12px;
}
.summary-card {
padding: 16px; border-radius: var(--radius);
border: 1px solid var(--border);
background: var(--panel);
}
.summary-card .card-label {
font-size: 11px; font-weight: 600;
text-transform: uppercase; letter-spacing: 0.05em;
color: var(--muted); margin-bottom: 6px;
}
.summary-card .card-value {
font-size: 20px; font-weight: 700;
color: var(--green); font-family: var(--mono);
}
.summary-card .card-provider {
font-size: 12px; color: var(--muted); margin-top: 4px;
}

/* Errors */
.errors-section { margin-top: 24px; }
.errors-title {
font-size: 16px; font-weight: 600;
margin-bottom: 8px; color: var(--red);
cursor: pointer;
}
.errors-list {
border-radius: var(--radius);
border: 1px solid rgba(239,68,68,0.2);
background: rgba(239,68,68,0.04);
overflow: hidden;
}
.error-item {
padding: 10px 16px;
border-bottom: 1px solid rgba(239,68,68,0.1);
font-size: 13px;
}
.error-item:last-child { border-bottom: none; }
.error-provider { font-weight: 600; color: var(--text); }
.error-msg { color: var(--muted); margin-left: 8px; }
.error-count { color: var(--muted); font-size: 11px; }
.error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }

/* Footer */
.report-footer {
margin-top: 40px; padding: 20px 0;
border-top: 1px solid var(--border);
display: flex; justify-content: space-between; align-items: center;
flex-wrap: wrap; gap: 12px;
}
.footer-brand {
font-size: 13px; color: var(--muted);
}
.footer-brand a {
color: var(--accent); text-decoration: none; font-weight: 500;
}
.footer-brand a:hover { text-decoration: underline; }
.footer-cta {
display: inline-flex; align-items: center; gap: 6px;
padding: 6px 14px; border-radius: 8px;
background: var(--accent-soft);
border: 1px solid rgba(245,158,11,0.3);
color: var(--accent); font-size: 12px; font-weight: 500;
text-decoration: none;
transition: transform 120ms ease, box-shadow 120ms ease;
}
.footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }

/* Empty state */
.empty-msg {
text-align: center; color: var(--muted);
padding: 60px 20px; font-size: 16px;
}

/* Responsive */
@media (max-width: 640px) {
body { padding: 12px; }
.report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
.report-meta { text-align: left; }
.summary-cards { grid-template-columns: 1fr; }
.results-table { font-size: 12px; }
.results-table th, .results-table td { padding: 8px 10px; }
.report-footer { flex-direction: column; align-items: flex-start; }
}
</style>`;
  return stylesheet;
}
3055
/**
 * Renders the report header: the brand link on the left and a meta block
 * (provider count, task count, runs label, generation time) on the right.
 *
 * @param {string} runsLabel     Text describing the number of runs; HTML-escaped.
 * @param {number} providerCount Number of providers; pluralised unless exactly 1.
 * @param {number} taskCount     Number of tasks; pluralised unless exactly 1.
 * @returns {string} The `<header>` markup.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  // ISO timestamp trimmed to seconds, e.g. "2024-01-31 12:34:56 UTC".
  const isoNow = new Date().toISOString();
  const now = isoNow.replace("T", " ").slice(0, 19) + " UTC";
  const providerSuffix = providerCount !== 1 ? "s" : "";
  const taskSuffix = taskCount !== 1 ? "s" : "";
  return `<header class="report-header">
<a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">
<div class="brand-icon">&#x2B21;</div>
<span>Agent Duelist</span>
</a>
<div class="report-meta">
${providerCount} provider${providerSuffix} &middot;
${taskCount} task${taskSuffix} &middot;
${esc(runsLabel)}<br>
${esc(now)}
</div>
</header>`;
}
3070
/**
 * Renders the tab strip used to switch between task sections.
 *
 * Each button carries its task index in `data-task`; the first one starts
 * with the `active` class.
 *
 * @param {string[]} tasks Task names, HTML-escaped before output.
 * @returns {string} The `<nav>` markup.
 */
function renderTabs(tasks) {
  const toButton = (taskName, position) => {
    const activeClass = position === 0 ? " active" : "";
    return `<button class="task-tab${activeClass}" data-task="${position}">${esc(taskName)}</button>`;
  };
  const buttons = tasks.map(toButton).join("\n ");
  return `<nav class="task-tabs">
${buttons}
</nav>`;
}
3078
/**
 * Renders one task's section: heading, results table and optional winner banner.
 *
 * Columns are derived from `scorerNames`: "latency" becomes a Latency column,
 * "cost" becomes Cost plus Tokens, and any other name becomes a score column
 * labelled via `scorerLabel`. Providers flagged `allErrors` get a dash in
 * every data column.
 *
 * @param {string} task         Task name (escaped for the heading).
 * @param {Array}  providerData Per-provider aggregates; rows follow this order.
 * @param {Map}    columnStats  Per-column stats, forwarded to `renderDataCell`.
 * @param {Map}    medals       providerId -> medal kind; missing entries use "none".
 * @param {string} winnerId     Winning provider id; falsy suppresses the banner.
 * @param {Iterable<string>} scorerNames Scorer names that define the columns.
 * @param {*}      _hasCost     Unused.
 * @param {boolean} multi       Forwarded to `renderDataCell`.
 * @param {number} index        Section index; section 0 starts active.
 * @returns {string} The `<section>` markup.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  const column = (label, key, isScore) => ({ label, key, isScore });
  const cols = [column("Provider", "provider", false)];
  for (const name of scorerNames) {
    switch (name) {
      case "latency":
        cols.push(column("Latency", "latency", false));
        break;
      case "cost":
        cols.push(column("Cost", "cost", false));
        cols.push(column("Tokens", "tokens", false));
        break;
      default:
        cols.push(column(scorerLabel(name), name, true));
    }
  }

  const ths = cols
    .map(({ key, label }) => `<th data-col="${esc(key)}">${esc(label)}<span class="sort-arrow"></span></th>`)
    .join("");

  const dataCols = cols.slice(1);
  const rows = providerData
    .map((pd) => {
      const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
      const medalPrefix = medal ? `${medal} ` : "";
      const cells = [`<td>${medalPrefix}${esc(pd.providerId)}</td>`];
      if (pd.allErrors) {
        // Nothing succeeded for this provider: fill every data column with a dash.
        dataCols.forEach(() => cells.push(`<td class="rank-error">&mdash;</td>`));
      } else {
        dataCols.forEach((col) => {
          cells.push(renderDataCell(col.key, col.isScore, pd, columnStats, multi));
        });
      }
      return `<tr>${cells.join("")}</tr>`;
    })
    .join("\n");

  let winnerHtml = "";
  if (winnerId) {
    winnerHtml = `<div class="task-winner">
<span class="trophy">&#x1F3C6;</span>
<span>Winner: <span class="winner-name">${esc(winnerId)}</span>
<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>
</div>`;
  }

  const activeClass = index === 0 ? " active" : "";
  return `<section class="task-section${activeClass}" data-task-idx="${index}">
<h2 class="task-name">${esc(task)}</h2>
<table class="results-table">
<thead><tr>${ths}</tr></thead>
<tbody>${rows}</tbody>
</table>
${winnerHtml}
</section>`;
}
3125
/**
 * Renders one `<td>` of the results table for the given column.
 *
 * - "latency" reads `pd.latencyMs` and prints rounded milliseconds.
 * - "cost" reads `pd.avgDetails.costUsd` and prints it via `formatCost`.
 * - "tokens" reads `pd.avgDetails.totalTokens` and prints it as-is.
 * - Any other key is a score from `pd.avgScores[key]`, printed as a
 *   percentage with a coloured progress bar.
 *
 * A missing value renders a dash. Numeric cells carry the raw value in
 * `data-sort-val`.
 *
 * The bar width is clamped to 0..100 and falls back to 0 for non-finite
 * values, so out-of-range scores never emit invalid CSS such as
 * `width:-100%` or `width:NaN%`. The printed percentage is not clamped.
 *
 * @param {string} key          Column key.
 * @param {boolean} _isScore    Unused.
 * @param {object} pd           Provider aggregate (`latencyMs`, `avgDetails`, `avgScores`).
 * @param {Map}    columnStats  key -> stats object passed to `rankClass_`.
 * @param {boolean} multi       When truthy and stats exist, cells are ranked via `rankClass_`.
 * @returns {string} The `<td>` markup.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const colStats = columnStats.get(key);
  const dashCell = `<td class="rank-error">&mdash;</td>`;
  // Ranking only applies when this column has stats and multi is set.
  const relativeRank = (value) => (multi && colStats ? rankClass_(value, colStats) : "rank-neutral");

  if (key === "latency") {
    const ms = pd.latencyMs;
    if (ms === undefined) return dashCell;
    return `<td class="${relativeRank(ms)}" data-sort-val="${ms}">${Math.round(ms)}ms</td>`;
  }
  if (key === "cost") {
    const cost = pd.avgDetails.costUsd;
    if (cost === undefined) return dashCell;
    return `<td class="${relativeRank(cost)}" data-sort-val="${cost}">${esc(formatCost(cost))}</td>`;
  }
  if (key === "tokens") {
    const tokens = pd.avgDetails.totalTokens;
    if (tokens === undefined) return dashCell;
    return `<td class="${relativeRank(tokens)}" data-sort-val="${tokens}">${tokens}</td>`;
  }

  const val = pd.avgScores[key];
  if (val === undefined) return dashCell;
  const pct = Math.round(val * 100);
  // Without column stats to rank against, colour by absolute thresholds instead.
  const absoluteRank = val >= 0.8 ? "rank-best" : val >= 0.5 ? "rank-mid" : "rank-worst";
  const rankCls = multi && colStats ? rankClass_(val, colStats) : absoluteRank;
  const barColor = val >= 0.8 ? "var(--green)" : val >= 0.5 ? "var(--yellow)" : "var(--red)";
  const barWidth = Number.isFinite(pct) ? Math.min(100, Math.max(0, pct)) : 0;
  return `<td class="score-cell ${rankCls}" data-sort-val="${val}">
<span class="score-val">${pct}%</span>
<div class="score-bar" style="width:${barWidth}%;background:${barColor}"></div>
</td>`;
}
3160
/**
 * Maps a value to a rank CSS class relative to a column's best and worst.
 *
 * @param {number} value    The cell value.
 * @param {{best?: number, worst?: number}} colStats Column extremes.
 * @returns {string} "rank-neutral" when either extreme is missing or both
 *   are equal; otherwise "rank-best", "rank-worst" or "rank-mid".
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  const undetermined = best === undefined || worst === undefined || best === worst;
  if (undetermined) {
    return "rank-neutral";
  }
  switch (value) {
    case best:
      return "rank-best";
    case worst:
      return "rank-worst";
    default:
      return "rank-mid";
  }
}
3167
/**
 * Renders the "Summary" section as a grid of highlight cards.
 *
 * Up to four cards are emitted, in this order: correctness, latency (skipped
 * when the average is Infinity), cost (skipped when the average is
 * undefined) and overall winner. With `multi` set, cards are titled as
 * superlatives and name the provider; otherwise they are titled as averages.
 *
 * @param {{id: string, avg: number}=} byCorrectness Correctness leader.
 * @param {{id: string, avg: number}=} byLatency     Latency leader.
 * @param {{id: string, avg?: number}=} byCost       Cost leader.
 * @param {string=} overallWinner Provider id of the overall winner.
 * @param {boolean} multi  Whether to use the multi-provider labels.
 * @returns {string} The `<section>` markup, or "" when no card applies.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  const providerLine = (id) => `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  const card = (label, value, footer) => `<div class="summary-card">
<div class="card-label">${label}</div>
<div class="card-value">${value}</div>
${footer}
</div>`;

  const cards = [];
  if (byCorrectness) {
    const pct = `${Math.round(byCorrectness.avg * 100)}%`;
    const footer = multi ? providerLine(byCorrectness.id) : "";
    cards.push(card(multi ? "Most Correct" : "Avg Correctness", pct, footer));
  }
  if (byLatency && byLatency.avg !== Infinity) {
    const ms = `${Math.round(byLatency.avg)}ms`;
    const footer = multi ? providerLine(byLatency.id) : "";
    cards.push(card(multi ? "Fastest" : "Avg Latency", ms, footer));
  }
  if (byCost?.avg !== undefined) {
    const cost = esc(formatCost(byCost.avg));
    const footer = multi ? providerLine(byCost.id) : "";
    cards.push(card(multi ? "Cheapest" : "Avg Cost", cost, footer));
  }
  if (overallWinner) {
    cards.push(card("Overall Winner", "&#x1F3C6;", providerLine(overallWinner)));
  }

  if (cards.length === 0) {
    return "";
  }
  return `<section class="summary-section">
<h2 class="summary-title">Summary</h2>
<div class="summary-cards">
${cards.join("\n ")}
</div>
</section>`;
}
3211
/**
 * Renders the "Errors" section: a clickable title and the list of errors.
 *
 * Clicking the title toggles the list between hidden and shown. The
 * previous handler assigned 'block' in both branches of its ternary, so the
 * list could never be collapsed; the else-branch is now 'none'.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   Error entries; a "(×N)" suffix is added when `count` exceeds 1 and the
 *   hint line is added when `hint` is truthy.
 * @returns {string} The `<section>` markup.
 */
function renderErrors(errors) {
  const items = errors.map((e) => {
    const suffix = e.count > 1 ? ` <span class="error-count">(&times;${e.count})</span>` : "";
    const hint = e.hint ? `<div class="error-hint">${esc(e.hint)}</div>` : "";
    return `<div class="error-item">
<span class="error-provider">${esc(e.providerId)}:</span>
<span class="error-msg">${esc(e.error)}</span>${suffix}
${hint}
</div>`;
  }).join("\n");
  // An empty inline display value (the initial state) counts as visible, so the first click hides the list.
  return `<section class="errors-section">
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
<div class="errors-list">
${items}
</div>
</section>`;
}
3228
/**
 * Renders the static report footer: a "Powered by" credit and a
 * "Star on GitHub" call-to-action, both linking to the project repository.
 *
 * @returns {string} The `<footer>` markup.
 */
function renderFooter() {
  const footer = `<footer class="report-footer">
<div class="footer-brand">
Powered by <a href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">Agent Duelist</a>
</div>
<a class="footer-cta" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">
&#x2B50; Star on GitHub
</a>
</footer>`;
  return footer;
}
3238
/**
 * Returns the inline `<script>` that powers the report page: tab switching
 * (emitted only when there is more than one task) and click-to-sort columns.
 *
 * Fixes over the previous version:
 * - The sort column was the `forEach` index over every `th` in the
 *   document, so in a multi-table report any table after the first looked
 *   up a non-existent cell and threw. The column is now `th.cellIndex`,
 *   which is relative to the header's own row.
 * - Rows lacking a cell in the sorted column compare as equal rather than
 *   throwing.
 * - `parseInt` on the tab index is given an explicit radix.
 *
 * @param {number} taskCount Number of task sections on the page.
 * @returns {string} The `<script>…</script>` markup.
 */
function renderScript(taskCount) {
  const tabScript = taskCount > 1 ? `
var tabs = document.querySelectorAll('.task-tab');
var sections = document.querySelectorAll('.task-section');
tabs.forEach(function(tab) {
tab.addEventListener('click', function() {
var idx = parseInt(tab.getAttribute('data-task'), 10);
tabs.forEach(function(t) { t.classList.remove('active'); });
sections.forEach(function(s) { s.classList.remove('active'); });
tab.classList.add('active');
sections[idx].classList.add('active');
});
});` : "";
  return `<script>
(function() {
/* Tab switching */
${tabScript}

/* Column sorting */
document.querySelectorAll('.results-table th').forEach(function(th) {
var table = th.closest('table');
var asc = true;
th.addEventListener('click', function() {
/* cellIndex is relative to this header's own row, so it is correct for every table on the page */
var colIdx = th.cellIndex;
var tbody = table.querySelector('tbody');
var rows = Array.from(tbody.querySelectorAll('tr'));
rows.sort(function(a, b) {
var aCell = a.children[colIdx];
var bCell = b.children[colIdx];
if (!aCell || !bCell) return 0;
var aVal = aCell.getAttribute('data-sort-val');
var bVal = bCell.getAttribute('data-sort-val');
if (aVal !== null && bVal !== null) {
return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
}
var aText = aCell.textContent || '';
var bText = bCell.textContent || '';
return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
});
rows.forEach(function(row) { tbody.appendChild(row); });

/* Update sort arrows */
table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
th.querySelector('.sort-arrow').textContent = asc ? ' \\u25B2' : ' \\u25BC';
asc = !asc;
});
});
})();
</script>`;
}
3285
+
3286
+ // src/ci.ts
3287
+ var import_node_fs = require("fs");
3288
+ var import_node_path = require("path");
3289
// Scorers for which a smaller value is the better outcome.
var LOWER_IS_BETTER = new Set(["cost"]);

// A result is reported as flaky when its coefficient of variation exceeds this.
var FLAKY_CV_THRESHOLD = 0.3;

// Critical values keyed by degrees of freedom; the table is sparse above 10.
var T_CRITICAL_95 = {
  1: 12.706,
  2: 4.303,
  3: 3.182,
  4: 2.776,
  5: 2.571,
  6: 2.447,
  7: 2.365,
  8: 2.306,
  9: 2.262,
  10: 2.228,
  15: 2.131,
  20: 2.086,
  25: 2.06,
  30: 2.042
};

// Table keys as ascending numbers, used to bracket values between entries.
var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95)
  .map(Number)
  .sort((a, b) => a - b);

/**
 * Looks up the critical value for the given degrees of freedom.
 *
 * Exact table entries are returned directly; a value strictly between two
 * entries is linearly interpolated. Anything else (df <= 0, df above the
 * largest entry, or a value no bracket matches) yields 1.96.
 *
 * @param {number} df Degrees of freedom.
 * @returns {number} The critical value.
 */
function tCritical(df) {
  const fallback = 1.96;
  if (df <= 0) {
    return fallback;
  }
  const exact = T_CRITICAL_95[df];
  if (exact !== undefined) {
    return exact;
  }
  const keys = T_CRITICAL_KEYS;
  if (df > keys[keys.length - 1]) {
    return fallback;
  }
  // Index of the upper neighbour of the bracket that strictly contains df.
  const upperIdx = keys.findIndex((key, i) => i > 0 && df > keys[i - 1] && df < key);
  if (upperIdx === -1) {
    return fallback;
  }
  const low = keys[upperIdx - 1];
  const high = keys[upperIdx];
  const ratio = (df - low) / (high - low);
  return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
}
3322
/**
 * Summarises numeric samples: mean, sample standard deviation (n - 1
 * divisor), coefficient of variation and a confidence interval built from
 * `tCritical(n - 1)` times the standard error.
 *
 * Empty input yields all zeros. A single sample yields zero spread and an
 * interval collapsed onto the mean. `cv` is 0 when the mean is 0.
 *
 * @param {number[]} samples Observed values.
 * @returns {{mean: number, stddev: number, cv: number, n: number, ci95Lower: number, ci95Upper: number}}
 */
function computeScorerStats(samples) {
  const n = samples.length;
  if (n === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  let total = 0;
  for (const x of samples) {
    total = total + x;
  }
  const mean = total / n;
  if (n === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  let squaredDeviations = 0;
  for (const x of samples) {
    squaredDeviations = squaredDeviations + (x - mean) ** 2;
  }
  const stddev = Math.sqrt(squaredDeviations / (n - 1));
  const cv = mean !== 0 ? stddev / Math.abs(mean) : 0;
  const se = stddev / Math.sqrt(n);
  const t = tCritical(n - 1);
  return {
    mean,
    stddev,
    cv,
    n,
    ci95Lower: mean - t * se,
    ci95Upper: mean + t * se
  };
}
3345
/**
 * Builds the composite key under which stats are stored, of the form
 * "provider::task::scorer".
 *
 * @param {string} providerId Provider identifier.
 * @param {string} taskName   Task name.
 * @param {string} scorerName Scorer name.
 * @returns {string} The three parts joined by "::".
 */
function groupKey(providerId, taskName, scorerName) {
  const separator = "::";
  // String#concat stringifies its arguments the same way a template literal does.
  return "".concat(providerId, separator, taskName, separator, scorerName);
}
3348
/**
 * Aggregates raw results into per-(provider, task, scorer) statistics.
 *
 * Results with a truthy `error` are ignored, as are individual scores whose
 * value is negative. Remaining values are grouped under `groupKey(...)` and
 * each group is summarised by `computeScorerStats`.
 *
 * @param {Iterable<{providerId: string, taskName: string, error?: *, scores: Array<{name: string, value: number}>}>} results
 * @returns {Map<string, object>} Stats by group key, in first-seen order.
 */
function computeStats(results) {
  const samplesByKey = new Map();
  for (const result of results) {
    if (result.error) {
      continue;
    }
    for (const { name, value } of result.scores) {
      if (value < 0) {
        continue;
      }
      const key = groupKey(result.providerId, result.taskName, name);
      let bucket = samplesByKey.get(key);
      if (bucket === undefined) {
        bucket = [];
        samplesByKey.set(key, bucket);
      }
      bucket.push(value);
    }
  }
  return new Map(
    Array.from(samplesByKey, ([key, samples]) => [key, computeScorerStats(samples)])
  );
}
3365
/**
 * Totals the estimated spend across results and checks it against a budget.
 *
 * A result contributes only when it has no error, has a "cost" score with a
 * non-negative value, and that score's `details.estimatedUsd` is positive.
 *
 * @param {Iterable<object>} results Raw results carrying `scores`.
 * @param {number=} budget  Optional spending limit in USD.
 * @returns {{totalUsd: number, perProvider: Map<string, number>, budget: (number|undefined), overBudget: boolean}}
 *   `overBudget` is true only when a budget is given and the total exceeds it.
 */
function computeCostSummary(results, budget) {
  const perProvider = new Map();
  let totalUsd = 0;

  for (const result of results) {
    if (result.error) continue;

    const costScore = result.scores.find((score) => score.name === "cost");
    if (!costScore || costScore.value < 0) continue;

    const usd = costScore.details?.estimatedUsd ?? 0;
    if (usd <= 0) continue;

    totalUsd += usd;
    const spentSoFar = perProvider.get(result.providerId) ?? 0;
    perProvider.set(result.providerId, spentSoFar + usd);
  }

  const overBudget = budget !== undefined && totalUsd > budget;
  return { totalUsd, perProvider, budget, overBudget };
}
3385
/**
 * Splits a "provider::task::scorer" stats key back into its three parts.
 *
 * The provider is the text before the first "::" and the scorer the text
 * after the last "::"; everything in between is the task name. This keeps
 * task names that themselves contain "::" intact, where a plain
 * `split("::")` would truncate the task and return the wrong scorer. Keys
 * with fewer than two separators fall back to a plain split.
 *
 * @param {string} key Composite stats key.
 * @returns {[string, (string|undefined), (string|undefined)]} [providerId, taskName, scorerName]
 */
function splitStatsKey_(key) {
  const separator = "::";
  const first = key.indexOf(separator);
  const last = key.lastIndexOf(separator);
  if (first === -1 || first === last) {
    const [providerId, taskName, scorerName] = key.split(separator);
    return [providerId, taskName, scorerName];
  }
  return [
    key.slice(0, first),
    key.slice(first + separator.length, last),
    key.slice(last + separator.length)
  ];
}

/**
 * Compares current stats with an optional baseline and decides whether the
 * run should fail.
 *
 * One comparison is produced per entry of `currentStats`. With a matching
 * baseline entry, `delta` is the difference of means, and regression or
 * improvement is evaluated only when `thresholds` has an entry for that
 * scorer. A comparison is flaky when it has more than one sample and its
 * coefficient of variation exceeds `FLAKY_CV_THRESHOLD`.
 *
 * The run fails when any comparison regressed or the total cost exceeds the
 * budget; each cause adds one line to `failureReasons`.
 *
 * @param {Map<string, object>|null|undefined} baselineStats Baseline stats by key.
 * @param {Map<string, object>} currentStats Current stats by key.
 * @param {Map<string, number>} thresholds   Per-scorer thresholds.
 * @param {number=} budget         Optional cost budget in USD.
 * @param {Array=}  currentResults Raw current results for cost accounting; defaults to [].
 * @returns {{comparisons: Array, cost: object, failed: boolean, flakyResults: Array, failureReasons: string[]}}
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  const comparisons = [];
  const failureReasons = [];
  for (const [key, current] of currentStats) {
    const [providerId, taskName, scorerName] = splitStatsKey_(key);
    const baseline = baselineStats?.get(key) ?? null;
    let delta = null;
    let regressed = false;
    let improved = false;
    if (baseline) {
      delta = current.mean - baseline.mean;
      const threshold = thresholds.get(scorerName);
      // Scorers without a configured threshold are reported but never gate the run.
      if (threshold !== undefined) {
        const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
        regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
        improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
      }
    }
    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
    comparisons.push({
      providerId,
      taskName,
      scorerName,
      baseline,
      current,
      delta,
      regressed,
      improved,
      flaky
    });
  }
  const cost = computeCostSummary(currentResults ?? [], budget);
  for (const r of comparisons) {
    if (!r.regressed) continue;
    failureReasons.push(
      `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta(r.delta)}`
    );
  }
  if (cost.overBudget) {
    // overBudget is only ever true when a budget was supplied, so cost.budget is a number here.
    failureReasons.push(
      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
    );
  }
  const flakyResults = comparisons.filter((c) => c.flaky);
  const failed = failureReasons.length > 0;
  return { comparisons, cost, failed, flakyResults, failureReasons };
}
3434
/**
 * Decides whether `current` is a regression relative to `baseline`.
 *
 * With a single sample on both sides the means are compared directly
 * against the threshold. Otherwise the interval bounds are compared.
 *
 * NOTE(review): the higher-is-better branch measures baseline upper bound
 * minus current lower bound (plus a mean check), whereas the
 * lower-is-better branch here and both branches of detectImprovement
 * require the intervals to be separated by more than the threshold.
 * Confirm this asymmetry is intended.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold       Minimum difference that counts.
 * @param {boolean} lowerIsBetter  True when smaller values are better.
 * @returns {boolean} True when a regression is detected.
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  const singleSamples = baseline.n === 1 && current.n === 1;
  if (singleSamples) {
    const delta = current.mean - baseline.mean;
    return lowerIsBetter ? delta > threshold : delta < -threshold;
  }
  if (lowerIsBetter) {
    return current.ci95Lower - baseline.ci95Upper > threshold;
  }
  const boundGap = baseline.ci95Upper - current.ci95Lower;
  return boundGap > threshold && current.mean < baseline.mean;
}
3445
/**
 * Decides whether `current` is an improvement over `baseline`.
 *
 * With a single sample on both sides the means are compared directly.
 * Otherwise the better side's interval must clear the other side's
 * interval by more than the threshold.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold       Minimum difference that counts.
 * @param {boolean} lowerIsBetter  True when smaller values are better.
 * @returns {boolean} True when an improvement is detected.
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  const singleSamples = baseline.n === 1 && current.n === 1;
  if (singleSamples) {
    const delta = current.mean - baseline.mean;
    return lowerIsBetter ? delta < -threshold : delta > threshold;
  }
  const gap = lowerIsBetter
    ? baseline.ci95Lower - current.ci95Upper
    : current.ci95Lower - baseline.ci95Upper;
  return gap > threshold;
}
3456
/**
 * Reads a baseline file from disk.
 *
 * Two shapes are accepted: an object with a `results` array (and optional
 * `timestamp`), or a bare array of results. Read failures, invalid JSON and
 * any other shape yield null.
 *
 * @param {string} path Location of the baseline JSON file.
 * @returns {{timestamp: *, results: Array}|null} The parsed baseline;
 *   `timestamp` is "unknown" when the file does not carry one.
 */
function loadBaseline(path) {
  let data;
  try {
    const raw = (0, import_node_fs.readFileSync)(path, "utf-8");
    data = JSON.parse(raw);
    const results = data.results ?? data;
    if (Array.isArray(results)) {
      return { timestamp: data.timestamp ?? "unknown", results };
    }
  } catch {
    // Best effort: an unreadable or malformed baseline is treated as absent.
  }
  return null;
}
3470
/**
 * Writes results to disk as a baseline file, creating parent directories
 * as needed.
 *
 * The file is pretty-printed JSON of the form `{ timestamp, results }`,
 * where `timestamp` is the current time in ISO format.
 *
 * @param {string} path    Destination file path.
 * @param {Array}  results Results to persist.
 * @returns {void}
 */
function saveBaseline(path, results) {
  const parentDir = (0, import_node_path.dirname)(path);
  (0, import_node_fs.mkdirSync)(parentDir, { recursive: true });
  const timestamp = new Date().toISOString();
  const serialized = JSON.stringify({ timestamp, results }, null, 2);
  (0, import_node_fs.writeFileSync)(path, serialized);
}
3478
+
3479
+ // src/github.ts
3480
+ var import_node_fs2 = require("fs");
3481
/**
 * Derives the pull-request context from the process environment.
 *
 * Requires GITHUB_TOKEN and a GITHUB_REPOSITORY of the form "owner/repo".
 * The PR number comes from the event payload at GITHUB_EVENT_PATH (either
 * `pull_request.number`, or `issue.number` when the issue has a
 * `pull_request` field), falling back to DUELIST_PR_NUMBER.
 *
 * @returns {{token: string, owner: string, repo: string, prNumber: number}|null}
 *   The context, or null when any required piece is missing.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath
  } = process.env;
  if (!token || !repository) {
    return null;
  }
  const [owner, repo] = repository.split("/");
  if (!owner || !repo) {
    return null;
  }

  // Returns the PR number from the event payload, or undefined if absent.
  const prNumberFromEvent = (file) => {
    const event = JSON.parse((0, import_node_fs2.readFileSync)(file, "utf-8"));
    let found;
    if (event.pull_request && typeof event.pull_request === "object") {
      found = event.pull_request.number;
    }
    if (!found && event.issue && typeof event.issue === "object" && event.issue.pull_request) {
      found = event.issue.number;
    }
    return found;
  };

  let prNumber;
  if (eventPath) {
    try {
      prNumber = prNumberFromEvent(eventPath);
    } catch {
      // Best effort: an unreadable or malformed payload leaves prNumber unset.
    }
  }
  const override = process.env.DUELIST_PR_NUMBER;
  if (!prNumber && override) {
    prNumber = parseInt(override, 10);
  }
  // Also rejects a NaN produced by a non-numeric override.
  return prNumber ? { token, owner, repo, prNumber } : null;
}
3511
// Root URL for all GitHub REST API calls made by this module.
var API_BASE = "https://api.github.com";

/**
 * Builds the request headers for a GitHub REST API call.
 *
 * @param {string} token  Token sent as a Bearer credential.
 * @param {Object=} extra Additional headers; these override the defaults on conflict.
 * @returns {Object} The merged header map.
 */
function ghHeaders(token, extra) {
  const defaults = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };
  return Object.assign(defaults, extra);
}
3520
/**
 * Searches a pull request's comments for one whose body contains `marker`.
 *
 * Comments are fetched 50 per page, page by page, until a match is found or
 * a page comes back empty or short.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} marker Text identifying the comment.
 * @returns {Promise<*>} The matching comment's `id`, or null when nothing
 *   matches or a page request does not succeed.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  const listUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;
  for (let page = 1; ; page += 1) {
    const res = await fetch(`${listUrl}?per_page=${perPage}&page=${page}`, {
      headers: ghHeaders(ctx.token)
    });
    if (!res.ok) {
      return null;
    }
    const comments = await res.json();
    if (comments.length === 0) {
      return null;
    }
    const match = comments.find((comment) => comment.body?.includes(marker));
    if (match) {
      return match.id;
    }
    // A short page is the last one.
    if (comments.length < perPage) {
      return null;
    }
  }
}
3539
/**
 * Creates the pull-request comment identified by `marker`, or updates it in
 * place when one already exists.
 *
 * An existing comment is PATCHed; otherwise a new one is POSTed. A failed
 * request is reported with `console.warn` and does not throw.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} body   Comment body to publish.
 * @param {string} marker Text used to locate a previously posted comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const existingId = await findExistingComment(ctx, marker);
  const repoUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}`;

  const request = existingId
    ? { verb: "update", method: "PATCH", url: `${repoUrl}/issues/comments/${existingId}` }
    : { verb: "create", method: "POST", url: `${repoUrl}/issues/${ctx.prNumber}/comments` };

  const res = await fetch(request.url, {
    method: request.method,
    headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
    body: JSON.stringify({ body })
  });
  if (res.ok) {
    return;
  }
  const text = await res.text();
  console.warn(`Failed to ${request.verb} PR comment: ${res.status} ${text}`);
}
2195
3565
  // Annotate the CommonJS export names for ESM import in node:
2196
3566
  0 && (module.exports = {
2197
3567
  anthropic,
2198
3568
  azureOpenai,
3569
+ compareResults,
3570
+ computeStats,
2199
3571
  consoleReporter,
2200
3572
  defineArena,
3573
+ detectGitHubContext,
2201
3574
  gemini,
3575
+ htmlReporter,
2202
3576
  jsonReporter,
3577
+ loadBaseline,
3578
+ markdownReporter,
2203
3579
  openai,
2204
3580
  openaiCompatible,
2205
- registerPricing
3581
+ registerPricing,
3582
+ saveBaseline,
3583
+ upsertPrComment
2206
3584
  });
2207
3585
  //# sourceMappingURL=index.cjs.map