agent-duelist 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1410,11 +1410,13 @@ var fuzzySimilarityScorer = ({ task, result }) => {
1410
1410
  }
1411
1411
  const a = stringify(task.expected);
1412
1412
  const b = stringify(result.output);
1413
- const similarity = jaccardSimilarity(tokenize(a), tokenize(b));
1413
+ const setA = tokenize(a);
1414
+ const setB = tokenize(b);
1415
+ const similarity = jaccardSimilarity(setA, setB);
1414
1416
  return {
1415
1417
  name: "fuzzy-similarity",
1416
1418
  value: Math.round(similarity * 100) / 100,
1417
- details: { method: "jaccard", expectedTokens: tokenize(a).size, actualTokens: tokenize(b).size }
1419
+ details: { method: "jaccard", expectedTokens: setA.size, actualTokens: setB.size }
1418
1420
  };
1419
1421
  };
1420
1422
  function stringify(value) {
@@ -1435,7 +1437,163 @@ function jaccardSimilarity(a, b) {
1435
1437
  }
1436
1438
 
1437
1439
  // src/scorers/llm-judge.ts
1440
+ import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
1441
+
1442
+ // src/providers/openai.ts
1438
1443
  import OpenAI, { AzureOpenAI } from "openai";
1444
+ import { zodToJsonSchema } from "zod-to-json-schema";
1445
+
1446
+ // src/providers/shared.ts
1447
+ var SCHEMA_SYSTEM_MESSAGE = "Respond with valid JSON matching the requested schema.";
1448
+ function parseSchemaOutput(rawContent, hasSchema) {
1449
+ if (!hasSchema) return rawContent;
1450
+ try {
1451
+ return JSON.parse(rawContent);
1452
+ } catch {
1453
+ return rawContent;
1454
+ }
1455
+ }
1456
+
1457
+ // src/providers/openai.ts
1458
+ var REQUEST_TIMEOUT_MS = 6e4;
1459
+ function openai(model, options) {
1460
+ const client = new OpenAI({
1461
+ apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
1462
+ baseURL: options?.baseURL,
1463
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
1464
+ });
1465
+ return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
1466
+ }
1467
+ function openaiCompatible(options) {
1468
+ const apiKey = options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0) ?? "no-key";
1469
+ const client = new OpenAI({
1470
+ apiKey,
1471
+ baseURL: options.baseURL,
1472
+ timeout: options.timeoutMs ?? REQUEST_TIMEOUT_MS
1473
+ });
1474
+ if (options.free) {
1475
+ registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
1476
+ }
1477
+ return makeProvider(options.id, options.name, options.model, client, options.model, options.stripThinking);
1478
+ }
1479
+ function azureOpenai(model, options) {
1480
+ const deployment = options?.deployment ?? model;
1481
+ const client = new AzureOpenAI({
1482
+ apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
1483
+ endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
1484
+ apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
1485
+ deployment,
1486
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
1487
+ });
1488
+ return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
1489
+ }
1490
+ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1491
+ return {
1492
+ id,
1493
+ name,
1494
+ model,
1495
+ async run(input) {
1496
+ const start = Date.now();
1497
+ const params = {
1498
+ model: requestModel,
1499
+ messages: [{ role: "user", content: input.prompt }]
1500
+ };
1501
+ if (input.schema) {
1502
+ params.response_format = { type: "json_object" };
1503
+ params.messages = [
1504
+ { role: "system", content: SCHEMA_SYSTEM_MESSAGE },
1505
+ ...params.messages
1506
+ ];
1507
+ }
1508
+ if (input.tools?.length) {
1509
+ params.tools = input.tools.map(toolDefToOpenAI);
1510
+ params.tool_choice = "auto";
1511
+ }
1512
+ const response = await client.chat.completions.create(params, { signal: input.signal });
1513
+ let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
1514
+ let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
1515
+ const choice = response.choices[0];
1516
+ const toolCallsRaw = choice?.message?.tool_calls;
1517
+ const collectedToolCalls = [];
1518
+ let finalResponse = response;
1519
+ if (toolCallsRaw?.length && input.tools?.length) {
1520
+ const toolMessages = [
1521
+ ...params.messages,
1522
+ choice.message
1523
+ ];
1524
+ for (const tc of toolCallsRaw) {
1525
+ const toolDef = input.tools.find((t) => t.name === tc.function.name);
1526
+ let args;
1527
+ try {
1528
+ args = JSON.parse(tc.function.arguments);
1529
+ } catch {
1530
+ args = tc.function.arguments;
1531
+ }
1532
+ let result;
1533
+ if (toolDef?.handler) {
1534
+ result = await toolDef.handler(args);
1535
+ }
1536
+ collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
1537
+ toolMessages.push({
1538
+ role: "tool",
1539
+ tool_call_id: tc.id,
1540
+ content: JSON.stringify(result ?? {})
1541
+ });
1542
+ }
1543
+ const followUp = await client.chat.completions.create({
1544
+ model: requestModel,
1545
+ messages: toolMessages
1546
+ }, { signal: input.signal });
1547
+ totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
1548
+ totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
1549
+ finalResponse = followUp;
1550
+ }
1551
+ const latencyMs = Date.now() - start;
1552
+ const finalChoice = finalResponse.choices[0];
1553
+ let rawContent = finalChoice?.message?.content ?? "";
1554
+ if (stripThinking) {
1555
+ rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
1556
+ }
1557
+ const output = parseSchemaOutput(rawContent, !!input.schema);
1558
+ return {
1559
+ output,
1560
+ usage: {
1561
+ promptTokens: totalPromptTokens || void 0,
1562
+ completionTokens: totalCompletionTokens || void 0
1563
+ },
1564
+ latencyMs,
1565
+ raw: finalResponse,
1566
+ toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
1567
+ };
1568
+ }
1569
+ };
1570
+ }
1571
+ function gemini(model, options) {
1572
+ const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
1573
+ if (!apiKey) {
1574
+ throw new Error(
1575
+ `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
1576
+ );
1577
+ }
1578
+ const client = new OpenAI({
1579
+ apiKey,
1580
+ baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
1581
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
1582
+ });
1583
+ return makeProvider(`google/${model}`, "Google AI", model, client, model);
1584
+ }
1585
+ function toolDefToOpenAI(tool) {
1586
+ return {
1587
+ type: "function",
1588
+ function: {
1589
+ name: tool.name,
1590
+ description: tool.description,
1591
+ parameters: zodToJsonSchema(tool.parameters, { target: "openAi" })
1592
+ }
1593
+ };
1594
+ }
1595
+
1596
+ // src/scorers/llm-judge.ts
1439
1597
  var JUDGE_PROMPT = `You are a strict scoring judge. Evaluate the actual output against the expected output on three criteria. Score each from 0.0 to 1.0 using the full range (not just 0, 0.5, 1).
1440
1598
 
1441
1599
  Criteria:
@@ -1451,40 +1609,42 @@ conciseness: <number>
1451
1609
  Task: {task}
1452
1610
  Expected: {expected}
1453
1611
  Actual: {actual}`;
1454
- function resolveJudgeClient(configModel) {
1455
- const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-4o-mini";
1612
+ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1613
+ const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-5-mini";
1456
1614
  if (model.startsWith("gemini") && process.env.GOOGLE_API_KEY) {
1457
1615
  return {
1458
- client: new OpenAI({
1616
+ client: new OpenAI2({
1459
1617
  apiKey: process.env.GOOGLE_API_KEY,
1460
- baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
1618
+ baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
1619
+ timeout: timeoutMs
1461
1620
  }),
1462
1621
  model
1463
1622
  };
1464
1623
  }
1465
1624
  if (!process.env.OPENAI_API_KEY && process.env.AZURE_OPENAI_API_KEY) {
1466
1625
  return {
1467
- client: new AzureOpenAI({
1626
+ client: new AzureOpenAI2({
1468
1627
  apiKey: process.env.AZURE_OPENAI_API_KEY,
1469
1628
  endpoint: process.env.AZURE_OPENAI_ENDPOINT,
1470
1629
  apiVersion: process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
1471
- deployment: model
1630
+ deployment: model,
1631
+ timeout: timeoutMs
1472
1632
  }),
1473
1633
  model
1474
1634
  };
1475
1635
  }
1476
1636
  const apiKey = process.env.OPENAI_API_KEY;
1477
1637
  if (!apiKey) return void 0;
1478
- return { client: new OpenAI({ apiKey }), model };
1638
+ return { client: new OpenAI2({ apiKey, timeout: timeoutMs }), model };
1479
1639
  }
1480
- function createLlmJudgeScorer(judgeModel) {
1640
+ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1481
1641
  let cached = void 0;
1482
1642
  return async ({ task, result }) => {
1483
1643
  if (task.expected === void 0) {
1484
1644
  return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
1485
1645
  }
1486
1646
  if (cached === void 0) {
1487
- cached = resolveJudgeClient(judgeModel) ?? null;
1647
+ cached = resolveJudgeClient(judgeModel, timeoutMs) ?? null;
1488
1648
  }
1489
1649
  if (!cached) {
1490
1650
  return {
@@ -1499,8 +1659,7 @@ function createLlmJudgeScorer(judgeModel) {
1499
1659
  const response = await client.chat.completions.create({
1500
1660
  model,
1501
1661
  messages: [{ role: "user", content: prompt }],
1502
- temperature: 0,
1503
- max_tokens: 2048
1662
+ max_completion_tokens: 2048
1504
1663
  });
1505
1664
  const content = response.choices[0]?.message?.content?.trim() ?? "";
1506
1665
  const parsed = {};
@@ -1557,10 +1716,10 @@ var staticScorers = {
1557
1716
  "fuzzy-similarity": fuzzySimilarityScorer,
1558
1717
  "tool-usage": toolUsageScorer
1559
1718
  };
1560
- function resolveScorers(names, judgeModel) {
1719
+ function resolveScorers(names, judgeModel, timeoutMs) {
1561
1720
  return names.map((name) => {
1562
1721
  if (name === "llm-judge-correctness") {
1563
- return createLlmJudgeScorer(judgeModel);
1722
+ return createLlmJudgeScorer(judgeModel, timeoutMs);
1564
1723
  }
1565
1724
  const scorer = staticScorers[name];
1566
1725
  if (!scorer) {
@@ -1571,219 +1730,156 @@ function resolveScorers(names, judgeModel) {
1571
1730
  }
1572
1731
 
1573
1732
  // src/runner.ts
1733
+ var DEFAULT_TIMEOUT_MS = 6e4;
1734
+ function withTimeout(run, ms) {
1735
+ return new Promise((resolve, reject) => {
1736
+ const controller = new AbortController();
1737
+ const timer = setTimeout(() => {
1738
+ controller.abort();
1739
+ reject(new Error(`Request timed out after ${ms}ms`));
1740
+ }, ms);
1741
+ run(controller.signal).then(
1742
+ (v) => {
1743
+ clearTimeout(timer);
1744
+ resolve(v);
1745
+ },
1746
+ (e) => {
1747
+ clearTimeout(timer);
1748
+ reject(e);
1749
+ }
1750
+ );
1751
+ });
1752
+ }
1574
1753
  async function runBenchmarks(options) {
1575
1754
  const { providers, tasks, scorers, runs, onResult } = options;
1755
+ const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
1576
1756
  const results = [];
1577
1757
  for (const task of tasks) {
1578
- for (const provider of providers) {
1579
- for (let run = 1; run <= runs; run++) {
1580
- let result;
1581
- try {
1582
- const taskResult = await provider.run({
1583
- prompt: task.prompt,
1584
- schema: task.schema,
1585
- tools: task.tools
1586
- });
1587
- const scores = await Promise.all(
1588
- scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
1589
- );
1590
- result = {
1591
- providerId: provider.id,
1592
- taskName: task.name,
1593
- run,
1594
- scores,
1595
- raw: {
1596
- output: taskResult.output,
1597
- latencyMs: taskResult.latencyMs,
1598
- usage: taskResult.usage,
1599
- toolCalls: taskResult.toolCalls
1600
- }
1601
- };
1602
- } catch (err) {
1603
- const message = err instanceof Error ? err.message : String(err);
1604
- result = {
1605
- providerId: provider.id,
1606
- taskName: task.name,
1607
- run,
1608
- scores: [],
1609
- error: message,
1610
- raw: { output: "", latencyMs: 0 }
1611
- };
1612
- }
1613
- results.push(result);
1614
- onResult?.(result);
1615
- }
1758
+ for (let run = 1; run <= runs; run++) {
1759
+ const runResults = await Promise.all(
1760
+ providers.map(async (provider) => {
1761
+ let result;
1762
+ try {
1763
+ const taskResult = await withTimeout((signal) => provider.run({
1764
+ prompt: task.prompt,
1765
+ schema: task.schema,
1766
+ tools: task.tools,
1767
+ signal
1768
+ }), timeout);
1769
+ const scores = await Promise.all(
1770
+ scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
1771
+ );
1772
+ result = {
1773
+ providerId: provider.id,
1774
+ taskName: task.name,
1775
+ run,
1776
+ scores,
1777
+ raw: {
1778
+ output: taskResult.output,
1779
+ latencyMs: taskResult.latencyMs,
1780
+ usage: taskResult.usage,
1781
+ toolCalls: taskResult.toolCalls
1782
+ }
1783
+ };
1784
+ } catch (err) {
1785
+ const message = err instanceof Error ? err.message : String(err);
1786
+ result = {
1787
+ providerId: provider.id,
1788
+ taskName: task.name,
1789
+ run,
1790
+ scores: [],
1791
+ error: message,
1792
+ raw: { output: "", latencyMs: 0 }
1793
+ };
1794
+ }
1795
+ onResult?.(result);
1796
+ return result;
1797
+ })
1798
+ );
1799
+ results.push(...runResults);
1616
1800
  }
1617
1801
  }
1618
1802
  return results;
1619
1803
  }
1620
1804
 
1621
- // src/reporter/console.ts
1622
- var reset = "\x1B[0m";
1623
- var boldCode = "\x1B[1m";
1624
- var dimCode = "\x1B[2m";
1625
- var green = "\x1B[32m";
1626
- var red = "\x1B[31m";
1627
- var yellow = "\x1B[33m";
1628
- var cyan = "\x1B[36m";
1629
- function bold(s) {
1630
- return `${boldCode}${s}${reset}`;
1631
- }
1632
- function dim(s) {
1633
- return `${dimCode}${s}${reset}`;
1805
+ // src/utils/format.ts
1806
+ var MAX_FRACTION_DIGITS = 100;
1807
+ function formatCost(usd) {
1808
+ if (usd === void 0) return "\u2014";
1809
+ if (usd === 0) return "$0.00";
1810
+ if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
1811
+ const digits = Math.min(
1812
+ MAX_FRACTION_DIGITS,
1813
+ Math.max(4, -Math.floor(Math.log10(Math.abs(usd))) + 1)
1814
+ );
1815
+ return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
1634
1816
  }
1635
- function colorScore(value) {
1636
- const pct = Math.round(value * 100);
1637
- const str = `${pct}%`;
1638
- if (value >= 0.8) return `${green}${str}${reset}`;
1639
- if (value >= 0.5) return `${yellow}${str}${reset}`;
1640
- return `${red}${str}${reset}`;
1817
+ function formatDelta(delta, precision = 4) {
1818
+ const sign = delta >= 0 ? "+" : "";
1819
+ return `${sign}${delta.toFixed(precision)}`;
1641
1820
  }
1642
- function consoleReporter(results) {
1643
- if (results.length === 0) {
1644
- console.log("\nNo results to display.\n");
1645
- return;
1646
- }
1647
- const tasks = [...new Set(results.map((r) => r.taskName))];
1648
- const providers = [...new Set(results.map((r) => r.providerId))];
1649
- const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
1650
- const hasCost = scorerNames.includes("cost");
1651
- const hasErrors = results.some((r) => r.error);
1652
- const runsPerCell = Math.max(...results.map((r) => r.run));
1653
- const runLabel = runsPerCell > 1 ? ` (${runsPerCell} runs each)` : "";
1654
- console.log("");
1655
- console.log(` ${bold(`\u2B21 Agent Duelist Results${runLabel}`)}`);
1656
- console.log(` ${dim("\u2500".repeat(70))}`);
1657
- console.log("");
1658
- for (const task of tasks) {
1659
- console.log(` ${bold(`Task: ${task}`)}`);
1660
- const cols = [{ label: "Provider", width: 22, align: "left" }];
1661
- for (const name of scorerNames) {
1662
- if (name === "latency") cols.push({ label: "Latency", width: 10, align: "right" });
1663
- else if (name === "cost") {
1664
- cols.push({ label: "Cost", width: 12, align: "right" });
1665
- cols.push({ label: "Tokens", width: 9, align: "right" });
1666
- } else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
1667
- else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
1668
- else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
1669
- else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
1670
- else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
1671
- else cols.push({ label: name, width: 10, align: "right" });
1672
- }
1673
- if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
1674
- const totalWidth = cols.reduce((sum, c) => sum + c.width + 2, 0);
1675
- console.log(` ${dim(cols.map((c) => pad(c.label, c.width + 2, c.align)).join(""))}`);
1676
- console.log(` ${dim("\u2500".repeat(totalWidth))}`);
1677
- for (const provider of providers) {
1678
- const taskResults = results.filter(
1679
- (r) => r.taskName === task && r.providerId === provider
1680
- );
1681
- const errorResults2 = taskResults.filter((r) => r.error);
1682
- const successResults = taskResults.filter((r) => !r.error);
1683
- if (successResults.length === 0 && errorResults2.length > 0) {
1684
- const cells2 = [pad(provider, 24, "left")];
1685
- for (const name of scorerNames) {
1686
- if (name === "cost") {
1687
- cells2.push(pad("\u2014", 14, "right"));
1688
- cells2.push(pad("\u2014", 11, "right"));
1689
- } else cells2.push(pad("\u2014", cols.find((c) => c.label !== "Provider").width + 2, "right"));
1690
- }
1691
- if (hasErrors) cells2.push(` ${red}FAIL${reset}`);
1692
- console.log(` ${cells2.join("")}`);
1693
- continue;
1694
- }
1695
- const avgScores = averageScores(successResults);
1696
- const avgDetails = averageDetails(successResults);
1697
- const latencyMs = average(successResults.map((r) => r.raw.latencyMs));
1698
- const cells = [pad(provider, 24, "left")];
1699
- for (const name of scorerNames) {
1700
- if (name === "latency") {
1701
- cells.push(pad(latencyMs !== void 0 ? `${Math.round(latencyMs)}ms` : "\u2014", 12, "right"));
1702
- } else if (name === "cost") {
1703
- cells.push(pad(formatCost(avgDetails.costUsd), 14, "right"));
1704
- cells.push(pad(avgDetails.totalTokens !== void 0 ? `${avgDetails.totalTokens}` : "\u2014", 11, "right"));
1705
- } else {
1706
- const val = avgScores[name];
1707
- if (val === void 0) cells.push(pad("\u2014", 10, "right"));
1708
- else cells.push(pad(colorScore(val), 10 + colorLen(colorScore(val)), "right"));
1709
- }
1710
- }
1711
- if (hasErrors) {
1712
- const failCount = errorResults2.length;
1713
- cells.push(failCount > 0 ? ` ${yellow}${failCount} err${reset}` : ` ${green}OK${reset}`);
1714
- }
1715
- console.log(` ${cells.join("")}`);
1821
+
1822
+ // src/reporter/shared.ts
1823
+ function groupResults(results) {
1824
+ const taskSet = /* @__PURE__ */ new Set();
1825
+ const providerSet = /* @__PURE__ */ new Set();
1826
+ const scorerSet = /* @__PURE__ */ new Set();
1827
+ const grouped = /* @__PURE__ */ new Map();
1828
+ const byProvider = /* @__PURE__ */ new Map();
1829
+ let hasErrors = false;
1830
+ let maxRun = 0;
1831
+ for (const r of results) {
1832
+ taskSet.add(r.taskName);
1833
+ providerSet.add(r.providerId);
1834
+ for (const s of r.scores) scorerSet.add(s.name);
1835
+ if (r.error) hasErrors = true;
1836
+ if (r.run > maxRun) maxRun = r.run;
1837
+ const key = `${r.taskName}::${r.providerId}`;
1838
+ let group = grouped.get(key);
1839
+ if (!group) {
1840
+ group = [];
1841
+ grouped.set(key, group);
1716
1842
  }
1717
- console.log("");
1718
- }
1719
- printSummary(results, providers);
1720
- const errorResults = results.filter((r) => r.error);
1721
- if (errorResults.length > 0) {
1722
- console.log(` ${bold("Errors")}`);
1723
- console.log(` ${dim("\u2500".repeat(70))}`);
1724
- const seen = /* @__PURE__ */ new Set();
1725
- for (const r of errorResults) {
1726
- const key = `${r.providerId}::${r.error}`;
1727
- if (seen.has(key)) continue;
1728
- seen.add(key);
1729
- const count = errorResults.filter((e) => e.providerId === r.providerId && e.error === r.error).length;
1730
- const suffix = count > 1 ? ` (\xD7${count})` : "";
1731
- console.log(` ${red}\u2717${reset} ${r.providerId}: ${r.error}${suffix}`);
1732
- const hint = apiKeyHint(r.providerId, r.error ?? "");
1733
- if (hint) console.log(` ${dim(hint)}`);
1843
+ group.push(r);
1844
+ let provGroup = byProvider.get(r.providerId);
1845
+ if (!provGroup) {
1846
+ provGroup = [];
1847
+ byProvider.set(r.providerId, provGroup);
1734
1848
  }
1735
- console.log("");
1736
- }
1737
- if (hasCost) {
1738
- console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
1739
- console.log("");
1849
+ provGroup.push(r);
1740
1850
  }
1851
+ return {
1852
+ tasks: [...taskSet],
1853
+ providers: [...providerSet],
1854
+ scorerNames: [...scorerSet],
1855
+ grouped,
1856
+ byProvider,
1857
+ hasErrors,
1858
+ maxRun
1859
+ };
1741
1860
  }
1742
- function printSummary(results, providers) {
1743
- const successResults = results.filter((r) => !r.error);
1744
- if (successResults.length === 0) return;
1745
- console.log(` ${dim("\u2500".repeat(70))}`);
1746
- console.log(` ${bold("Summary")}`);
1747
- console.log("");
1748
- const single = providers.length === 1;
1749
- const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
1750
- const byCorrectness = rankProviders(successResults, providers, correctnessKey);
1751
- if (byCorrectness) {
1752
- const label = single ? "Avg correctness" : `Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))}`;
1753
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${colorScore(byCorrectness.avg)})`);
1754
- }
1755
- const byLatency = providers.map((id) => {
1756
- const runs = successResults.filter((r) => r.providerId === id);
1757
- const avg = average(runs.map((r) => r.raw.latencyMs));
1758
- return { id, avg: avg ?? Infinity };
1759
- }).sort((a, b) => a.avg - b.avg)[0];
1760
- if (byLatency && byLatency.avg !== Infinity) {
1761
- const label = single ? "Avg latency" : `Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}`;
1762
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${Math.round(byLatency.avg)}ms)`);
1763
- }
1764
- const byCost = providers.map((id) => {
1765
- const runs = successResults.filter((r) => r.providerId === id);
1766
- const costs = runs.map((r) => {
1767
- const s = r.scores.find((s2) => s2.name === "cost");
1768
- return s && s.value >= 0 ? s.value : void 0;
1769
- }).filter((c) => c !== void 0);
1770
- const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
1771
- return { id, avg };
1772
- }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
1773
- if (byCost?.avg !== void 0) {
1774
- const label = single ? "Avg cost" : `Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}`;
1775
- console.log(` ${cyan}\u25C6${reset} ${label} (avg ${formatCost(byCost.avg)})`);
1861
+ function aggregateProviderTask(providerId, grouped, task) {
1862
+ const taskResults = grouped.get(`${task}::${providerId}`) ?? [];
1863
+ const errorResults = taskResults.filter((r) => r.error);
1864
+ const successResults = taskResults.filter((r) => !r.error);
1865
+ if (successResults.length === 0) {
1866
+ return {
1867
+ providerId,
1868
+ avgScores: {},
1869
+ avgDetails: { costUsd: void 0, totalTokens: void 0 },
1870
+ latencyMs: void 0,
1871
+ allErrors: errorResults.length > 0,
1872
+ errorCount: errorResults.length
1873
+ };
1776
1874
  }
1777
- console.log("");
1778
- }
1779
- function rankProviders(results, providers, scorerName) {
1780
- const ranked = providers.map((id) => {
1781
- const runs = results.filter((r) => r.providerId === id);
1782
- const scores = runs.flatMap((r) => r.scores.filter((s) => s.name === scorerName && s.value >= 0)).map((s) => s.value);
1783
- const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
1784
- return { id, avg };
1785
- }).filter((p) => p.avg !== void 0).sort((a, b) => b.avg - a.avg);
1786
- return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
1875
+ return {
1876
+ providerId,
1877
+ avgScores: averageScores(successResults),
1878
+ avgDetails: averageDetails(successResults),
1879
+ latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
1880
+ allErrors: false,
1881
+ errorCount: errorResults.length
1882
+ };
1787
1883
  }
1788
1884
  function averageScores(results) {
1789
1885
  const sums = {};
@@ -1827,38 +1923,89 @@ function average(nums) {
1827
1923
  if (nums.length === 0) return void 0;
1828
1924
  return nums.reduce((a, b) => a + b, 0) / nums.length;
1829
1925
  }
1830
- function formatCost(usd) {
1831
- if (usd === void 0) return "\u2014";
1832
- if (usd === 0) return "$0.00";
1833
- if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
1834
- const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
1835
- return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
1926
+ function computeColumnStats(providerData, scorerNames) {
1927
+ const stats = /* @__PURE__ */ new Map();
1928
+ const valid = providerData.filter((p) => !p.allErrors);
1929
+ if (scorerNames.includes("latency")) {
1930
+ const values = /* @__PURE__ */ new Map();
1931
+ for (const p of providerData) {
1932
+ values.set(p.providerId, p.allErrors ? void 0 : p.latencyMs);
1933
+ }
1934
+ const nums = valid.map((p) => p.latencyMs).filter((v) => v !== void 0);
1935
+ stats.set("latency", {
1936
+ values,
1937
+ best: nums.length > 0 ? Math.min(...nums) : void 0,
1938
+ worst: nums.length > 0 ? Math.max(...nums) : void 0
1939
+ });
1940
+ }
1941
+ if (scorerNames.includes("cost")) {
1942
+ const costValues = /* @__PURE__ */ new Map();
1943
+ const tokenValues = /* @__PURE__ */ new Map();
1944
+ for (const p of providerData) {
1945
+ costValues.set(p.providerId, p.allErrors ? void 0 : p.avgDetails.costUsd);
1946
+ tokenValues.set(p.providerId, p.allErrors ? void 0 : p.avgDetails.totalTokens);
1947
+ }
1948
+ const costNums = valid.map((p) => p.avgDetails.costUsd).filter((v) => v !== void 0);
1949
+ const tokenNums = valid.map((p) => p.avgDetails.totalTokens).filter((v) => v !== void 0);
1950
+ stats.set("cost", {
1951
+ values: costValues,
1952
+ best: costNums.length > 0 ? Math.min(...costNums) : void 0,
1953
+ worst: costNums.length > 0 ? Math.max(...costNums) : void 0
1954
+ });
1955
+ stats.set("tokens", {
1956
+ values: tokenValues,
1957
+ best: tokenNums.length > 0 ? Math.min(...tokenNums) : void 0,
1958
+ worst: tokenNums.length > 0 ? Math.max(...tokenNums) : void 0
1959
+ });
1960
+ }
1961
+ for (const name of scorerNames) {
1962
+ if (name === "latency" || name === "cost") continue;
1963
+ const values = /* @__PURE__ */ new Map();
1964
+ for (const p of providerData) {
1965
+ values.set(p.providerId, p.allErrors ? void 0 : p.avgScores[name]);
1966
+ }
1967
+ const nums = valid.map((p) => p.avgScores[name]).filter((v) => v !== void 0);
1968
+ stats.set(name, {
1969
+ values,
1970
+ best: nums.length > 0 ? Math.max(...nums) : void 0,
1971
+ worst: nums.length > 0 ? Math.min(...nums) : void 0
1972
+ });
1973
+ }
1974
+ return stats;
1836
1975
  }
1837
- function pad(str, width, align) {
1838
- if (align === "right") return str.padStart(width);
1839
- return str.padEnd(width);
1840
- }
1841
- function colorLen(str) {
1842
- const stripped = str.replace(/\x1b\[[0-9;]*m/g, "");
1843
- return str.length - stripped.length;
1844
- }
1845
- function apiKeyHint(providerId, error) {
1846
- const lower = error.toLowerCase();
1847
- const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
1848
- if (!isAuthError) return void 0;
1849
- const prefix = providerId.split("/")[0];
1850
- switch (prefix) {
1851
- case "openai":
1852
- return "Set: export OPENAI_API_KEY=sk-...";
1853
- case "azure":
1854
- return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
1855
- case "anthropic":
1856
- return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
1857
- case "google":
1858
- return "Set: export GOOGLE_API_KEY=...";
1859
- default:
1860
- return `Check the API key for ${providerId}`;
1861
- }
1976
+ function computeMedals(columnStats, providerIds) {
1977
+ const medals = /* @__PURE__ */ new Map();
1978
+ if (providerIds.length < 2) {
1979
+ for (const id of providerIds) medals.set(id, "none");
1980
+ return medals;
1981
+ }
1982
+ const wins = /* @__PURE__ */ new Map();
1983
+ for (const id of providerIds) wins.set(id, 0);
1984
+ for (const [, colStats] of columnStats) {
1985
+ if (colStats.best === void 0) continue;
1986
+ const bestProviders = [...colStats.values.entries()].filter(([, v]) => v !== void 0 && v === colStats.best);
1987
+ if (bestProviders.length === 1) {
1988
+ wins.set(bestProviders[0][0], (wins.get(bestProviders[0][0]) ?? 0) + 1);
1989
+ }
1990
+ }
1991
+ const totalWins = [...wins.values()].reduce((a, b) => a + b, 0);
1992
+ if (totalWins === 0) {
1993
+ for (const id of providerIds) medals.set(id, "none");
1994
+ return medals;
1995
+ }
1996
+ const sorted = [...wins.entries()].sort(
1997
+ (a, b) => b[1] - a[1] || a[0].localeCompare(b[0])
1998
+ );
1999
+ const medalList = ["gold", "silver", "bronze"];
2000
+ let rank = 0;
2001
+ for (let i = 0; i < sorted.length; i++) {
2002
+ if (i > 0 && sorted[i][1] < sorted[i - 1][1]) {
2003
+ rank = i;
2004
+ }
2005
+ const hasWins = sorted[i][1] > 0;
2006
+ medals.set(sorted[i][0], hasWins && rank < medalList.length ? medalList[rank] : "none");
2007
+ }
2008
+ return medals;
1862
2009
  }
1863
2010
  function providerLabel(providerId) {
1864
2011
  const prefix = providerId.split("/")[0];
@@ -1911,6 +2058,369 @@ function providerLabel(providerId) {
1911
2058
  return `(${prefix})`;
1912
2059
  }
1913
2060
  }
2061
/**
 * Suggests how to configure credentials when an error message looks like an
 * authentication failure.
 *
 * The message is matched case-insensitively against a few auth-related
 * phrases and the HTTP status 401. The status is only recognised as a
 * standalone number, so unrelated figures that merely contain the digits
 * (for example "14012 tokens") do not trigger a hint.
 *
 * @param providerId Provider id of the form "<prefix>/<model>"; the prefix
 *   selects which environment variable is suggested.
 * @param error Error message to inspect; a missing value is treated as empty.
 * @returns A one-line hint, or undefined when the error does not look
 *   auth-related.
 */
function apiKeyHint(providerId, error) {
  const lower = String(error ?? "").toLowerCase();
  const isAuthError =
    lower.includes("api key") ||
    lower.includes("apikey") ||
    lower.includes("unauthorized") ||
    lower.includes("authentication") ||
    // 401 must not be part of a longer digit run.
    /(?<!\d)401(?!\d)/.test(lower);
  if (!isAuthError) return void 0;
  const prefix = providerId.split("/")[0];
  switch (prefix) {
    case "openai":
      return "Set: export OPENAI_API_KEY=sk-...";
    case "azure":
      return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
    case "anthropic":
      return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
    case "google":
      return "Set: export GOOGLE_API_KEY=...";
    default:
      return `Check the API key for ${providerId}`;
  }
}
2079
/**
 * Finds the provider with the highest average value for one scorer.
 *
 * Only score entries whose name equals `scorerName` and whose value is
 * non-negative contribute to a provider's average. Providers with no such
 * entries are ignored. When averages tie, the provider listed first wins.
 *
 * @param successByProvider Map from provider id to its successful runs; each
 *   run exposes a `scores` array of `{ name, value }`.
 * @param providers Provider ids, in priority order for ties.
 * @param scorerName Name of the scorer to rank by.
 * @returns `{ id, avg }` for the leader, or undefined when nobody has a score.
 */
function rankProviders(successByProvider, providers, scorerName) {
  let leader;
  for (const id of providers) {
    const values = [];
    for (const run of successByProvider.get(id) ?? []) {
      for (const score of run.scores) {
        if (score.name === scorerName && score.value >= 0) values.push(score.value);
      }
    }
    if (values.length === 0) continue;
    let sum = 0;
    for (const value of values) sum += value;
    const avg = sum / values.length;
    // Strict comparison keeps the earliest provider on ties.
    if (leader === void 0 || avg > leader.avg) leader = { id, avg };
  }
  return leader;
}
2088
/**
 * Maps a built-in scorer name to the short column heading used in reports.
 *
 * @param name Scorer name.
 * @returns The short label, or `name` itself when no short label is defined.
 */
function scorerLabel(name) {
  const shortLabels = new Map([
    ["correctness", "Match"],
    ["schema-correctness", "Schema"],
    ["fuzzy-similarity", "Fuzzy"],
    ["llm-judge-correctness", "Judge"],
    ["tool-usage", "Tool"],
  ]);
  return shortLabels.has(name) ? shortLabels.get(name) : name;
}
2104
/**
 * Converts a medal name into its emoji.
 *
 * @param medal One of "gold", "silver", "bronze" or "none".
 * @returns The medal emoji, an empty string for "none", and undefined for
 *   any other value.
 */
function medalEmoji(medal) {
  const emojiByMedal = new Map([
    ["gold", "\u{1F947}"],
    ["silver", "\u{1F948}"],
    ["bronze", "\u{1F949}"],
    ["none", ""],
  ]);
  return emojiByMedal.get(medal);
}
2116
+
2117
+ // src/reporter/console.ts
2118
// ANSI SGR escape sequences used to style console output. Each styled span is
// expected to be closed with `reset`.
var reset = "\x1B[0m";
// Text attributes.
var boldCode = "\x1B[1m";
var dimCode = "\x1B[2m";
// Standard foreground colours.
var green = "\x1B[32m";
var red = "\x1B[31m";
var yellow = "\x1B[33m";
var cyan = "\x1B[36m";
// Bright foreground colours.
var brightGreen = "\x1B[92m";
var brightWhite = "\x1B[97m";
2127
/**
 * Wraps text in the ANSI bold sequence followed by a reset.
 *
 * @param s Text to render in bold.
 * @returns The styled string.
 */
function bold(s) {
  return boldCode.concat(s, reset);
}
2130
/**
 * Wraps text in the ANSI dim sequence followed by a reset.
 *
 * @param s Text to render dimmed.
 * @returns The styled string.
 */
function dim(s) {
  return dimCode.concat(s, reset);
}
2133
/**
 * Removes ANSI SGR sequences (ESC `[` ... `m`) from a string.
 *
 * @param s Possibly styled text.
 * @returns The text without colour/attribute escape sequences.
 */
function stripAnsi(s) {
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  const plain = s.replace(sgrSequence, "");
  return plain;
}
2136
/**
 * Estimates how many terminal columns a string occupies.
 *
 * ANSI styling is stripped first. Code points at or above 126976 and those
 * in the range 9728..10175 are counted as two columns; every other code
 * point counts as one.
 *
 * @param s Text, possibly containing ANSI styling.
 * @returns Estimated column count.
 */
function displayWidth(s) {
  const isDoubleWidth = (codePoint) =>
    codePoint >= 126976 || (codePoint >= 9728 && codePoint <= 10175);
  // Array.from iterates by code point, so surrogate pairs count once.
  return Array.from(stripAnsi(s)).reduce(
    (columns, ch) => columns + (isDoubleWidth(ch.codePointAt(0) ?? 0) ? 2 : 1),
    0
  );
}
2147
/**
 * Pads a cell with spaces up to a target display width.
 *
 * @param str Cell content (may contain ANSI styling).
 * @param targetWidth Desired width in terminal columns.
 * @param align "right" pads on the left; any other value pads on the right.
 * @returns The padded string; content wider than the target is returned
 *   unchanged.
 */
function padCell(str, targetWidth, align) {
  const gap = " ".repeat(Math.max(0, targetWidth - displayWidth(str)));
  return align === "right" ? gap + str : str + gap;
}
2153
/**
 * Builds the two halves of a fixed-width bar for a 0..1 ratio.
 *
 * The ratio is clamped to [0, 1]. A ratio that is not a number is treated
 * as 0 so the bar always spans exactly `width` characters; previously NaN
 * produced an empty fill and an empty track.
 *
 * @param ratio Fraction of the bar to fill.
 * @param width Total bar length in characters (default 8).
 * @returns `{ fill, track }` where `fill` is the filled part and `track`
 *   the remainder.
 */
function sparkBar(ratio, width = 8) {
  const numeric = Number(ratio);
  const clamped = Number.isNaN(numeric) ? 0 : Math.max(0, Math.min(1, numeric));
  const fillLen = Math.round(clamped * width);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(width - fillLen);
  return { fill, track };
}
2160
/**
 * Draws one horizontal border of the results table, dimmed.
 *
 * "bottom" and "merge" produce a single unbroken rule spanning all columns;
 * "top" produces a rule with column junctions; any other position produces
 * the separator with crossing junctions.
 *
 * @param widths Content width of each column (each cell adds one space of
 *   padding on both sides).
 * @param position "top", "bottom", "merge", or anything else for a separator.
 * @returns The styled border line.
 */
function drawTableLine(widths, position) {
  const rule = (length) => "\u2500".repeat(length);
  if (position === "bottom" || position === "merge") {
    // Padded column widths plus one column for each inner divider.
    const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
    return position === "bottom"
      ? dim(`\u2514${rule(totalInner)}\u2518`)
      : dim(`\u251C${rule(totalInner)}\u2524`);
  }
  const segments = widths.map((w) => rule(w + 2));
  return position === "top"
    ? dim(`\u250C${segments.join("\u252C")}\u2510`)
    : dim(`\u251C${segments.join("\u253C")}\u2524`);
}
2174
/**
 * Renders one table row with dimmed vertical dividers.
 *
 * @param cells Cell contents, one per column.
 * @param widths Content width of each column.
 * @param aligns Alignment of each column, passed through to padCell.
 * @returns The row as a single string.
 */
function drawTableRow(cells, widths, aligns) {
  const divider = dim("\u2502");
  const padded = cells.map((cell, i) => " " + padCell(cell, widths[i], aligns[i]) + " ");
  return divider + padded.join(divider) + divider;
}
2180
/**
 * Renders a row whose single cell spans every column of the table.
 *
 * @param content Text to show (may contain ANSI styling).
 * @param widths Content width of each table column.
 * @returns The row, padded on the right to the table's inner width.
 */
function drawSpanRow(content, widths) {
  const edge = dim("\u2502");
  const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
  // One column is taken by the leading space before the content.
  const filler = " ".repeat(Math.max(0, totalInner - displayWidth(content) - 1));
  return edge + " " + content + filler + edge;
}
2186
/**
 * Colours a cell according to where its value sits within its column.
 *
 * @param text Already formatted cell text.
 * @param value Numeric value behind the text; undefined renders a dimmed dash.
 * @param colStats Column statistics exposing `best` and `worst`.
 * @param providerCount Number of providers being compared.
 * @returns Bright green bold for the best value, red for the worst, yellow
 *   for anything in between; plain text when there is nothing to compare
 *   (fewer than two providers, missing stats, or best equal to worst).
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === void 0) return dim("\u2014");
  if (providerCount < 2) return text;
  const { best, worst } = colStats;
  const nothingToCompare = best === void 0 || worst === void 0 || best === worst;
  if (nothingToCompare) return text;
  switch (value) {
    case best:
      return `${brightGreen}${boldCode}${text}${reset}`;
    case worst:
      return `${red}${text}${reset}`;
    default:
      return `${yellow}${text}${reset}`;
  }
}
2195
/**
 * Prints a benchmark report to stdout.
 *
 * For every task a table is printed with one row per provider and one column
 * per scorer ("cost" expands into Cost and Tokens columns; a Status column is
 * added when any result failed). After the tables come the cross-task
 * summary, a de-duplicated error list and, when the cost scorer is active,
 * a note on where cost estimates come from.
 *
 * @param results Flat list of run results; nothing but a placeholder message
 *   is printed when it is empty.
 * @param options Optional settings; `sparklines` (default true) toggles the
 *   bar drawn next to percentage scores.
 */
function consoleReporter(results, options) {
  const showSparklines = options?.sparklines ?? true;
  if (results.length === 0) {
    console.log("\nNo results to display.\n");
    return;
  }
  const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runsPerCell = maxRun;
  const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
  console.log("");
  console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");
  for (const task of tasks) {
    console.log(` ${bold(`Task: ${task}`)}`);
    console.log("");
    const providerData = providers.map(
      (providerId) => aggregateProviderTask(providerId, grouped, task)
    );
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    // Leave room for the medal prefix; clamp so the table stays readable.
    const maxProviderLen = Math.max(...providers.map((id) => id.length));
    const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
    const cols = [
      { label: "Provider", width: providerWidth, align: "left" }
    ];
    for (const name of scorerNames) {
      if (name === "latency") {
        cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
      } else if (name === "cost") {
        cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
        cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
      } else {
        cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
      }
    }
    if (hasErrors) {
      cols.push({ label: "Status", width: 8, align: "left" });
    }
    const widths = cols.map((c) => c.width);
    const aligns = cols.map((c) => c.align);
    console.log(` ${drawTableLine(widths, "top")}`);
    const headerCells = cols.map((c) => bold(c.label));
    console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
    console.log(` ${drawTableLine(widths, "header")}`);
    for (const pd of providerData) {
      const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
      const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
      const cells = [providerCell];
      if (pd.allErrors) {
        // Every run failed: show dashes everywhere except the status column.
        for (const col of cols.slice(1)) {
          if (col.label === "Status") {
            cells.push(`${red}FAIL${reset}`);
          } else {
            cells.push(dim("\u2014"));
          }
        }
      } else {
        for (const col of cols.slice(1)) {
          if (col.label === "Status") {
            cells.push(
              pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
            );
            continue;
          }
          const statsKey = col.statsKey;
          const colStats = columnStats.get(statsKey);
          if (statsKey === "latency") {
            const ms = pd.latencyMs;
            if (ms === void 0) {
              cells.push(dim("\u2014"));
            } else {
              const text = `${Math.round(ms)}ms`;
              cells.push(colStats ? colorByRank(text, ms, colStats, providers.length) : text);
            }
          } else if (statsKey === "cost") {
            const cost = pd.avgDetails.costUsd;
            if (cost === void 0) {
              cells.push(dim("\u2014"));
            } else {
              const text = formatCost(cost);
              cells.push(colStats ? colorByRank(text, cost, colStats, providers.length) : text);
            }
          } else if (statsKey === "tokens") {
            const tokens = pd.avgDetails.totalTokens;
            if (tokens === void 0) {
              cells.push(dim("\u2014"));
            } else {
              const text = `${tokens}`;
              cells.push(colStats ? colorByRank(text, tokens, colStats, providers.length) : text);
            }
          } else {
            const val = pd.avgScores[statsKey];
            if (val === void 0) {
              cells.push(dim("\u2014"));
            } else {
              const pctStr = `${Math.round(val * 100)}%`.padStart(4);
              let coloredPct;
              if (multi && colStats) {
                // Several providers: colour relative to the others.
                coloredPct = colorByRank(pctStr, val, colStats, providers.length);
              } else {
                // Single provider: colour by absolute thresholds.
                if (val >= 0.8) coloredPct = `${green}${pctStr}${reset}`;
                else if (val >= 0.5) coloredPct = `${yellow}${pctStr}${reset}`;
                else coloredPct = `${red}${pctStr}${reset}`;
              }
              if (showSparklines) {
                const { fill, track } = sparkBar(val);
                const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
                cells.push(`${coloredPct} ${barColor}${fill}${reset}${dim(track)}`);
              } else {
                cells.push(coloredPct);
              }
            }
          }
        }
      }
      console.log(` ${drawTableRow(cells, widths, aligns)}`);
    }
    if (multi && providerData.some((p) => !p.allErrors)) {
      const winnerId = [...medals.entries()].find(([, m]) => m === "gold")?.[0];
      if (winnerId) {
        console.log(` ${drawTableLine(widths, "merge")}`);
        const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
        console.log(` ${drawSpanRow(winnerText, widths)}`);
      }
    }
    console.log(` ${drawTableLine(widths, "bottom")}`);
    console.log("");
  }
  printSummary(results, providers, byProvider);
  const errorResults = results.filter((r) => r.error);
  if (errorResults.length > 0) {
    console.log(` ${bold("Errors")}`);
    console.log(` ${dim("\u2501".repeat(72))}`);
    // Count identical provider/error pairs in one pass. The Map keeps
    // first-seen order, so output order is unchanged.
    const uniqueErrors = new Map();
    for (const r of errorResults) {
      const key = `${r.providerId}::${r.error}`;
      const entry = uniqueErrors.get(key);
      if (entry) {
        entry.count++;
      } else {
        uniqueErrors.set(key, { providerId: r.providerId, error: r.error, count: 1 });
      }
    }
    for (const { providerId, error, count } of uniqueErrors.values()) {
      const suffix = count > 1 ? ` (\xD7${count})` : "";
      console.log(` ${red}\u2716${reset} ${providerId}: ${error}${suffix}`);
      const hint = apiKeyHint(providerId, error ?? "");
      if (hint) console.log(` ${dim(hint)}`);
    }
    console.log("");
  }
  if (hasCost) {
    console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
    console.log("");
  }
}
2349
/**
 * Prints the cross-task summary: best correctness, lowest latency, lowest
 * cost and, with several providers, the overall winner by category count.
 *
 * Only successful runs are considered; nothing is printed when every run
 * failed. Correctness uses the "llm-judge-correctness" scorer when any
 * successful run has a non-negative judge score, otherwise "correctness".
 * With a single provider the lines show averages instead of winners.
 *
 * @param results All run results.
 * @param providers Provider ids in display order.
 * @param byProvider Map from provider id to that provider's results.
 */
function printSummary(results, providers, byProvider) {
  const successResults = results.filter((r) => !r.error);
  if (successResults.length === 0) return;
  const successByProvider = new Map();
  for (const id of providers) {
    successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
  }
  console.log(` ${bold("Summary")}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");

  const single = providers.length === 1;
  // The same leading marker is used on every summary line.
  const marker = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";

  const judgeScored = successResults.some(
    (r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
  );
  const correctnessKey = judgeScored ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
  if (byCorrectness) {
    const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
    if (single) {
      console.log(` ${marker} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
    } else {
      console.log(` ${marker} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
    }
  }

  // Providers without successful runs sort last via Infinity.
  const latencyRanking = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const avg = average(runs.map((r) => r.raw.latencyMs));
    return { id, avg: avg ?? Infinity };
  });
  latencyRanking.sort((a, b) => a.avg - b.avg);
  const byLatency = latencyRanking[0];
  const hasLatencyWinner = Boolean(byLatency && byLatency.avg !== Infinity);
  if (hasLatencyWinner) {
    const msStr = `${Math.round(byLatency.avg)}ms`;
    if (single) {
      console.log(` ${marker} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
    } else {
      console.log(` ${marker} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
    }
  }

  const costRanking = [];
  for (const id of providers) {
    const runs = successByProvider.get(id) ?? [];
    const costs = [];
    for (const r of runs) {
      const s = r.scores.find((s2) => s2.name === "cost");
      // Negative values are not counted as usable costs.
      if (s && s.value >= 0) costs.push(s.value);
    }
    if (costs.length === 0) continue;
    costRanking.push({ id, avg: costs.reduce((a, b) => a + b, 0) / costs.length });
  }
  costRanking.sort((a, b) => a.avg - b.avg);
  const byCost = costRanking[0];
  if (byCost?.avg !== void 0) {
    const costStr = formatCost(byCost.avg);
    if (single) {
      console.log(` ${marker} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
    } else {
      console.log(` ${marker} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
    }
  }

  if (!single) {
    const wins = new Map();
    for (const id of providers) wins.set(id, 0);
    const credit = (id) => wins.set(id, (wins.get(id) ?? 0) + 1);
    if (byCorrectness) credit(byCorrectness.id);
    if (hasLatencyWinner) credit(byLatency.id);
    if (byCost?.avg !== void 0) credit(byCost.id);
    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
      console.log("");
      if (topProviders.length === 1) {
        const [winnerId, winCount] = topProviders[0];
        console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
      } else {
        const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
        console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
      }
    }
  }
  console.log("");
}
1914
2424
 
1915
2425
  // src/reporter/json.ts
1916
2426
  function jsonReporter(results) {
@@ -1945,7 +2455,7 @@ function defineArena(config) {
1945
2455
  throw new Error("At least one task is required");
1946
2456
  }
1947
2457
  const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
1948
- const scorerFns = resolveScorers(scorerNames, config.judgeModel);
2458
+ const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
1949
2459
  const runs = config.runs ?? 1;
1950
2460
  return {
1951
2461
  config,
@@ -1955,141 +2465,13 @@ function defineArena(config) {
1955
2465
  tasks: config.tasks,
1956
2466
  scorers: scorerFns,
1957
2467
  runs,
2468
+ timeout: config.timeout,
1958
2469
  onResult: options?.onResult
1959
2470
  });
1960
2471
  }
1961
2472
  };
1962
2473
  }
1963
2474
 
1964
- // src/providers/openai.ts
1965
- import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
1966
- import { zodToJsonSchema } from "zod-to-json-schema";
1967
- function openai(model, options) {
1968
- const client = new OpenAI2({
1969
- apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
1970
- baseURL: options?.baseURL
1971
- });
1972
- return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
1973
- }
1974
- function openaiCompatible(options) {
1975
- const apiKey = options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0) ?? "no-key";
1976
- const client = new OpenAI2({
1977
- apiKey,
1978
- baseURL: options.baseURL
1979
- });
1980
- if (options.free) {
1981
- registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
1982
- }
1983
- return makeProvider(options.id, options.name, options.model, client, options.model, options.stripThinking);
1984
- }
1985
- function azureOpenai(model, options) {
1986
- const deployment = options?.deployment ?? model;
1987
- const client = new AzureOpenAI2({
1988
- apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
1989
- endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
1990
- apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
1991
- deployment
1992
- });
1993
- return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
1994
- }
1995
- function makeProvider(id, name, model, client, requestModel, stripThinking) {
1996
- return {
1997
- id,
1998
- name,
1999
- model,
2000
- async run(input) {
2001
- const start = Date.now();
2002
- const params = {
2003
- model: requestModel,
2004
- messages: [{ role: "user", content: input.prompt }]
2005
- };
2006
- if (input.schema) {
2007
- params.response_format = { type: "json_object" };
2008
- params.messages = [
2009
- { role: "system", content: "Respond with valid JSON matching the requested schema." },
2010
- ...params.messages
2011
- ];
2012
- }
2013
- if (input.tools?.length) {
2014
- params.tools = input.tools.map(toolDefToOpenAI);
2015
- params.tool_choice = "auto";
2016
- }
2017
- const response = await client.chat.completions.create(params);
2018
- let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
2019
- let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
2020
- const choice = response.choices[0];
2021
- const toolCallsRaw = choice?.message?.tool_calls;
2022
- const collectedToolCalls = [];
2023
- let finalResponse = response;
2024
- if (toolCallsRaw?.length && input.tools?.length) {
2025
- const toolMessages = [
2026
- ...params.messages,
2027
- choice.message
2028
- ];
2029
- for (const tc of toolCallsRaw) {
2030
- const toolDef = input.tools.find((t) => t.name === tc.function.name);
2031
- let args;
2032
- try {
2033
- args = JSON.parse(tc.function.arguments);
2034
- } catch {
2035
- args = tc.function.arguments;
2036
- }
2037
- let result;
2038
- if (toolDef?.handler) {
2039
- result = await toolDef.handler(args);
2040
- }
2041
- collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
2042
- toolMessages.push({
2043
- role: "tool",
2044
- tool_call_id: tc.id,
2045
- content: JSON.stringify(result ?? {})
2046
- });
2047
- }
2048
- const followUp = await client.chat.completions.create({
2049
- model: requestModel,
2050
- messages: toolMessages
2051
- });
2052
- totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
2053
- totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
2054
- finalResponse = followUp;
2055
- }
2056
- const latencyMs = Date.now() - start;
2057
- const finalChoice = finalResponse.choices[0];
2058
- let rawContent = finalChoice?.message?.content ?? "";
2059
- if (stripThinking) {
2060
- rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
2061
- }
2062
- let output = rawContent;
2063
- if (input.schema) {
2064
- try {
2065
- output = JSON.parse(rawContent);
2066
- } catch {
2067
- }
2068
- }
2069
- return {
2070
- output,
2071
- usage: {
2072
- promptTokens: totalPromptTokens || void 0,
2073
- completionTokens: totalCompletionTokens || void 0
2074
- },
2075
- latencyMs,
2076
- raw: finalResponse,
2077
- toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
2078
- };
2079
- }
2080
- };
2081
- }
2082
- function toolDefToOpenAI(tool) {
2083
- return {
2084
- type: "function",
2085
- function: {
2086
- name: tool.name,
2087
- description: tool.description,
2088
- parameters: zodToJsonSchema(tool.parameters, { target: "openAi" })
2089
- }
2090
- };
2091
- }
2092
-
2093
2475
  // src/providers/anthropic.ts
2094
2476
  import Anthropic from "@anthropic-ai/sdk";
2095
2477
  function anthropic(model, options) {
@@ -2103,23 +2485,17 @@ function anthropic(model, options) {
2103
2485
  model,
2104
2486
  async run(input) {
2105
2487
  const start = Date.now();
2106
- const systemMessage = input.schema ? "Respond with valid JSON matching the requested schema." : void 0;
2488
+ const systemMessage = input.schema ? SCHEMA_SYSTEM_MESSAGE : void 0;
2107
2489
  const response = await client.messages.create({
2108
2490
  model,
2109
2491
  max_tokens: maxTokens,
2110
2492
  system: systemMessage,
2111
2493
  messages: [{ role: "user", content: input.prompt }]
2112
- });
2494
+ }, { signal: input.signal });
2113
2495
  const latencyMs = Date.now() - start;
2114
2496
  const textBlock = response.content.find((b) => b.type === "text");
2115
2497
  const rawContent = textBlock?.type === "text" ? textBlock.text : "";
2116
- let output = rawContent;
2117
- if (input.schema) {
2118
- try {
2119
- output = JSON.parse(rawContent);
2120
- } catch {
2121
- }
2122
- }
2498
+ const output = parseSchemaOutput(rawContent, !!input.schema);
2123
2499
  return {
2124
2500
  output,
2125
2501
  usage: {
@@ -2133,30 +2509,1024 @@ function anthropic(model, options) {
2133
2509
  };
2134
2510
  }
2135
2511
 
2136
- // src/providers/gemini.ts
2137
- import OpenAI3 from "openai";
2138
- function gemini(model, options) {
2139
- const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
2140
- if (!apiKey) {
2141
- throw new Error(
2142
- `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
2143
- );
2512
+ // src/reporter/markdown.ts
2513
// Hidden HTML comment emitted as the first line of every markdown report.
var COMMENT_MARKER = "<!-- duelist-ci-report -->";

/**
 * Renders a CI report as a markdown document.
 *
 * Sections appear in this order, each only when it has content: comparison
 * table, cost summary (when money was spent or a budget is set), flaky
 * results, failure reasons. A footer is always appended.
 *
 * @param report CI report exposing `failed`, `comparisons`, `cost`,
 *   `flakyResults` and `failureReasons`.
 * @param _current Unused.
 * @returns The markdown text, lines joined with "\n".
 */
function markdownReporter(report, _current) {
  const status = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${status}`, ""];

  if (report.comparisons.length > 0) {
    out.push(markdownComparisonTable(report.comparisons), "");
  }

  if (report.cost.totalUsd > 0 || report.cost.budget !== void 0) {
    out.push(markdownCostSummary(report.cost), "");
  }

  if (report.flakyResults.length > 0) {
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      "",
      ...report.flakyResults.map(
        (f) => `- **${f.providerId}** \xD7 ${f.taskName} \u2192 ${f.scorerName} (CV = ${f.current.cv.toFixed(2)})`
      ),
      ""
    );
  }

  if (report.failureReasons.length > 0) {
    out.push(
      "### Failure Reasons",
      "",
      ...report.failureReasons.map((reason) => `- ${reason}`),
      ""
    );
  }

  out.push(
    "---",
    "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*"
  );
  return out.join("\n");
}
2549
/**
 * Renders baseline-vs-current comparisons as a GitHub-flavoured markdown
 * table.
 *
 * Provider, task and scorer names are free text, so pipe characters are
 * backslash-escaped and line breaks are replaced by spaces; otherwise a
 * name containing either would split or terminate its table row.
 *
 * @param comparisons Comparison entries exposing `providerId`, `taskName`,
 *   `scorerName`, `baseline` (may be absent), `current` and `delta` (null
 *   when there is nothing to compare against).
 * @returns The table, lines joined with "\n".
 */
function markdownComparisonTable(comparisons) {
  const cell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const lines = [];
  lines.push("| Provider | Task | Scorer | Baseline | Current | Delta | Status |");
  lines.push("|----------|------|--------|----------|---------|-------|--------|");
  for (const c of comparisons) {
    const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
    const currentStr = formatStats(c.current);
    const deltaStr = c.delta !== null ? formatDelta(c.delta, 3) : "\u2014";
    const status = statusIndicator(c);
    lines.push(`| ${cell(c.providerId)} | ${cell(c.taskName)} | ${cell(c.scorerName)} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
  }
  return lines.join("\n");
}
2562
/**
 * Renders the cost section of the markdown report.
 *
 * Always shows the total. Adds a budget line when a budget is defined (the
 * percentage is shown as an infinity sign for a non-positive budget) and a
 * per-provider table when more than one provider has a cost entry.
 *
 * @param cost Cost summary exposing `totalUsd`, optional `budget`,
 *   `overBudget` and `perProvider` (Map of provider id to USD).
 * @returns The section, lines joined with "\n".
 */
function markdownCostSummary(cost) {
  const out = [
    "### \u{1F4B0} Cost Summary",
    "",
    `**Total:** $${cost.totalUsd.toFixed(4)}`,
  ];

  if (cost.budget !== void 0) {
    const used = cost.budget > 0 ? (cost.totalUsd / cost.budget * 100).toFixed(0) : "\u221E";
    const light = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    out.push(`**Budget:** $${cost.budget.toFixed(2)} (${used}% used) ${light}`);
  }

  if (cost.perProvider.size > 1) {
    out.push("", "| Provider | Cost |", "|----------|------|");
    cost.perProvider.forEach((usd, provider) => {
      out.push(`| ${provider} | $${usd.toFixed(4)} |`);
    });
  }

  return out.join("\n");
}
2582
/**
 * Formats summary statistics for a table cell.
 *
 * @param stats Object exposing `n`, `mean`, `ci95Lower` and `ci95Upper`.
 * @returns The mean with three decimals; when `n` is greater than one the
 *   half-width of the 95% interval is appended after a plus-minus sign.
 */
function formatStats(stats) {
  const mean = stats.mean.toFixed(3);
  if (!(stats.n > 1)) return mean;
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${mean} \xB1 ${halfWidth.toFixed(3)}`;
}
2589
/**
 * Picks the status label for one comparison row.
 *
 * Checks are made in priority order: regressed, improved, no baseline
 * ("new"); anything else is reported as unchanged.
 *
 * @param c Comparison exposing `regressed`, `improved` and `baseline`.
 * @returns Emoji-prefixed status text.
 */
function statusIndicator(c) {
  const rules = [
    [c.regressed, "\u{1F534} regressed"],
    [c.improved, "\u{1F7E2} improved"],
    [c.baseline === null, "\u{1F195} new"],
  ];
  const match = rules.find(([applies]) => applies);
  return match ? match[1] : "\u26AA unchanged";
}
2595
+
2596
+ // src/reporter/html.ts
2597
/**
 * Escapes the five HTML-significant characters so text can be placed in
 * element content or in a quoted attribute value.
 *
 * @param s Raw text.
 * @returns Text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function esc(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  return s.replace(/[&<>"']/g, (ch) => entities[ch]);
}
2600
/**
 * Builds the HTML benchmark report as a single document string.
 *
 * Computes per-task table data (aggregates, column statistics, medals and,
 * with two or more providers, the gold-medal winner), the cross-task leaders
 * for correctness, latency and cost, an overall winner when one provider
 * leads the most categories outright, and a de-duplicated error list. The
 * pieces are then rendered through the render* helpers into one template.
 *
 * @param results Flat list of run results.
 * @returns The HTML document; the empty-report page when `results` is empty.
 */
function htmlReporter(results) {
  if (results.length === 0) {
    return emptyReport();
  }
  const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";
  // One entry per task with everything its table needs.
  const taskSections = tasks.map((task) => {
    const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    const winnerId = multi ? [...medals.entries()].find(([, m]) => m === "gold")?.[0] : void 0;
    return { task, providerData, columnStats, medals, winnerId };
  });
  // Summary figures are based on successful runs only.
  const successResults = results.filter((r) => !r.error);
  const successByProvider = /* @__PURE__ */ new Map();
  for (const id of providers) {
    successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
  }
  // Use the judge scorer when any successful run has a non-negative judge score.
  const correctnessKey = successResults.some(
    (r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
  ) ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
  // Lowest average latency first; a missing average sorts last via Infinity.
  const byLatency = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const avg = average(runs.map((r) => r.raw.latencyMs));
    return { id, avg: avg ?? Infinity };
  }).sort((a, b) => a.avg - b.avg)[0];
  // Lowest average cost first; negative cost values and providers without
  // any usable cost are left out.
  const byCost = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const costs = runs.map((r) => {
      const s = r.scores.find((s2) => s2.name === "cost");
      return s && s.value >= 0 ? s.value : void 0;
    }).filter((c) => c !== void 0);
    const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
    return { id, avg };
  }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
  // Overall winner: the single provider leading the most categories; stays
  // undefined on a tie or when no category has a leader.
  let overallWinner;
  if (multi) {
    const wins = /* @__PURE__ */ new Map();
    for (const id of providers) wins.set(id, 0);
    if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
    if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
    if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
      if (tops.length === 1) overallWinner = tops[0][0];
    }
  }
  const errorResults = results.filter((r) => r.error);
  const deduped = dedupeErrors(errorResults);
  // Provider ids are escaped before being placed in the meta attribute.
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
<meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${providers.length !== 1 ? "s" : ""}, ${tasks.length} task${tasks.length !== 1 ? "s" : ""}">
<meta property="og:title" content="Agent Duelist Report">
<meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
<meta property="og:type" content="website">
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">

${renderHeader(runsLabel, providers.length, tasks.length)}

${tasks.length > 1 ? renderTabs(tasks) : ""}

<main>
${taskSections.map((s, i) => renderTaskSection(
    s.task,
    s.providerData,
    s.columnStats,
    s.medals,
    s.winnerId,
    scorerNames,
    hasCost,
    multi,
    i
  )).join("\n")}
</main>

${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}

${deduped.length > 0 ? renderErrors(deduped) : ""}

${renderFooter()}

</div>
${renderScript(tasks.length)}
</body>
</html>`;
}
2698
/**
 * Builds the HTML page shown when there are no results: the shared style,
 * a header reporting zero runs, providers and tasks, a placeholder message
 * and the footer.
 *
 * @returns The HTML document as a string.
 */
function emptyReport() {
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">
${renderHeader("0 runs", 0, 0)}
<main><p class="empty-msg">No results to display.</p></main>
${renderFooter()}
</div>
</body>
</html>`;
}
2717
/**
 * Collapses repeated errors into one entry per provider/message pair.
 *
 * @param errorResults Results that carry an `error`.
 * @returns One `{ providerId, error, count, hint }` object per distinct
 *   pair, in first-seen order; `hint` comes from apiKeyHint and may be
 *   undefined.
 */
function dedupeErrors(errorResults) {
  const buckets = new Map();
  for (const { providerId, error } of errorResults) {
    const key = `${providerId}::${error}`;
    const bucket = buckets.get(key);
    if (bucket) {
      bucket.count += 1;
      continue;
    }
    buckets.set(key, {
      providerId,
      error: error ?? "Unknown error",
      count: 1,
      hint: apiKeyHint(providerId, error ?? ""),
    });
  }
  return Array.from(buckets.values());
}
2735
/**
 * Return the report's inline stylesheet wrapped in a `<style>` element.
 *
 * Notes on two non-obvious rules:
 * - `.results-table` uses `border-collapse: separate` with zero spacing,
 *   because `border-radius` is not applied to a table whose borders are
 *   collapsed, so the rounded corners would otherwise never render.
 * - The decorative background animations are disabled for users who request
 *   reduced motion.
 *
 * @returns {string} A `<style>…</style>` block.
 */
function renderStyle() {
  return `<style>
:root {
  --bg: #0f172a;
  --bg-deep: #020617;
  --panel: rgba(15, 23, 42, 0.85);
  --accent: #f59e0b;
  --accent-soft: rgba(245, 158, 11, 0.15);
  --text: #e2e8f0;
  --muted: #94a3b8;
  --border: rgba(148, 163, 184, 0.15);
  --green: #22c55e;
  --red: #ef4444;
  --yellow: #eab308;
  --radius: 12px;
  --mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
  --sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
html, body {
  font-family: var(--sans);
  background: var(--bg);
  color: var(--text);
  min-height: 100vh;
}
body { padding: 24px; display: flex; justify-content: center; }

/* Animated gradient mesh */
.bg-mesh {
  position: fixed; inset: 0; z-index: 0;
  overflow: hidden; pointer-events: none;
}
.bg-mesh::before, .bg-mesh::after {
  content: ""; position: absolute; border-radius: 50%;
  filter: blur(120px); opacity: 0.4;
}
.bg-mesh::before {
  width: 600px; height: 600px;
  background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
  top: -10%; left: -5%;
  animation: meshDrift1 18s ease-in-out infinite alternate;
}
.bg-mesh::after {
  width: 500px; height: 500px;
  background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
  bottom: -10%; right: -5%;
  animation: meshDrift2 22s ease-in-out infinite alternate;
}
.bg-mesh-extra {
  position: absolute; width: 400px; height: 400px;
  border-radius: 50%; filter: blur(100px); opacity: 0.3;
  background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
  top: 50%; left: 60%;
  animation: meshDrift3 15s ease-in-out infinite alternate;
}
@keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
@keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
@keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }

/* Report container */
.report {
  position: relative; z-index: 1;
  width: 100%; max-width: 960px;
}

/* Header */
.report-header {
  display: flex; justify-content: space-between; align-items: center;
  padding: 20px 0; margin-bottom: 8px;
}
.report-brand {
  display: flex; align-items: center; gap: 10px;
  text-decoration: none; color: var(--muted);
  font-weight: 600; font-size: 14px;
  letter-spacing: 0.04em; text-transform: uppercase;
}
.report-brand:hover { color: var(--text); }
.brand-icon {
  width: 32px; height: 32px; border-radius: 8px;
  background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
  border: 1px solid rgba(245,158,11,0.3);
  display: flex; align-items: center; justify-content: center;
  font-size: 16px;
}
.report-meta {
  font-size: 12px; color: var(--muted);
  text-align: right; line-height: 1.6;
}

/* Task tabs */
.task-tabs {
  display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
}
.task-tab {
  padding: 6px 16px; border-radius: 999px;
  border: 1px solid var(--border);
  background: transparent; color: var(--muted);
  font-size: 13px; font-weight: 500; cursor: pointer;
  transition: all 150ms ease;
}
.task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
.task-tab.active {
  background: var(--accent-soft);
  border-color: rgba(245,158,11,0.4);
  color: var(--accent);
}

/* Task sections */
.task-section { display: none; }
.task-section.active { display: block; }
.task-name {
  font-size: 18px; font-weight: 600;
  margin-bottom: 12px; letter-spacing: -0.01em;
}

/* Results table */
.results-table {
  width: 100%; border-collapse: separate; border-spacing: 0;
  font-size: 13px; margin-bottom: 16px;
  border-radius: var(--radius); overflow: hidden;
  border: 1px solid var(--border);
}
.results-table th, .results-table td {
  padding: 10px 14px;
  text-align: left;
  border-bottom: 1px solid var(--border);
}
.results-table th {
  background: rgba(0,0,0,0.3);
  font-size: 11px; font-weight: 600;
  text-transform: uppercase; letter-spacing: 0.05em;
  color: var(--muted); cursor: pointer;
  user-select: none; white-space: nowrap;
}
.results-table th:hover { color: var(--text); }
.results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
.results-table tbody tr {
  background: var(--panel);
  transition: background 120ms ease;
}
.results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
.results-table tbody tr:last-child td { border-bottom: none; }

/* Score cell with progress bar */
.score-cell { position: relative; min-width: 90px; }
.score-bar {
  position: absolute; left: 0; bottom: 0;
  height: 3px; border-radius: 2px;
  transition: width 300ms ease;
}
.score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }

/* Color ranking */
.rank-best { color: var(--green); font-weight: 600; }
.rank-worst { color: var(--red); }
.rank-mid { color: var(--yellow); }
.rank-neutral { color: var(--text); }
.rank-error { color: var(--muted); }

/* Winner banner */
.task-winner {
  display: flex; align-items: center; gap: 10px;
  padding: 12px 18px; margin-bottom: 20px;
  border-radius: var(--radius);
  background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
  border: 1px solid rgba(34,197,94,0.2);
  font-size: 14px; font-weight: 500;
}
.task-winner .trophy { font-size: 20px; }
.task-winner .winner-name { color: var(--green); font-weight: 600; }
.task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }

/* Summary cards */
.summary-section { margin-top: 32px; }
.summary-title {
  font-size: 16px; font-weight: 600;
  margin-bottom: 12px; color: var(--text);
}
.summary-cards {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
  gap: 12px;
}
.summary-card {
  padding: 16px; border-radius: var(--radius);
  border: 1px solid var(--border);
  background: var(--panel);
}
.summary-card .card-label {
  font-size: 11px; font-weight: 600;
  text-transform: uppercase; letter-spacing: 0.05em;
  color: var(--muted); margin-bottom: 6px;
}
.summary-card .card-value {
  font-size: 20px; font-weight: 700;
  color: var(--green); font-family: var(--mono);
}
.summary-card .card-provider {
  font-size: 12px; color: var(--muted); margin-top: 4px;
}

/* Errors */
.errors-section { margin-top: 24px; }
.errors-title {
  font-size: 16px; font-weight: 600;
  margin-bottom: 8px; color: var(--red);
  cursor: pointer;
}
.errors-list {
  border-radius: var(--radius);
  border: 1px solid rgba(239,68,68,0.2);
  background: rgba(239,68,68,0.04);
  overflow: hidden;
}
.error-item {
  padding: 10px 16px;
  border-bottom: 1px solid rgba(239,68,68,0.1);
  font-size: 13px;
}
.error-item:last-child { border-bottom: none; }
.error-provider { font-weight: 600; color: var(--text); }
.error-msg { color: var(--muted); margin-left: 8px; }
.error-count { color: var(--muted); font-size: 11px; }
.error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }

/* Footer */
.report-footer {
  margin-top: 40px; padding: 20px 0;
  border-top: 1px solid var(--border);
  display: flex; justify-content: space-between; align-items: center;
  flex-wrap: wrap; gap: 12px;
}
.footer-brand {
  font-size: 13px; color: var(--muted);
}
.footer-brand a {
  color: var(--accent); text-decoration: none; font-weight: 500;
}
.footer-brand a:hover { text-decoration: underline; }
.footer-cta {
  display: inline-flex; align-items: center; gap: 6px;
  padding: 6px 14px; border-radius: 8px;
  background: var(--accent-soft);
  border: 1px solid rgba(245,158,11,0.3);
  color: var(--accent); font-size: 12px; font-weight: 500;
  text-decoration: none;
  transition: transform 120ms ease, box-shadow 120ms ease;
}
.footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }

/* Empty state */
.empty-msg {
  text-align: center; color: var(--muted);
  padding: 60px 20px; font-size: 16px;
}

/* Responsive */
@media (max-width: 640px) {
  body { padding: 12px; }
  .report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
  .report-meta { text-align: left; }
  .summary-cards { grid-template-columns: 1fr; }
  .results-table { font-size: 12px; }
  .results-table th, .results-table td { padding: 8px 10px; }
  .report-footer { flex-direction: column; align-items: flex-start; }
}

/* Respect the user's motion preference for the decorative background */
@media (prefers-reduced-motion: reduce) {
  .bg-mesh::before, .bg-mesh::after, .bg-mesh-extra { animation: none; }
}
</style>`;
}
3003
/**
 * Render the report header: brand link plus a small metadata block.
 *
 * @param {string} runsLabel Pre-formatted run count (for example "3 runs");
 *   passed through `esc` before output.
 * @param {number} providerCount Number of providers; pluralised when not 1.
 * @param {number} taskCount Number of tasks; pluralised when not 1.
 * @returns {string} The `<header>` markup, stamped with the current UTC time
 *   to second precision.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  const pluralSuffix = (count) => (count !== 1 ? "s" : "");
  // "2024-01-02T03:04:05.678Z" -> "2024-01-02 03:04:05 UTC"
  const isoStamp = new Date().toISOString();
  const now = `${isoStamp.replace("T", " ").slice(0, 19)} UTC`;
  const providersText = `${providerCount} provider${pluralSuffix(providerCount)}`;
  const tasksText = `${taskCount} task${pluralSuffix(taskCount)}`;
  return `<header class="report-header">
<a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">
<div class="brand-icon">&#x2B21;</div>
<span>Agent Duelist</span>
</a>
<div class="report-meta">
${providersText} &middot;
${tasksText} &middot;
${esc(runsLabel)}<br>
${esc(now)}
</div>
</header>`;
}
3018
/**
 * Render one tab button per task; the first tab starts out active.
 *
 * @param {string[]} tasks Task names, escaped with `esc` before output.
 * @returns {string} A `<nav class="task-tabs">` element whose buttons carry
 *   the task's position in `data-task`.
 */
function renderTabs(tasks) {
  const buttonMarkup = [];
  tasks.forEach((taskName, position) => {
    const activeClass = position === 0 ? " active" : "";
    buttonMarkup.push(
      `<button class="task-tab${activeClass}" data-task="${position}">${esc(taskName)}</button>`
    );
  });
  const buttons = buttonMarkup.join("\n ");
  return `<nav class="task-tabs">
${buttons}
</nav>`;
}
3026
/**
 * Render the section for a single task: heading, results table and, when a
 * winner is known, the winner banner.
 *
 * @param {string} task Task name (escaped before output).
 * @param {Array<object>} providerData One entry per provider; `providerId`
 *   and `allErrors` are read here, the rest is handed to `renderDataCell`.
 * @param {Map<string, object>} columnStats Per-column stats, forwarded to
 *   `renderDataCell`.
 * @param {Map<string, string>} medals Medal kind per provider id; providers
 *   without an entry are treated as "none".
 * @param {string|undefined|null} winnerId Winning provider id, if any.
 * @param {Iterable<string>} scorerNames Scorers in column order. "latency"
 *   yields one column, "cost" yields Cost and Tokens, anything else a score
 *   column labelled via `scorerLabel`.
 * @param {boolean} _hasCost Unused.
 * @param {boolean} multi Forwarded to `renderDataCell`.
 * @param {number} index Section position; section 0 is initially visible.
 * @returns {string} The `<section>` markup.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  const plainColumn = (label, key) => ({ label, key, isScore: false });
  const cols = [plainColumn("Provider", "provider")];
  for (const name of scorerNames) {
    switch (name) {
      case "latency":
        cols.push(plainColumn("Latency", "latency"));
        break;
      case "cost":
        cols.push(plainColumn("Cost", "cost"), plainColumn("Tokens", "tokens"));
        break;
      default:
        cols.push({ label: scorerLabel(name), key: name, isScore: true });
    }
  }
  // Everything after the provider-name column is a data column.
  const dataCols = cols.slice(1);

  const ths = cols
    .map(({ key, label }) => `<th data-col="${esc(key)}">${esc(label)}<span class="sort-arrow"></span></th>`)
    .join("");

  const rows = providerData
    .map((pd) => {
      const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
      const nameCell = `<td>${medal ? `${medal} ` : ""}${esc(pd.providerId)}</td>`;
      // A provider whose runs all failed gets a dash in every data column.
      const dataCells = dataCols.map((col) =>
        pd.allErrors
          ? `<td class="rank-error">&mdash;</td>`
          : renderDataCell(col.key, col.isScore, pd, columnStats, multi)
      );
      return `<tr>${[nameCell, ...dataCells].join("")}</tr>`;
    })
    .join("\n");

  let winnerHtml = "";
  if (winnerId) {
    winnerHtml = `<div class="task-winner">
<span class="trophy">&#x1F3C6;</span>
<span>Winner: <span class="winner-name">${esc(winnerId)}</span>
<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>
</div>`;
  }

  const activeClass = index === 0 ? " active" : "";
  return `<section class="task-section${activeClass}" data-task-idx="${index}">
<h2 class="task-name">${esc(task)}</h2>
<table class="results-table">
<thead><tr>${ths}</tr></thead>
<tbody>${rows}</tbody>
</table>
${winnerHtml}
</section>`;
}
3073
/**
 * Render a single `<td>` for one provider and one column.
 *
 * Column keys "latency", "cost" and "tokens" are plain numeric cells; any
 * other key is looked up in `pd.avgScores` and drawn as a percentage with a
 * coloured bar. A missing value renders as a muted dash.
 *
 * @param {string} key Column key.
 * @param {boolean} _isScore Unused; the key alone decides the rendering.
 * @param {object} pd Provider data; reads `latencyMs`, `avgDetails.costUsd`,
 *   `avgDetails.totalTokens` and `avgScores[key]`.
 * @param {Map<string, object>} columnStats Stats per column key, used for
 *   relative ranking when `multi` is true.
 * @param {boolean} multi When true and stats exist for the column, the cell
 *   colour comes from `rankClass_`. Otherwise numeric cells are neutral and
 *   score cells use fixed 0.8 / 0.5 thresholds.
 * @returns {string} The `<td>` markup.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const EMPTY_CELL = `<td class="rank-error">&mdash;</td>`;
  const colStats = columnStats.get(key);
  const relativeRank = (value) =>
    multi && colStats ? rankClass_(value, colStats) : "rank-neutral";

  switch (key) {
    case "latency": {
      const ms = pd.latencyMs;
      if (ms === undefined) return EMPTY_CELL;
      const rankClass = relativeRank(ms);
      return `<td class="${rankClass}" data-sort-val="${ms}">${Math.round(ms)}ms</td>`;
    }
    case "cost": {
      const cost = pd.avgDetails.costUsd;
      if (cost === undefined) return EMPTY_CELL;
      const rankClass = relativeRank(cost);
      return `<td class="${rankClass}" data-sort-val="${cost}">${esc(formatCost(cost))}</td>`;
    }
    case "tokens": {
      const tokens = pd.avgDetails.totalTokens;
      if (tokens === undefined) return EMPTY_CELL;
      const rankClass = relativeRank(tokens);
      return `<td class="${rankClass}" data-sort-val="${tokens}">${tokens}</td>`;
    }
    default:
      break;
  }

  // Generic scorer column: score is shown as a percentage.
  const val = pd.avgScores[key];
  if (val === undefined) return EMPTY_CELL;
  const pct = Math.round(val * 100);

  // Absolute banding, used for the bar colour and as the single-provider rank.
  const band = val >= 0.8 ? "best" : val >= 0.5 ? "mid" : "worst";
  const barColors = { best: "var(--green)", mid: "var(--yellow)", worst: "var(--red)" };
  const rankCls = multi && colStats ? rankClass_(val, colStats) : `rank-${band}`;
  const barColor = barColors[band];
  return `<td class="score-cell ${rankCls}" data-sort-val="${val}">
<span class="score-val">${pct}%</span>
<div class="score-bar" style="width:${pct}%;background:${barColor}"></div>
</td>`;
}
3108
/**
 * Map a value to a ranking CSS class relative to its column's extremes.
 *
 * @param {number} value The cell's value.
 * @param {{best?: number, worst?: number}} colStats Column extremes.
 * @returns {string} "rank-neutral" when either extreme is missing or both are
 *   equal (nothing to rank), "rank-best"/"rank-worst" on an exact match with
 *   the corresponding extreme, otherwise "rank-mid".
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  const rankable = best !== undefined && worst !== undefined && best !== worst;
  if (!rankable) {
    return "rank-neutral";
  }
  switch (value) {
    case best:
      return "rank-best";
    case worst:
      return "rank-worst";
    default:
      return "rank-mid";
  }
}
3115
/**
 * Render the "Summary" card grid.
 *
 * @param {{id: string, avg: number}|null|undefined} byCorrectness Leader by
 *   correctness; `avg` is shown as a rounded percentage.
 * @param {{id: string, avg: number}|null|undefined} byLatency Leader by
 *   latency in ms; skipped when `avg` is `Infinity`.
 * @param {{id: string, avg?: number}|null|undefined} byCost Leader by cost;
 *   skipped when `avg` is undefined.
 * @param {string|null|undefined} overallWinner Winning provider id, if any.
 * @param {boolean} multi True when comparing several providers: cards then
 *   use comparative labels and name the provider.
 * @returns {string} The summary `<section>`, or "" when no card applies.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  const providerLine = (id) =>
    `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  const card = (label, value, provider) => `<div class="summary-card">
<div class="card-label">${label}</div>
<div class="card-value">${value}</div>
${provider}
</div>`;

  const cards = [];
  if (byCorrectness) {
    const pct = `${Math.round(byCorrectness.avg * 100)}%`;
    const provider = multi ? providerLine(byCorrectness.id) : "";
    cards.push(card(multi ? "Most Correct" : "Avg Correctness", pct, provider));
  }
  if (byLatency && byLatency.avg !== Infinity) {
    const ms = `${Math.round(byLatency.avg)}ms`;
    const provider = multi ? providerLine(byLatency.id) : "";
    cards.push(card(multi ? "Fastest" : "Avg Latency", ms, provider));
  }
  if (byCost?.avg !== undefined) {
    const cost = esc(formatCost(byCost.avg));
    const provider = multi ? providerLine(byCost.id) : "";
    cards.push(card(multi ? "Cheapest" : "Avg Cost", cost, provider));
  }
  if (overallWinner) {
    // The winner card always names the provider, regardless of `multi`.
    cards.push(card("Overall Winner", "&#x1F3C6;", providerLine(overallWinner)));
  }

  if (cards.length === 0) {
    return "";
  }
  return `<section class="summary-section">
<h2 class="summary-title">Summary</h2>
<div class="summary-cards">
${cards.join("\n ")}
</div>
</section>`;
}
3159
/**
 * Render the collapsible "Errors" section.
 *
 * Clicking the title toggles the list between hidden and shown. The toggle
 * previously assigned 'block' in both branches, so the list could never be
 * collapsed; it now alternates between 'none' and 'block'.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   De-duplicated errors. A repeat counter is shown when `count` exceeds 1
 *   and the hint line only when `hint` is truthy.
 * @returns {string} The errors `<section>` markup.
 */
function renderErrors(errors) {
  const items = errors
    .map((e) => {
      const suffix = e.count > 1 ? ` <span class="error-count">(&times;${e.count})</span>` : "";
      const hint = e.hint ? `<div class="error-hint">${esc(e.hint)}</div>` : "";
      return `<div class="error-item">
<span class="error-provider">${esc(e.providerId)}:</span>
<span class="error-msg">${esc(e.error)}</span>${suffix}
${hint}
</div>`;
    })
    .join("\n");
  // Inline handler: the list starts with an empty inline display value, so
  // the first click hides it and the next click shows it again.
  const toggle =
    "this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'";
  return `<section class="errors-section">
<h2 class="errors-title" onclick="${toggle}">Errors</h2>
<div class="errors-list">
${items}
</div>
</section>`;
}
3176
/**
 * Render the report footer: a "Powered by" credit and a call-to-action link,
 * both pointing at the project's GitHub repository.
 *
 * @returns {string} The `<footer>` markup.
 */
function renderFooter() {
  const repoUrl = "https://github.com/DataGobes/agent-duelist";
  const linkAttrs = `href="${repoUrl}" target="_blank" rel="noopener"`;
  return `<footer class="report-footer">
<div class="footer-brand">
Powered by <a ${linkAttrs}>Agent Duelist</a>
</div>
<a class="footer-cta" ${linkAttrs}>
&#x2B50; Star on GitHub
</a>
</footer>`;
}
3186
/**
 * Return the inline `<script>` that powers the report's interactivity:
 * task-tab switching (only emitted when there is more than one task) and
 * click-to-sort on every results table.
 *
 * The sort handler reads the clicked header's own `cellIndex`. It previously
 * used the `forEach` index over every `th` in the document, which only
 * matches the column position for the first table; sorting any later table
 * looked up a non-existent cell and threw.
 *
 * @param {number} taskCount Number of tasks in the report.
 * @returns {string} A `<script>…</script>` block.
 */
function renderScript(taskCount) {
  const tabSwitching =
    taskCount > 1
      ? `
  var tabs = document.querySelectorAll('.task-tab');
  var sections = document.querySelectorAll('.task-section');
  tabs.forEach(function(tab) {
    tab.addEventListener('click', function() {
      var idx = parseInt(tab.getAttribute('data-task'), 10);
      tabs.forEach(function(t) { t.classList.remove('active'); });
      sections.forEach(function(s) { s.classList.remove('active'); });
      tab.classList.add('active');
      sections[idx].classList.add('active');
    });
  });`
      : "";
  return `<script>
(function() {
  /* Tab switching */
  ${tabSwitching}

  /* Column sorting */
  document.querySelectorAll('.results-table th').forEach(function(th) {
    var table = th.closest('table');
    var asc = true;
    th.addEventListener('click', function() {
      /* Column position within this table's own header row */
      var colIdx = th.cellIndex;
      var tbody = table.querySelector('tbody');
      var rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort(function(a, b) {
        var aCell = a.children[colIdx];
        var bCell = b.children[colIdx];
        var aVal = aCell.getAttribute('data-sort-val');
        var bVal = bCell.getAttribute('data-sort-val');
        if (aVal !== null && bVal !== null) {
          return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
        }
        var aText = aCell.textContent || '';
        var bText = bCell.textContent || '';
        return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
      });
      rows.forEach(function(row) { tbody.appendChild(row); });

      /* Update sort arrows */
      table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
      th.querySelector('.sort-arrow').textContent = asc ? ' \\u25B2' : ' \\u25BC';
      asc = !asc;
    });
  });
})();
</script>`;
}
3233
+
3234
+ // src/ci.ts
3235
+ import { readFileSync, writeFileSync, mkdirSync } from "fs";
3236
+ import { dirname } from "path";
3237
// Scorers for which a smaller value is the better outcome.
var LOWER_IS_BETTER = /* @__PURE__ */ new Set(["cost"]);
// A coefficient of variation above this marks a result as flaky.
var FLAKY_CV_THRESHOLD = 0.3;
// Two-tailed 95% Student's t critical values, keyed by degrees of freedom.
var T_CRITICAL_95 = {
  1: 12.706,
  2: 4.303,
  3: 3.182,
  4: 2.776,
  5: 2.571,
  6: 2.447,
  7: 2.365,
  8: 2.306,
  9: 2.262,
  10: 2.228,
  15: 2.131,
  20: 2.086,
  25: 2.06,
  30: 2.042
};
// Tabulated degrees of freedom in ascending numeric order.
var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);

/**
 * Look up the 95% t critical value for the given degrees of freedom.
 *
 * Tabulated values are returned directly; values between two table entries
 * are linearly interpolated. Anything outside the table (df <= 0, below the
 * smallest entry, above the largest entry, or NaN) falls back to 1.96.
 *
 * @param {number} df Degrees of freedom.
 * @returns {number} The critical value.
 */
function tCritical(df) {
  const NORMAL_APPROX = 1.96;
  if (df <= 0) {
    return NORMAL_APPROX;
  }
  const tabulated = T_CRITICAL_95[df];
  if (tabulated !== undefined) {
    return tabulated;
  }
  const keys = T_CRITICAL_KEYS;
  if (df > keys[keys.length - 1]) {
    return NORMAL_APPROX;
  }
  // First table entry above df; the entry just before it is the lower bound.
  const upperIdx = keys.findIndex((k) => k > df);
  if (upperIdx <= 0) {
    return NORMAL_APPROX;
  }
  const low = keys[upperIdx - 1];
  const high = keys[upperIdx];
  const ratio = (df - low) / (high - low);
  return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
}
3270
/**
 * Summarise a list of score samples.
 *
 * @param {number[]} samples Raw score values.
 * @returns {{mean: number, stddev: number, cv: number, n: number, ci95Lower: number, ci95Upper: number}}
 *   Mean, sample standard deviation (n - 1 denominator), coefficient of
 *   variation (0 when the mean is 0) and a confidence interval built from
 *   `tCritical(n - 1)`. With no samples everything is 0; with one sample the
 *   interval collapses onto the mean.
 */
function computeScorerStats(samples) {
  const n = samples.length;
  if (n === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  let total = 0;
  for (const sample of samples) {
    total += sample;
  }
  const mean = total / n;
  if (n === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  let squaredDeviations = 0;
  for (const sample of samples) {
    squaredDeviations += (sample - mean) ** 2;
  }
  const stddev = Math.sqrt(squaredDeviations / (n - 1));
  const cv = mean === 0 ? 0 : stddev / Math.abs(mean);
  // Half-width of the interval: critical value times the standard error.
  const standardError = stddev / Math.sqrt(n);
  const criticalValue = tCritical(n - 1);
  return {
    mean,
    stddev,
    cv,
    n,
    ci95Lower: mean - criticalValue * standardError,
    ci95Upper: mean + criticalValue * standardError,
  };
}
3293
/**
 * Build the composite map key identifying one provider/task/scorer series.
 *
 * @param {string} providerId
 * @param {string} taskName
 * @param {string} scorerName
 * @returns {string} The three parts joined with "::".
 */
function groupKey(providerId, taskName, scorerName) {
  const SEPARATOR = "::";
  // String.prototype.concat stringifies each part the way a template would.
  return "".concat(providerId, SEPARATOR, taskName, SEPARATOR, scorerName);
}
3296
/**
 * Aggregate raw benchmark results into per-series statistics.
 *
 * Results with an `error` are ignored, as are individual scores with a
 * negative value. Remaining score values are grouped by
 * provider/task/scorer and each group is summarised by `computeScorerStats`.
 *
 * @param {Array<{providerId: string, taskName: string, error?: *, scores: Array<{name: string, value: number}>}>} results
 * @returns {Map<string, object>} Stats keyed by `groupKey(provider, task, scorer)`,
 *   in first-seen order.
 */
function computeStats(results) {
  const samplesByKey = new Map();
  for (const result of results) {
    if (result.error) continue;
    for (const { name, value } of result.scores) {
      if (value < 0) continue;
      const key = groupKey(result.providerId, result.taskName, name);
      const bucket = samplesByKey.get(key);
      if (bucket === undefined) {
        samplesByKey.set(key, [value]);
      } else {
        bucket.push(value);
      }
    }
  }

  const stats = new Map();
  samplesByKey.forEach((samples, key) => {
    stats.set(key, computeScorerStats(samples));
  });
  return stats;
}
3313
/**
 * Total the estimated spend of a benchmark run.
 *
 * Only successful results with a non-negative "cost" score contribute, and
 * only when that score's `details.estimatedUsd` is greater than zero.
 *
 * @param {Array<object>} results Benchmark results.
 * @param {number} [budget] Optional spending cap in USD.
 * @returns {{totalUsd: number, perProvider: Map<string, number>, budget: (number|undefined), overBudget: boolean}}
 *   `overBudget` is true only when a budget was given and the total exceeds it.
 */
function computeCostSummary(results, budget) {
  const perProvider = new Map();
  let totalUsd = 0;

  for (const { error, scores, providerId } of results) {
    if (error) continue;
    const costScore = scores.find(({ name }) => name === "cost");
    if (!costScore || costScore.value < 0) continue;
    const usd = costScore.details?.estimatedUsd ?? 0;
    if (usd <= 0) continue;
    totalUsd += usd;
    const spentSoFar = perProvider.get(providerId) ?? 0;
    perProvider.set(providerId, spentSoFar + usd);
  }

  const overBudget = budget !== undefined && totalUsd > budget;
  return { totalUsd, perProvider, budget, overBudget };
}
3333
/**
 * Split a `provider::task::scorer` key back into its parts.
 *
 * Task names are free-form and may themselves contain "::". When a key has
 * more than three segments, the first is taken as the provider, the last as
 * the scorer and everything in between is re-joined as the task name.
 * Assumes provider ids and scorer names never contain "::".
 */
function parseGroupKey(key) {
  const parts = key.split("::");
  if (parts.length <= 3) {
    const [providerId, taskName, scorerName] = parts;
    return { providerId, taskName, scorerName };
  }
  return {
    providerId: parts[0],
    taskName: parts.slice(1, -1).join("::"),
    scorerName: parts[parts.length - 1],
  };
}

/**
 * Compare the current run's statistics against a baseline.
 *
 * @param {Map<string, object>|null|undefined} baselineStats Baseline stats
 *   keyed by `provider::task::scorer`; series without a baseline entry are
 *   reported with a null baseline and delta.
 * @param {Map<string, object>} currentStats Stats for the current run.
 * @param {Map<string, number>} thresholds Per-scorer tolerance. Regression
 *   and improvement are only evaluated for scorers that have an entry.
 * @param {number} [budget] Optional spending cap in USD.
 * @param {Array<object>} [currentResults] Raw results, used for the cost
 *   summary; treated as empty when omitted.
 * @returns {{comparisons: object[], cost: object, failed: boolean, flakyResults: object[], failureReasons: string[]}}
 *   `failed` is true when any series regressed or the budget was exceeded.
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  const comparisons = [];
  const failureReasons = [];

  for (const [key, current] of currentStats) {
    const { providerId, taskName, scorerName } = parseGroupKey(key);
    const baseline = baselineStats?.get(key) ?? null;
    let delta = null;
    let regressed = false;
    let improved = false;
    if (baseline) {
      delta = current.mean - baseline.mean;
      const threshold = thresholds.get(scorerName);
      if (threshold !== undefined) {
        const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
        regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
        improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
      }
    }
    // High run-to-run variation is only meaningful with more than one sample.
    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
    comparisons.push({
      providerId,
      taskName,
      scorerName,
      baseline,
      current,
      delta,
      regressed,
      improved,
      flaky,
    });
  }

  const cost = computeCostSummary(currentResults ?? [], budget);

  for (const r of comparisons) {
    if (!r.regressed) continue;
    failureReasons.push(
      `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta(r.delta)}`
    );
  }
  if (cost.overBudget) {
    failureReasons.push(
      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
    );
  }

  const flakyResults = comparisons.filter((c) => c.flaky);
  const failed = failureReasons.length > 0;
  return { comparisons, cost, failed, flakyResults, failureReasons };
}
3382
/**
 * Decide whether `current` is a regression relative to `baseline`.
 *
 * With a single sample on both sides the means are compared directly against
 * the threshold. Otherwise the 95% interval bounds are used:
 * - lower-is-better: current's lower bound exceeds baseline's upper bound by
 *   more than the threshold;
 * - higher-is-better: baseline's upper bound exceeds current's lower bound by
 *   more than the threshold AND the current mean is lower.
 *
 * NOTE(review): the higher-is-better interval test is looser than the
 * lower-is-better one (it can fire while the intervals overlap). This looks
 * intentional given the extra mean guard; confirm with the author.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold Tolerated change before flagging.
 * @param {boolean} lowerIsBetter True for metrics where smaller is better.
 * @returns {boolean}
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  const singleSamples = baseline.n === 1 && current.n === 1;
  if (singleSamples) {
    const delta = current.mean - baseline.mean;
    return lowerIsBetter ? delta > threshold : delta < -threshold;
  }
  if (lowerIsBetter) {
    const gap = current.ci95Lower - baseline.ci95Upper;
    return gap > threshold;
  }
  const worstCaseDrop = baseline.ci95Upper - current.ci95Lower;
  return worstCaseDrop > threshold && current.mean < baseline.mean;
}
3393
/**
 * Decide whether `current` is an improvement over `baseline`.
 *
 * With a single sample on both sides the means are compared directly against
 * the threshold. Otherwise the better side's pessimistic 95% bound must clear
 * the other side's optimistic bound by more than the threshold.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold Required change before flagging.
 * @param {boolean} lowerIsBetter True for metrics where smaller is better.
 * @returns {boolean}
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  const singleSamples = baseline.n === 1 && current.n === 1;
  if (singleSamples) {
    const delta = current.mean - baseline.mean;
    return lowerIsBetter ? delta < -threshold : delta > threshold;
  }
  // Non-overlapping intervals, separated by more than the threshold.
  const margin = lowerIsBetter
    ? baseline.ci95Lower - current.ci95Upper
    : current.ci95Lower - baseline.ci95Upper;
  return margin > threshold;
}
3404
/**
 * Read a baseline file from disk.
 *
 * Two layouts are accepted: an object with a `results` array (and optional
 * `timestamp`), or a bare array of results.
 *
 * @param {string} path Path to the baseline JSON file.
 * @returns {{timestamp: string, results: Array}|null} The baseline, or null
 *   when the file cannot be read or parsed, or holds no results array.
 *   Failures are deliberately not thrown; they are reported as null.
 */
function loadBaseline(path) {
  try {
    const parsed = JSON.parse(readFileSync(path, "utf-8"));
    const results = parsed.results ?? parsed;
    if (Array.isArray(results)) {
      const timestamp = parsed.timestamp ?? "unknown";
      return { timestamp, results };
    }
    return null;
  } catch {
    return null;
  }
}
3418
/**
 * Write results to disk as a baseline file, creating the parent directory
 * (and any missing ancestors) first.
 *
 * @param {string} path Destination file path.
 * @param {Array} results Results to store.
 * @returns {void} The file holds pretty-printed JSON with the current ISO
 *   timestamp followed by the results.
 */
function saveBaseline(path, results) {
  const parentDir = dirname(path);
  mkdirSync(parentDir, { recursive: true });
  const timestamp = new Date().toISOString();
  const serialized = JSON.stringify({ timestamp, results }, null, 2);
  writeFileSync(path, serialized);
}
3426
+
3427
+ // src/github.ts
3428
+ import { readFileSync as readFileSync2 } from "fs";
3429
/**
 * Work out which GitHub pull request this run belongs to, from environment
 * variables.
 *
 * Requires `GITHUB_TOKEN` and `GITHUB_REPOSITORY` ("owner/repo"). The PR
 * number is taken, in order of preference, from the event payload at
 * `GITHUB_EVENT_PATH` (`pull_request.number`, or `issue.number` when the
 * issue is a pull request) and then from `DUELIST_PR_NUMBER`.
 *
 * @returns {{token: string, owner: string, repo: string, prNumber: number}|null}
 *   The context, or null when any required piece is missing.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath,
  } = process.env;
  if (!token || !repository) {
    return null;
  }
  const [owner, repo] = repository.split("/");
  if (!owner || !repo) {
    return null;
  }

  // Best effort: an unreadable or malformed event payload is simply ignored.
  const prNumberFromEvent = () => {
    try {
      const event = JSON.parse(readFileSync2(eventPath, "utf-8"));
      let found;
      const pullRequest = event.pull_request;
      if (pullRequest && typeof pullRequest === "object") {
        found = pullRequest.number;
      }
      const issue = event.issue;
      if (!found && issue && typeof issue === "object" && issue.pull_request) {
        found = issue.number;
      }
      return found;
    } catch {
      return undefined;
    }
  };

  let prNumber = eventPath ? prNumberFromEvent() : undefined;
  if (!prNumber && process.env.DUELIST_PR_NUMBER) {
    prNumber = parseInt(process.env.DUELIST_PR_NUMBER, 10);
  }
  // Also rejects a non-numeric override, which parses to NaN.
  return prNumber ? { token, owner, repo, prNumber } : null;
}
3459
// Root URL of the GitHub REST API.
var API_BASE = "https://api.github.com";

/**
 * Build the request headers for a GitHub REST API call.
 *
 * @param {string} token Token sent as a Bearer credential.
 * @param {Object<string, string>} [extra] Additional headers; these are
 *   applied last, so they override the defaults on a name clash.
 * @returns {Object<string, string>} A fresh headers object.
 */
function ghHeaders(token, extra) {
  const defaults = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28",
  };
  return Object.assign(defaults, extra);
}
3468
/**
 * Search a pull request's comments for one whose body contains `marker`.
 *
 * Comments are fetched 50 at a time; paging stops at the first empty or
 * short page.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} marker Text identifying the comment to find.
 * @returns {Promise<*|null>} The id of the first matching comment, or null
 *   when none matches or any page request comes back non-OK.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  const commentsUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;

  for (let page = 1; ; page++) {
    const url = `${commentsUrl}?per_page=${perPage}&page=${page}`;
    const res = await fetch(url, { headers: ghHeaders(ctx.token) });
    if (!res.ok) {
      return null;
    }
    const comments = await res.json();
    if (comments.length === 0) {
      return null;
    }
    const match = comments.find((comment) => comment.body?.includes(marker));
    if (match !== undefined) {
      return match.id;
    }
    // A short page means there is nothing further to fetch.
    if (comments.length < perPage) {
      return null;
    }
  }
}
3487
/**
 * Create the report comment on a pull request, or update it in place when a
 * comment containing `marker` already exists.
 *
 * A failed request is logged with `console.warn` rather than thrown.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} body Comment body.
 * @param {string} marker Text used to recognise a previously posted comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const existingId = await findExistingComment(ctx, marker);
  const issuesUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues`;

  // Pick the endpoint: PATCH the existing comment, or POST a new one.
  const request = existingId
    ? {
        url: `${issuesUrl}/comments/${existingId}`,
        method: "PATCH",
        failure: "Failed to update PR comment:",
      }
    : {
        url: `${issuesUrl}/${ctx.prNumber}/comments`,
        method: "POST",
        failure: "Failed to create PR comment:",
      };

  const res = await fetch(request.url, {
    method: request.method,
    headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
    body: JSON.stringify({ body }),
  });
  if (!res.ok) {
    const text = await res.text();
    console.warn(`${request.failure} ${res.status} ${text}`);
  }
}
2151
3513
  export {
2152
3514
  anthropic,
2153
3515
  azureOpenai,
3516
+ compareResults,
3517
+ computeStats,
2154
3518
  consoleReporter,
2155
3519
  defineArena,
3520
+ detectGitHubContext,
2156
3521
  gemini,
3522
+ htmlReporter,
2157
3523
  jsonReporter,
3524
+ loadBaseline,
3525
+ markdownReporter,
2158
3526
  openai,
2159
3527
  openaiCompatible,
2160
- registerPricing
3528
+ registerPricing,
3529
+ saveBaseline,
3530
+ upsertPrComment
2161
3531
  };
2162
3532
  //# sourceMappingURL=index.js.map