agent-duelist 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -38,6 +38,7 @@ __export(index_exports, {
38
38
  defineArena: () => defineArena,
39
39
  detectGitHubContext: () => detectGitHubContext,
40
40
  gemini: () => gemini,
41
+ htmlReporter: () => htmlReporter,
41
42
  jsonReporter: () => jsonReporter,
42
43
  loadBaseline: () => loadBaseline,
43
44
  markdownReporter: () => markdownReporter,
@@ -1461,11 +1462,13 @@ var fuzzySimilarityScorer = ({ task, result }) => {
1461
1462
  }
1462
1463
  const a = stringify(task.expected);
1463
1464
  const b = stringify(result.output);
1464
- const similarity = jaccardSimilarity(tokenize(a), tokenize(b));
1465
+ const setA = tokenize(a);
1466
+ const setB = tokenize(b);
1467
+ const similarity = jaccardSimilarity(setA, setB);
1465
1468
  return {
1466
1469
  name: "fuzzy-similarity",
1467
1470
  value: Math.round(similarity * 100) / 100,
1468
- details: { method: "jaccard", expectedTokens: tokenize(a).size, actualTokens: tokenize(b).size }
1471
+ details: { method: "jaccard", expectedTokens: setA.size, actualTokens: setB.size }
1469
1472
  };
1470
1473
  };
1471
1474
  function stringify(value) {
@@ -1491,6 +1494,19 @@ var import_openai2 = __toESM(require("openai"), 1);
1491
1494
  // src/providers/openai.ts
1492
1495
  var import_openai = __toESM(require("openai"), 1);
1493
1496
  var import_zod_to_json_schema = require("zod-to-json-schema");
1497
+
1498
+ // src/providers/shared.ts
1499
+ var SCHEMA_SYSTEM_MESSAGE = "Respond with valid JSON matching the requested schema.";
1500
+ function parseSchemaOutput(rawContent, hasSchema) {
1501
+ if (!hasSchema) return rawContent;
1502
+ try {
1503
+ return JSON.parse(rawContent);
1504
+ } catch {
1505
+ return rawContent;
1506
+ }
1507
+ }
1508
+
1509
+ // src/providers/openai.ts
1494
1510
  var REQUEST_TIMEOUT_MS = 6e4;
1495
1511
  function openai(model, options) {
1496
1512
  const client = new import_openai.default({
@@ -1537,7 +1553,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1537
1553
  if (input.schema) {
1538
1554
  params.response_format = { type: "json_object" };
1539
1555
  params.messages = [
1540
- { role: "system", content: "Respond with valid JSON matching the requested schema." },
1556
+ { role: "system", content: SCHEMA_SYSTEM_MESSAGE },
1541
1557
  ...params.messages
1542
1558
  ];
1543
1559
  }
@@ -1590,13 +1606,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1590
1606
  if (stripThinking) {
1591
1607
  rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
1592
1608
  }
1593
- let output = rawContent;
1594
- if (input.schema) {
1595
- try {
1596
- output = JSON.parse(rawContent);
1597
- } catch {
1598
- }
1599
- }
1609
+ const output = parseSchemaOutput(rawContent, !!input.schema);
1600
1610
  return {
1601
1611
  output,
1602
1612
  usage: {
@@ -1610,6 +1620,20 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1610
1620
  }
1611
1621
  };
1612
1622
  }
1623
+ function gemini(model, options) {
1624
+ const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
1625
+ if (!apiKey) {
1626
+ throw new Error(
1627
+ `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
1628
+ );
1629
+ }
1630
+ const client = new import_openai.default({
1631
+ apiKey,
1632
+ baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
1633
+ timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
1634
+ });
1635
+ return makeProvider(`google/${model}`, "Google AI", model, client, model);
1636
+ }
1613
1637
  function toolDefToOpenAI(tool) {
1614
1638
  return {
1615
1639
  type: "function",
@@ -1687,8 +1711,7 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1687
1711
  const response = await client.chat.completions.create({
1688
1712
  model,
1689
1713
  messages: [{ role: "user", content: prompt }],
1690
- temperature: 0,
1691
- max_tokens: 2048
1714
+ max_completion_tokens: 2048
1692
1715
  });
1693
1716
  const content = response.choices[0]?.message?.content?.trim() ?? "";
1694
1717
  const parsed = {};
@@ -1784,118 +1807,173 @@ async function runBenchmarks(options) {
1784
1807
  const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
1785
1808
  const results = [];
1786
1809
  for (const task of tasks) {
1787
- for (const provider of providers) {
1788
- for (let run = 1; run <= runs; run++) {
1789
- let result;
1790
- try {
1791
- const taskResult = await withTimeout((signal) => provider.run({
1792
- prompt: task.prompt,
1793
- schema: task.schema,
1794
- tools: task.tools,
1795
- signal
1796
- }), timeout);
1797
- const scores = await Promise.all(
1798
- scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
1799
- );
1800
- result = {
1801
- providerId: provider.id,
1802
- taskName: task.name,
1803
- run,
1804
- scores,
1805
- raw: {
1806
- output: taskResult.output,
1807
- latencyMs: taskResult.latencyMs,
1808
- usage: taskResult.usage,
1809
- toolCalls: taskResult.toolCalls
1810
- }
1811
- };
1812
- } catch (err) {
1813
- const message = err instanceof Error ? err.message : String(err);
1814
- result = {
1815
- providerId: provider.id,
1816
- taskName: task.name,
1817
- run,
1818
- scores: [],
1819
- error: message,
1820
- raw: { output: "", latencyMs: 0 }
1821
- };
1822
- }
1823
- results.push(result);
1824
- onResult?.(result);
1825
- }
1810
+ for (let run = 1; run <= runs; run++) {
1811
+ const runResults = await Promise.all(
1812
+ providers.map(async (provider) => {
1813
+ let result;
1814
+ try {
1815
+ const taskResult = await withTimeout((signal) => provider.run({
1816
+ prompt: task.prompt,
1817
+ schema: task.schema,
1818
+ tools: task.tools,
1819
+ signal
1820
+ }), timeout);
1821
+ const scores = await Promise.all(
1822
+ scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
1823
+ );
1824
+ result = {
1825
+ providerId: provider.id,
1826
+ taskName: task.name,
1827
+ run,
1828
+ scores,
1829
+ raw: {
1830
+ output: taskResult.output,
1831
+ latencyMs: taskResult.latencyMs,
1832
+ usage: taskResult.usage,
1833
+ toolCalls: taskResult.toolCalls
1834
+ }
1835
+ };
1836
+ } catch (err) {
1837
+ const message = err instanceof Error ? err.message : String(err);
1838
+ result = {
1839
+ providerId: provider.id,
1840
+ taskName: task.name,
1841
+ run,
1842
+ scores: [],
1843
+ error: message,
1844
+ raw: { output: "", latencyMs: 0 }
1845
+ };
1846
+ }
1847
+ onResult?.(result);
1848
+ return result;
1849
+ })
1850
+ );
1851
+ results.push(...runResults);
1826
1852
  }
1827
1853
  }
1828
1854
  return results;
1829
1855
  }
1830
1856
 
1831
- // src/reporter/console.ts
1832
- var reset = "\x1B[0m";
1833
- var boldCode = "\x1B[1m";
1834
- var dimCode = "\x1B[2m";
1835
- var green = "\x1B[32m";
1836
- var red = "\x1B[31m";
1837
- var yellow = "\x1B[33m";
1838
- var cyan = "\x1B[36m";
1839
- var brightGreen = "\x1B[92m";
1840
- var brightWhite = "\x1B[97m";
1841
- function bold(s) {
1842
- return `${boldCode}${s}${reset}`;
1843
- }
1844
- function dim(s) {
1845
- return `${dimCode}${s}${reset}`;
1857
+ // src/utils/format.ts
1858
+ var MAX_FRACTION_DIGITS = 100;
1859
+ function formatCost(usd) {
1860
+ if (usd === void 0) return "\u2014";
1861
+ if (usd === 0) return "$0.00";
1862
+ if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
1863
+ const digits = Math.min(
1864
+ MAX_FRACTION_DIGITS,
1865
+ Math.max(4, -Math.floor(Math.log10(Math.abs(usd))) + 1)
1866
+ );
1867
+ return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
1846
1868
  }
1847
- function stripAnsi(s) {
1848
- return s.replace(/\x1b\[[0-9;]*m/g, "");
1869
+ function formatDelta(delta, precision = 4) {
1870
+ const sign = delta >= 0 ? "+" : "";
1871
+ return `${sign}${delta.toFixed(precision)}`;
1849
1872
  }
1850
- function displayWidth(s) {
1851
- const stripped = stripAnsi(s);
1852
- let width = 0;
1853
- for (const ch of stripped) {
1854
- const code = ch.codePointAt(0) ?? 0;
1855
- if (code >= 126976) width += 2;
1856
- else if (code >= 9728 && code <= 10175) width += 2;
1857
- else width += 1;
1873
+
1874
+ // src/reporter/shared.ts
1875
+ function groupResults(results) {
1876
+ const taskSet = /* @__PURE__ */ new Set();
1877
+ const providerSet = /* @__PURE__ */ new Set();
1878
+ const scorerSet = /* @__PURE__ */ new Set();
1879
+ const grouped = /* @__PURE__ */ new Map();
1880
+ const byProvider = /* @__PURE__ */ new Map();
1881
+ let hasErrors = false;
1882
+ let maxRun = 0;
1883
+ for (const r of results) {
1884
+ taskSet.add(r.taskName);
1885
+ providerSet.add(r.providerId);
1886
+ for (const s of r.scores) scorerSet.add(s.name);
1887
+ if (r.error) hasErrors = true;
1888
+ if (r.run > maxRun) maxRun = r.run;
1889
+ const key = `${r.taskName}::${r.providerId}`;
1890
+ let group = grouped.get(key);
1891
+ if (!group) {
1892
+ group = [];
1893
+ grouped.set(key, group);
1894
+ }
1895
+ group.push(r);
1896
+ let provGroup = byProvider.get(r.providerId);
1897
+ if (!provGroup) {
1898
+ provGroup = [];
1899
+ byProvider.set(r.providerId, provGroup);
1900
+ }
1901
+ provGroup.push(r);
1858
1902
  }
1859
- return width;
1860
- }
1861
- function padCell(str, targetWidth, align) {
1862
- const dw = displayWidth(str);
1863
- const padding = Math.max(0, targetWidth - dw);
1864
- if (align === "right") return " ".repeat(padding) + str;
1865
- return str + " ".repeat(padding);
1866
- }
1867
- function sparkBar(ratio, width = 8) {
1868
- const clamped = Math.max(0, Math.min(1, ratio));
1869
- const fillLen = Math.round(clamped * width);
1870
- const fill = "\u2593".repeat(fillLen);
1871
- const track = "\u2591".repeat(width - fillLen);
1872
- return { fill, track };
1903
+ return {
1904
+ tasks: [...taskSet],
1905
+ providers: [...providerSet],
1906
+ scorerNames: [...scorerSet],
1907
+ grouped,
1908
+ byProvider,
1909
+ hasErrors,
1910
+ maxRun
1911
+ };
1873
1912
  }
1874
- function drawTableLine(widths, position) {
1875
- const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
1876
- if (position === "bottom") {
1877
- return dim(`\u2514${"\u2500".repeat(totalInner)}\u2518`);
1913
+ function aggregateProviderTask(providerId, grouped, task) {
1914
+ const taskResults = grouped.get(`${task}::${providerId}`) ?? [];
1915
+ const errorResults = taskResults.filter((r) => r.error);
1916
+ const successResults = taskResults.filter((r) => !r.error);
1917
+ if (successResults.length === 0) {
1918
+ return {
1919
+ providerId,
1920
+ avgScores: {},
1921
+ avgDetails: { costUsd: void 0, totalTokens: void 0 },
1922
+ latencyMs: void 0,
1923
+ allErrors: errorResults.length > 0,
1924
+ errorCount: errorResults.length
1925
+ };
1878
1926
  }
1879
- if (position === "merge") {
1880
- return dim(`\u251C${"\u2500".repeat(totalInner)}\u2524`);
1927
+ return {
1928
+ providerId,
1929
+ avgScores: averageScores(successResults),
1930
+ avgDetails: averageDetails(successResults),
1931
+ latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
1932
+ allErrors: false,
1933
+ errorCount: errorResults.length
1934
+ };
1935
+ }
1936
+ function averageScores(results) {
1937
+ const sums = {};
1938
+ const counts = {};
1939
+ for (const result of results) {
1940
+ for (const score of result.scores) {
1941
+ if (score.value < 0) continue;
1942
+ sums[score.name] = (sums[score.name] ?? 0) + score.value;
1943
+ counts[score.name] = (counts[score.name] ?? 0) + 1;
1944
+ }
1881
1945
  }
1882
- const segments = widths.map((w) => "\u2500".repeat(w + 2));
1883
- if (position === "top") {
1884
- return dim(`\u250C${segments.join("\u252C")}\u2510`);
1946
+ const avgs = {};
1947
+ for (const name of Object.keys(sums)) {
1948
+ avgs[name] = sums[name] / counts[name];
1885
1949
  }
1886
- return dim(`\u251C${segments.join("\u253C")}\u2524`);
1950
+ return avgs;
1887
1951
  }
1888
- function drawTableRow(cells, widths, aligns) {
1889
- const parts = cells.map(
1890
- (cell, i) => " " + padCell(cell, widths[i], aligns[i]) + " "
1891
- );
1892
- return dim("\u2502") + parts.join(dim("\u2502")) + dim("\u2502");
1952
+ function averageDetails(results) {
1953
+ let costSum = 0;
1954
+ let costCount = 0;
1955
+ let tokenSum = 0;
1956
+ let tokenCount = 0;
1957
+ for (const result of results) {
1958
+ const costScore = result.scores.find((s) => s.name === "cost");
1959
+ const details = costScore?.details;
1960
+ if (details?.estimatedUsd != null) {
1961
+ costSum += details.estimatedUsd;
1962
+ costCount++;
1963
+ }
1964
+ if (details?.totalTokens != null) {
1965
+ tokenSum += details.totalTokens;
1966
+ tokenCount++;
1967
+ }
1968
+ }
1969
+ return {
1970
+ costUsd: costCount > 0 ? costSum / costCount : void 0,
1971
+ totalTokens: tokenCount > 0 ? Math.round(tokenSum / tokenCount) : void 0
1972
+ };
1893
1973
  }
1894
- function drawSpanRow(content, widths) {
1895
- const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
1896
- const dw = displayWidth(content);
1897
- const padding = Math.max(0, totalInner - dw - 1);
1898
- return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
1974
+ function average(nums) {
1975
+ if (nums.length === 0) return void 0;
1976
+ return nums.reduce((a, b) => a + b, 0) / nums.length;
1899
1977
  }
1900
1978
  function computeColumnStats(providerData, scorerNames) {
1901
1979
  const stats = /* @__PURE__ */ new Map();
@@ -1947,62 +2025,235 @@ function computeColumnStats(providerData, scorerNames) {
1947
2025
  }
1948
2026
  return stats;
1949
2027
  }
1950
- function colorByRank(text, value, colStats, providerCount) {
1951
- if (value === void 0) return dim("\u2014");
1952
- if (providerCount < 2) return text;
1953
- if (colStats.best === void 0 || colStats.worst === void 0) return text;
1954
- if (colStats.best === colStats.worst) return text;
1955
- if (value === colStats.best) return `${brightGreen}${boldCode}${text}${reset}`;
1956
- if (value === colStats.worst) return `${red}${text}${reset}`;
1957
- return `${yellow}${text}${reset}`;
1958
- }
1959
2028
  function computeMedals(columnStats, providerIds) {
1960
2029
  const medals = /* @__PURE__ */ new Map();
1961
2030
  if (providerIds.length < 2) {
1962
- for (const id of providerIds) medals.set(id, "");
2031
+ for (const id of providerIds) medals.set(id, "none");
1963
2032
  return medals;
1964
2033
  }
1965
2034
  const wins = /* @__PURE__ */ new Map();
1966
2035
  for (const id of providerIds) wins.set(id, 0);
1967
2036
  for (const [, colStats] of columnStats) {
1968
2037
  if (colStats.best === void 0) continue;
1969
- for (const [providerId, value] of colStats.values) {
1970
- if (value !== void 0 && value === colStats.best) {
1971
- wins.set(providerId, (wins.get(providerId) ?? 0) + 1);
1972
- }
2038
+ const bestProviders = [...colStats.values.entries()].filter(([, v]) => v !== void 0 && v === colStats.best);
2039
+ if (bestProviders.length === 1) {
2040
+ wins.set(bestProviders[0][0], (wins.get(bestProviders[0][0]) ?? 0) + 1);
1973
2041
  }
1974
2042
  }
1975
2043
  const totalWins = [...wins.values()].reduce((a, b) => a + b, 0);
1976
2044
  if (totalWins === 0) {
1977
- for (const id of providerIds) medals.set(id, "");
2045
+ for (const id of providerIds) medals.set(id, "none");
1978
2046
  return medals;
1979
2047
  }
1980
2048
  const sorted = [...wins.entries()].sort(
1981
2049
  (a, b) => b[1] - a[1] || a[0].localeCompare(b[0])
1982
2050
  );
1983
- const medalList = ["\u{1F947}", "\u{1F948}", "\u{1F949}"];
2051
+ const medalList = ["gold", "silver", "bronze"];
1984
2052
  let rank = 0;
1985
2053
  for (let i = 0; i < sorted.length; i++) {
1986
2054
  if (i > 0 && sorted[i][1] < sorted[i - 1][1]) {
1987
2055
  rank = i;
1988
2056
  }
1989
- medals.set(sorted[i][0], rank < medalList.length ? medalList[rank] : "");
2057
+ const hasWins = sorted[i][1] > 0;
2058
+ medals.set(sorted[i][0], hasWins && rank < medalList.length ? medalList[rank] : "none");
1990
2059
  }
1991
2060
  return medals;
1992
2061
  }
2062
+ function providerLabel(providerId) {
2063
+ const prefix = providerId.split("/")[0];
2064
+ switch (prefix) {
2065
+ case "azure":
2066
+ return "(OpenAI via Azure)";
2067
+ case "openai":
2068
+ return "(OpenAI)";
2069
+ case "anthropic":
2070
+ return "(Anthropic)";
2071
+ case "google":
2072
+ return "(Google)";
2073
+ case "mistral":
2074
+ return "(Mistral)";
2075
+ case "meta":
2076
+ return "(Meta)";
2077
+ case "deepseek":
2078
+ return "(DeepSeek)";
2079
+ case "cohere":
2080
+ return "(Cohere)";
2081
+ case "qwen":
2082
+ return "(Qwen)";
2083
+ case "xai":
2084
+ return "(xAI)";
2085
+ case "minimax":
2086
+ return "(MiniMax)";
2087
+ case "moonshot":
2088
+ return "(Moonshot / Kimi)";
2089
+ case "perplexity":
2090
+ return "(Perplexity)";
2091
+ case "amazon":
2092
+ return "(Amazon)";
2093
+ case "nvidia":
2094
+ return "(NVIDIA)";
2095
+ case "microsoft":
2096
+ return "(Microsoft)";
2097
+ case "ai21":
2098
+ return "(AI21 Labs)";
2099
+ case "bytedance":
2100
+ return "(ByteDance)";
2101
+ case "together":
2102
+ return "(Together AI)";
2103
+ case "fireworks":
2104
+ return "(Fireworks AI)";
2105
+ case "groq":
2106
+ return "(Groq)";
2107
+ case "cerebras":
2108
+ return "(Cerebras)";
2109
+ default:
2110
+ return `(${prefix})`;
2111
+ }
2112
+ }
2113
+ function apiKeyHint(providerId, error) {
2114
+ const lower = error.toLowerCase();
2115
+ const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
2116
+ if (!isAuthError) return void 0;
2117
+ const prefix = providerId.split("/")[0];
2118
+ switch (prefix) {
2119
+ case "openai":
2120
+ return "Set: export OPENAI_API_KEY=sk-...";
2121
+ case "azure":
2122
+ return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
2123
+ case "anthropic":
2124
+ return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
2125
+ case "google":
2126
+ return "Set: export GOOGLE_API_KEY=...";
2127
+ default:
2128
+ return `Check the API key for ${providerId}`;
2129
+ }
2130
+ }
2131
+ function rankProviders(successByProvider, providers, scorerName) {
2132
+ const ranked = providers.map((id) => {
2133
+ const runs = successByProvider.get(id) ?? [];
2134
+ const scores = runs.flatMap((r) => r.scores.filter((s) => s.name === scorerName && s.value >= 0)).map((s) => s.value);
2135
+ const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
2136
+ return { id, avg };
2137
+ }).filter((p) => p.avg !== void 0).sort((a, b) => b.avg - a.avg);
2138
+ return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
2139
+ }
2140
+ function scorerLabel(name) {
2141
+ switch (name) {
2142
+ case "correctness":
2143
+ return "Match";
2144
+ case "schema-correctness":
2145
+ return "Schema";
2146
+ case "fuzzy-similarity":
2147
+ return "Fuzzy";
2148
+ case "llm-judge-correctness":
2149
+ return "Judge";
2150
+ case "tool-usage":
2151
+ return "Tool";
2152
+ default:
2153
+ return name;
2154
+ }
2155
+ }
2156
+ function medalEmoji(medal) {
2157
+ switch (medal) {
2158
+ case "gold":
2159
+ return "\u{1F947}";
2160
+ case "silver":
2161
+ return "\u{1F948}";
2162
+ case "bronze":
2163
+ return "\u{1F949}";
2164
+ case "none":
2165
+ return "";
2166
+ }
2167
+ }
2168
+
2169
+ // src/reporter/console.ts
2170
+ var reset = "\x1B[0m";
2171
+ var boldCode = "\x1B[1m";
2172
+ var dimCode = "\x1B[2m";
2173
+ var green = "\x1B[32m";
2174
+ var red = "\x1B[31m";
2175
+ var yellow = "\x1B[33m";
2176
+ var cyan = "\x1B[36m";
2177
+ var brightGreen = "\x1B[92m";
2178
+ var brightWhite = "\x1B[97m";
2179
+ function bold(s) {
2180
+ return `${boldCode}${s}${reset}`;
2181
+ }
2182
+ function dim(s) {
2183
+ return `${dimCode}${s}${reset}`;
2184
+ }
2185
+ function stripAnsi(s) {
2186
+ return s.replace(/\x1b\[[0-9;]*m/g, "");
2187
+ }
2188
+ function displayWidth(s) {
2189
+ const stripped = stripAnsi(s);
2190
+ let width = 0;
2191
+ for (const ch of stripped) {
2192
+ const code = ch.codePointAt(0) ?? 0;
2193
+ if (code >= 126976) width += 2;
2194
+ else if (code >= 9728 && code <= 10175) width += 2;
2195
+ else width += 1;
2196
+ }
2197
+ return width;
2198
+ }
2199
+ function padCell(str, targetWidth, align) {
2200
+ const dw = displayWidth(str);
2201
+ const padding = Math.max(0, targetWidth - dw);
2202
+ if (align === "right") return " ".repeat(padding) + str;
2203
+ return str + " ".repeat(padding);
2204
+ }
2205
+ function sparkBar(ratio, width = 8) {
2206
+ const clamped = Math.max(0, Math.min(1, ratio));
2207
+ const fillLen = Math.round(clamped * width);
2208
+ const fill = "\u2593".repeat(fillLen);
2209
+ const track = "\u2591".repeat(width - fillLen);
2210
+ return { fill, track };
2211
+ }
2212
+ function drawTableLine(widths, position) {
2213
+ const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
2214
+ if (position === "bottom") {
2215
+ return dim(`\u2514${"\u2500".repeat(totalInner)}\u2518`);
2216
+ }
2217
+ if (position === "merge") {
2218
+ return dim(`\u251C${"\u2500".repeat(totalInner)}\u2524`);
2219
+ }
2220
+ const segments = widths.map((w) => "\u2500".repeat(w + 2));
2221
+ if (position === "top") {
2222
+ return dim(`\u250C${segments.join("\u252C")}\u2510`);
2223
+ }
2224
+ return dim(`\u251C${segments.join("\u253C")}\u2524`);
2225
+ }
2226
+ function drawTableRow(cells, widths, aligns) {
2227
+ const parts = cells.map(
2228
+ (cell, i) => " " + padCell(cell, widths[i], aligns[i]) + " "
2229
+ );
2230
+ return dim("\u2502") + parts.join(dim("\u2502")) + dim("\u2502");
2231
+ }
2232
+ function drawSpanRow(content, widths) {
2233
+ const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
2234
+ const dw = displayWidth(content);
2235
+ const padding = Math.max(0, totalInner - dw - 1);
2236
+ return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
2237
+ }
2238
+ function colorByRank(text, value, colStats, providerCount) {
2239
+ if (value === void 0) return dim("\u2014");
2240
+ if (providerCount < 2) return text;
2241
+ if (colStats.best === void 0 || colStats.worst === void 0) return text;
2242
+ if (colStats.best === colStats.worst) return text;
2243
+ if (value === colStats.best) return `${brightGreen}${boldCode}${text}${reset}`;
2244
+ if (value === colStats.worst) return `${red}${text}${reset}`;
2245
+ return `${yellow}${text}${reset}`;
2246
+ }
1993
2247
  function consoleReporter(results, options) {
1994
2248
  const showSparklines = options?.sparklines ?? true;
1995
2249
  if (results.length === 0) {
1996
2250
  console.log("\nNo results to display.\n");
1997
2251
  return;
1998
2252
  }
1999
- const tasks = [...new Set(results.map((r) => r.taskName))];
2000
- const providers = [...new Set(results.map((r) => r.providerId))];
2001
- const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
2253
+ const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
2002
2254
  const hasCost = scorerNames.includes("cost");
2003
- const hasErrors = results.some((r) => r.error);
2004
2255
  const multi = providers.length >= 2;
2005
- const runsPerCell = Math.max(...results.map((r) => r.run));
2256
+ const runsPerCell = maxRun;
2006
2257
  const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
2007
2258
  console.log("");
2008
2259
  console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
@@ -2011,29 +2262,9 @@ function consoleReporter(results, options) {
2011
2262
  for (const task of tasks) {
2012
2263
  console.log(` ${bold(`Task: ${task}`)}`);
2013
2264
  console.log("");
2014
- const providerData = providers.map((providerId) => {
2015
- const taskResults = results.filter((r) => r.taskName === task && r.providerId === providerId);
2016
- const errorResults2 = taskResults.filter((r) => r.error);
2017
- const successResults = taskResults.filter((r) => !r.error);
2018
- if (successResults.length === 0) {
2019
- return {
2020
- providerId,
2021
- avgScores: {},
2022
- avgDetails: { costUsd: void 0, totalTokens: void 0 },
2023
- latencyMs: void 0,
2024
- allErrors: errorResults2.length > 0,
2025
- errorCount: errorResults2.length
2026
- };
2027
- }
2028
- return {
2029
- providerId,
2030
- avgScores: averageScores(successResults),
2031
- avgDetails: averageDetails(successResults),
2032
- latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
2033
- allErrors: false,
2034
- errorCount: errorResults2.length
2035
- };
2036
- });
2265
+ const providerData = providers.map(
2266
+ (providerId) => aggregateProviderTask(providerId, grouped, task)
2267
+ );
2037
2268
  const columnStats = computeColumnStats(providerData, scorerNames);
2038
2269
  const medals = computeMedals(columnStats, providers);
2039
2270
  const maxProviderLen = Math.max(...providers.map((id) => id.length));
@@ -2048,8 +2279,7 @@ function consoleReporter(results, options) {
2048
2279
  cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
2049
2280
  cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
2050
2281
  } else {
2051
- const label = name === "correctness" ? "Match" : name === "schema-correctness" ? "Schema" : name === "fuzzy-similarity" ? "Fuzzy" : name === "llm-judge-correctness" ? "Judge" : name === "tool-usage" ? "Tool" : name;
2052
- cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
2282
+ cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
2053
2283
  }
2054
2284
  }
2055
2285
  if (hasErrors) {
@@ -2062,7 +2292,7 @@ function consoleReporter(results, options) {
2062
2292
  console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
2063
2293
  console.log(` ${drawTableLine(widths, "header")}`);
2064
2294
  for (const pd of providerData) {
2065
- const medal = medals.get(pd.providerId) ?? "";
2295
+ const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
2066
2296
  const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
2067
2297
  const cells = [providerCell];
2068
2298
  if (pd.allErrors) {
@@ -2135,7 +2365,7 @@ function consoleReporter(results, options) {
2135
2365
  console.log(` ${drawTableRow(cells, widths, aligns)}`);
2136
2366
  }
2137
2367
  if (multi && providerData.some((p) => !p.allErrors)) {
2138
- const winnerId = [...medals.entries()].find(([, m]) => m === "\u{1F947}")?.[0];
2368
+ const winnerId = [...medals.entries()].find(([, m]) => m === "gold")?.[0];
2139
2369
  if (winnerId) {
2140
2370
  console.log(` ${drawTableLine(widths, "merge")}`);
2141
2371
  const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
@@ -2145,7 +2375,7 @@ function consoleReporter(results, options) {
2145
2375
  console.log(` ${drawTableLine(widths, "bottom")}`);
2146
2376
  console.log("");
2147
2377
  }
2148
- printSummary(results, providers);
2378
+ printSummary(results, providers, byProvider);
2149
2379
  const errorResults = results.filter((r) => r.error);
2150
2380
  if (errorResults.length > 0) {
2151
2381
  console.log(` ${bold("Errors")}`);
@@ -2168,15 +2398,19 @@ function consoleReporter(results, options) {
2168
2398
  console.log("");
2169
2399
  }
2170
2400
  }
2171
- function printSummary(results, providers) {
2401
+ function printSummary(results, providers, byProvider) {
2172
2402
  const successResults = results.filter((r) => !r.error);
2173
2403
  if (successResults.length === 0) return;
2404
+ const successByProvider = /* @__PURE__ */ new Map();
2405
+ for (const id of providers) {
2406
+ successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
2407
+ }
2174
2408
  console.log(` ${bold("Summary")}`);
2175
2409
  console.log(` ${dim("\u2501".repeat(72))}`);
2176
2410
  console.log("");
2177
2411
  const single = providers.length === 1;
2178
2412
  const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
2179
- const byCorrectness = rankProviders(successResults, providers, correctnessKey);
2413
+ const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
2180
2414
  if (byCorrectness) {
2181
2415
  const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2182
2416
  const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
@@ -2187,7 +2421,7 @@ function printSummary(results, providers) {
2187
2421
  }
2188
2422
  }
2189
2423
  const byLatency = providers.map((id) => {
2190
- const runs = successResults.filter((r) => r.providerId === id);
2424
+ const runs = successByProvider.get(id) ?? [];
2191
2425
  const avg = average(runs.map((r) => r.raw.latencyMs));
2192
2426
  return { id, avg: avg ?? Infinity };
2193
2427
  }).sort((a, b) => a.avg - b.avg)[0];
@@ -2201,7 +2435,7 @@ function printSummary(results, providers) {
2201
2435
  }
2202
2436
  }
2203
2437
  const byCost = providers.map((id) => {
2204
- const runs = successResults.filter((r) => r.providerId === id);
2438
+ const runs = successByProvider.get(id) ?? [];
2205
2439
  const costs = runs.map((r) => {
2206
2440
  const s = r.scores.find((s2) => s2.name === "cost");
2207
2441
  return s && s.value >= 0 ? s.value : void 0;
@@ -2239,133 +2473,6 @@ function printSummary(results, providers) {
2239
2473
  }
2240
2474
  console.log("");
2241
2475
  }
2242
/**
 * Finds the provider with the highest mean value for a single scorer.
 *
 * Only scores whose `name` equals `scorerName` and whose `value` is >= 0 are
 * counted. Providers with no such scores are left out of the ranking.
 *
 * @param {Array<{providerId: string, scores: Array<{name: string, value: number}>}>} results
 * @param {string[]} providers - Provider ids, in the order used to break ties.
 * @param {string} scorerName - Name of the scorer to rank by.
 * @returns {{id: string, avg: number} | undefined} The top provider, or
 *   undefined when no provider has a usable score.
 */
function rankProviders(results, providers, scorerName) {
  let leader;
  for (const id of providers) {
    let total = 0;
    let count = 0;
    for (const run of results) {
      if (run.providerId !== id) continue;
      for (const score of run.scores) {
        if (score.name === scorerName && score.value >= 0) {
          total += score.value;
          count += 1;
        }
      }
    }
    if (count === 0) continue;
    const avg = total / count;
    // Strict comparison keeps the earlier provider on a tie.
    if (leader === undefined || avg > leader.avg) {
      leader = { id, avg };
    }
  }
  return leader;
}
2251
/**
 * Computes the mean value of every scorer across a set of results.
 *
 * Scores with a negative value are skipped.
 *
 * @param {Array<{scores: Array<{name: string, value: number}>}>} results
 * @returns {Object<string, number>} Map of scorer name to mean value.
 */
function averageScores(results) {
  const tally = new Map();
  for (const { scores } of results) {
    for (const { name, value } of scores) {
      if (value < 0) continue;
      let entry = tally.get(name);
      if (entry === undefined) {
        entry = { sum: 0, count: 0 };
        tally.set(name, entry);
      }
      entry.sum += value;
      entry.count += 1;
    }
  }
  const avgs = {};
  for (const [name, { sum, count }] of tally) {
    avgs[name] = sum / count;
  }
  return avgs;
}
2267
/**
 * Averages the cost and token figures stored in the `details` of each
 * result's "cost" score.
 *
 * @param {Array<{scores: Array<{name: string, details?: {estimatedUsd?: number, totalTokens?: number}}>}>} results
 * @returns {{costUsd: number|undefined, totalTokens: number|undefined}}
 *   Mean estimated cost and rounded mean token count; each is undefined when
 *   no result supplied that figure.
 */
function averageDetails(results) {
  const usdSamples = [];
  const tokenSamples = [];
  for (const result of results) {
    const details = result.scores.find((s) => s.name === "cost")?.details;
    if (details?.estimatedUsd != null) {
      usdSamples.push(details.estimatedUsd);
    }
    if (details?.totalTokens != null) {
      tokenSamples.push(details.totalTokens);
    }
  }
  const mean = (samples) => samples.reduce((acc, x) => acc + x, 0) / samples.length;
  return {
    costUsd: usdSamples.length > 0 ? mean(usdSamples) : void 0,
    totalTokens: tokenSamples.length > 0 ? Math.round(mean(tokenSamples)) : void 0
  };
}
2289
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} nums
 * @returns {number|undefined} The mean, or undefined for an empty list.
 */
function average(nums) {
  const count = nums.length;
  if (count > 0) {
    const total = nums.reduce((sum, n) => sum + n, 0);
    return total / count;
  }
  return undefined;
}
2293
/**
 * Formats a USD amount for display.
 *
 * - undefined  -> em dash
 * - exactly 0  -> "$0.00"
 * - >= 0.01    -> "~$" followed by two decimals
 * - smaller    -> "~$" with at least four decimals (more for very small
 *                 amounts), trailing zeros stripped
 *
 * @param {number|undefined} usd
 * @returns {string}
 */
function formatCost(usd) {
  if (usd === undefined) return "\u2014";
  if (usd === 0) return "$0.00";
  let text;
  if (usd >= 0.01) {
    text = usd.toFixed(2);
  } else {
    // Enough decimals to reach the first significant digit plus one more.
    const needed = -Math.floor(Math.log10(usd)) + 1;
    text = usd.toFixed(Math.max(4, needed)).replace(/0+$/, "");
  }
  return `~$${text}`;
}
2300
/**
 * Suggests how to configure credentials when an error message looks like an
 * authentication failure.
 *
 * @param {string} providerId - Provider id; the part before the first "/"
 *   selects the hint.
 * @param {string} error - Error message, matched case-insensitively.
 * @returns {string|undefined} A hint, or undefined when the message does not
 *   contain any of the authentication markers.
 */
function apiKeyHint(providerId, error) {
  const haystack = error.toLowerCase();
  const authMarkers = [
    "api key",
    "401",
    "unauthorized",
    "authentication",
    "incorrect api key",
    "apikey"
  ];
  const looksLikeAuthFailure = authMarkers.some((marker) => haystack.includes(marker));
  if (!looksLikeAuthFailure) return undefined;

  const [vendor] = providerId.split("/");
  if (vendor === "openai") {
    return "Set: export OPENAI_API_KEY=sk-...";
  }
  if (vendor === "azure") {
    return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
  }
  if (vendor === "anthropic") {
    return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
  }
  if (vendor === "google") {
    return "Set: export GOOGLE_API_KEY=...";
  }
  return `Check the API key for ${providerId}`;
}
2318
/**
 * Returns a parenthesised vendor display name for a provider id.
 *
 * The vendor is the part of `providerId` before the first "/". Unknown
 * vendors are echoed back inside parentheses.
 *
 * @param {string} providerId
 * @returns {string}
 */
function providerLabel(providerId) {
  const prefix = providerId.split("/")[0];
  // A Map (not a plain object) so prefixes such as "constructor" fall
  // through to the default label.
  const vendorLabels = new Map([
    ["azure", "(OpenAI via Azure)"],
    ["openai", "(OpenAI)"],
    ["anthropic", "(Anthropic)"],
    ["google", "(Google)"],
    ["mistral", "(Mistral)"],
    ["meta", "(Meta)"],
    ["deepseek", "(DeepSeek)"],
    ["cohere", "(Cohere)"],
    ["qwen", "(Qwen)"],
    ["xai", "(xAI)"],
    ["minimax", "(MiniMax)"],
    ["moonshot", "(Moonshot / Kimi)"],
    ["perplexity", "(Perplexity)"],
    ["amazon", "(Amazon)"],
    ["nvidia", "(NVIDIA)"],
    ["microsoft", "(Microsoft)"],
    ["ai21", "(AI21 Labs)"],
    ["bytedance", "(ByteDance)"],
    ["together", "(Together AI)"],
    ["fireworks", "(Fireworks AI)"],
    ["groq", "(Groq)"],
    ["cerebras", "(Cerebras)"]
  ]);
  return vendorLabels.get(prefix) ?? `(${prefix})`;
}
2369
2476
 
2370
2477
  // src/reporter/json.ts
2371
2478
  function jsonReporter(results) {
@@ -2430,7 +2537,7 @@ function anthropic(model, options) {
2430
2537
  model,
2431
2538
  async run(input) {
2432
2539
  const start = Date.now();
2433
- const systemMessage = input.schema ? "Respond with valid JSON matching the requested schema." : void 0;
2540
+ const systemMessage = input.schema ? SCHEMA_SYSTEM_MESSAGE : void 0;
2434
2541
  const response = await client.messages.create({
2435
2542
  model,
2436
2543
  max_tokens: maxTokens,
@@ -2440,13 +2547,7 @@ function anthropic(model, options) {
2440
2547
  const latencyMs = Date.now() - start;
2441
2548
  const textBlock = response.content.find((b) => b.type === "text");
2442
2549
  const rawContent = textBlock?.type === "text" ? textBlock.text : "";
2443
- let output = rawContent;
2444
- if (input.schema) {
2445
- try {
2446
- output = JSON.parse(rawContent);
2447
- } catch {
2448
- }
2449
- }
2550
+ const output = parseSchemaOutput(rawContent, !!input.schema);
2450
2551
  return {
2451
2552
  output,
2452
2553
  usage: {
@@ -2460,23 +2561,6 @@ function anthropic(model, options) {
2460
2561
  };
2461
2562
  }
2462
2563
 
2463
- // src/providers/gemini.ts
2464
- var import_openai4 = __toESM(require("openai"), 1);
2465
/**
 * Creates a provider for a Google model through the OpenAI-compatible
 * endpoint at generativelanguage.googleapis.com.
 *
 * @param {string} model - Model name; the provider id becomes `google/<model>`.
 * @param {{apiKey?: string, timeoutMs?: number}} [options] - Optional key and
 *   request timeout; the key falls back to GOOGLE_API_KEY and the timeout to
 *   REQUEST_TIMEOUT_MS.
 * @returns The value returned by `makeProvider` for this client.
 * @throws {Error} When no API key is supplied and GOOGLE_API_KEY is unset.
 */
function gemini(model, options) {
  const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
  if (!apiKey) {
    throw new Error(
      `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
    );
  }
  const timeout = options?.timeoutMs ?? REQUEST_TIMEOUT_MS;
  const baseURL = "https://generativelanguage.googleapis.com/v1beta/openai/";
  const client = new import_openai4.default({ apiKey, baseURL, timeout });
  const providerId = `google/${model}`;
  return makeProvider(providerId, "Google AI", model, client, model);
}
2479
-
2480
2564
  // src/reporter/markdown.ts
2481
2565
  var COMMENT_MARKER = "<!-- duelist-ci-report -->";
2482
2566
  function markdownReporter(report, _current) {
@@ -2521,7 +2605,7 @@ function markdownComparisonTable(comparisons) {
2521
2605
  for (const c of comparisons) {
2522
2606
  const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
2523
2607
  const currentStr = formatStats(c.current);
2524
- const deltaStr = c.delta !== null ? formatDelta(c.delta) : "\u2014";
2608
+ const deltaStr = c.delta !== null ? formatDelta(c.delta, 3) : "\u2014";
2525
2609
  const status = statusIndicator(c);
2526
2610
  lines.push(`| ${c.providerId} | ${c.taskName} | ${c.scorerName} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
2527
2611
  }
@@ -2554,10 +2638,6 @@ function formatStats(stats) {
2554
2638
  }
2555
2639
  return stats.mean.toFixed(3);
2556
2640
  }
2557
/**
 * Formats a score delta with three decimals and an explicit "+" for
 * non-negative values.
 *
 * @param {number} delta
 * @returns {string}
 */
function formatDelta(delta) {
  const magnitude = delta.toFixed(3);
  return delta >= 0 ? `+${magnitude}` : magnitude;
}
2561
2641
  function statusIndicator(c) {
2562
2642
  if (c.regressed) return "\u{1F534} regressed";
2563
2643
  if (c.improved) return "\u{1F7E2} improved";
@@ -2565,6 +2645,644 @@ function statusIndicator(c) {
2565
2645
  return "\u26AA unchanged";
2566
2646
  }
2567
2647
 
2648
+ // src/reporter/html.ts
2649
/**
 * Escapes a value for safe inclusion in HTML text or a quoted attribute.
 *
 * The characters & < > " ' are replaced with their entity forms. Non-string
 * input is converted with String() first; null and undefined become the
 * empty string instead of throwing.
 *
 * @param {*} s - Value to escape.
 * @returns {string} The escaped text.
 */
function esc(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  // Single pass: each special character is rewritten exactly once, so the
  // "&" introduced by an entity is never escaped again.
  return String(s ?? "").replace(/[&<>"']/g, (ch) => entities[ch]);
}
2652
/**
 * Renders benchmark results as a complete HTML document.
 *
 * The page holds one section per task (a results table plus, with two or
 * more providers, a per-task winner), a summary of the best provider for
 * correctness, latency and cost, and a de-duplicated error list.
 *
 * @param {Array<object>} results - Benchmark results; entries with a truthy
 *   `error` are excluded from the summary and listed under "Errors".
 * @returns {string} The HTML document; an empty-state page when `results`
 *   is empty.
 */
function htmlReporter(results) {
  if (results.length === 0) {
    return emptyReport();
  }
  const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
  const multi = providers.length >= 2;
  const hasCost = scorerNames.includes("cost");
  const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";
  const plural = (n) => (n !== 1 ? "s" : "");

  // Per-task aggregates; a winner is only named when providers compete.
  const taskSections = tasks.map((task) => {
    const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    let winnerId;
    if (multi) {
      for (const [id, medal] of medals.entries()) {
        if (medal === "gold") {
          winnerId = id;
          break;
        }
      }
    }
    return { task, providerData, columnStats, medals, winnerId };
  });

  const successByProvider = new Map(
    providers.map((id) => [id, (byProvider.get(id) ?? []).filter((r) => !r.error)])
  );
  const successfulRuns = (id) => successByProvider.get(id) ?? [];

  // Prefer the LLM-judge scorer whenever any successful run has a usable value for it.
  const hasJudgeScore = results.some(
    (r) => !r.error && r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
  );
  const correctnessKey = hasJudgeScore ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);

  const byLatency = providers.map((id) => ({
    id,
    avg: average(successfulRuns(id).map((r) => r.raw.latencyMs)) ?? Infinity
  })).sort((a, b) => a.avg - b.avg)[0];

  const costAverages = [];
  for (const id of providers) {
    const costs = [];
    for (const run of successfulRuns(id)) {
      const score = run.scores.find((s) => s.name === "cost");
      if (score && score.value >= 0) costs.push(score.value);
    }
    if (costs.length === 0) continue;
    costAverages.push({ id, avg: costs.reduce((a, b) => a + b, 0) / costs.length });
  }
  const byCost = costAverages.sort((a, b) => a.avg - b.avg)[0];

  // Overall winner: the single provider that leads the most categories.
  let overallWinner;
  if (multi) {
    const wins = new Map(providers.map((id) => [id, 0]));
    const credit = (id) => wins.set(id, (wins.get(id) ?? 0) + 1);
    if (byCorrectness) credit(byCorrectness.id);
    if (byLatency && byLatency.avg !== Infinity) credit(byLatency.id);
    if (byCost?.avg !== void 0) credit(byCost.id);
    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
      if (tops.length === 1) overallWinner = tops[0][0];
    }
  }

  const deduped = dedupeErrors(results.filter((r) => r.error));
  const sectionsHtml = taskSections.map((s, i) => renderTaskSection(
    s.task,
    s.providerData,
    s.columnStats,
    s.medals,
    s.winnerId,
    scorerNames,
    hasCost,
    multi,
    i
  )).join("\n");

  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
<meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${plural(providers.length)}, ${tasks.length} task${plural(tasks.length)}">
<meta property="og:title" content="Agent Duelist Report">
<meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
<meta property="og:type" content="website">
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">

${renderHeader(runsLabel, providers.length, tasks.length)}

${tasks.length > 1 ? renderTabs(tasks) : ""}

<main>
${sectionsHtml}
</main>

${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}

${deduped.length > 0 ? renderErrors(deduped) : ""}

${renderFooter()}

</div>
${renderScript(tasks.length)}
</body>
</html>`;
}
2750
/**
 * Builds the HTML document shown when there are no results: the standard
 * style, header (with zero counts) and footer around a "No results" message.
 *
 * @returns {string} The HTML document.
 */
function emptyReport() {
  const style = renderStyle();
  const header = renderHeader("0 runs", 0, 0);
  const footer = renderFooter();
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
${style}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">
${header}
<main><p class="empty-msg">No results to display.</p></main>
${footer}
</div>
</body>
</html>`;
}
2769
/**
 * Collapses repeated errors into one entry per (provider, message) pair.
 *
 * @param {Array<{providerId: string, error?: string}>} errorResults
 * @returns {Array<{providerId: string, error: string, count: number, hint: (string|undefined)}>}
 *   Entries in first-seen order; `count` is the number of occurrences and
 *   `hint` is whatever `apiKeyHint` returns for the message.
 */
function dedupeErrors(errorResults) {
  const buckets = new Map();
  for (const { providerId, error } of errorResults) {
    const key = `${providerId}::${error}`;
    const bucket = buckets.get(key);
    if (bucket !== undefined) {
      bucket.count += 1;
      continue;
    }
    buckets.set(key, {
      providerId,
      error: error ?? "Unknown error",
      count: 1,
      hint: apiKeyHint(providerId, error ?? "")
    });
  }
  return Array.from(buckets.values());
}
2787
+ function renderStyle() {
2788
+ return `<style>
2789
+ :root {
2790
+ --bg: #0f172a;
2791
+ --bg-deep: #020617;
2792
+ --panel: rgba(15, 23, 42, 0.85);
2793
+ --accent: #f59e0b;
2794
+ --accent-soft: rgba(245, 158, 11, 0.15);
2795
+ --text: #e2e8f0;
2796
+ --muted: #94a3b8;
2797
+ --border: rgba(148, 163, 184, 0.15);
2798
+ --green: #22c55e;
2799
+ --red: #ef4444;
2800
+ --yellow: #eab308;
2801
+ --radius: 12px;
2802
+ --mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
2803
+ --sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
2804
+ }
2805
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
2806
+ html, body {
2807
+ font-family: var(--sans);
2808
+ background: var(--bg);
2809
+ color: var(--text);
2810
+ min-height: 100vh;
2811
+ }
2812
+ body { padding: 24px; display: flex; justify-content: center; }
2813
+
2814
+ /* Animated gradient mesh */
2815
+ .bg-mesh {
2816
+ position: fixed; inset: 0; z-index: 0;
2817
+ overflow: hidden; pointer-events: none;
2818
+ }
2819
+ .bg-mesh::before, .bg-mesh::after {
2820
+ content: ""; position: absolute; border-radius: 50%;
2821
+ filter: blur(120px); opacity: 0.4;
2822
+ }
2823
+ .bg-mesh::before {
2824
+ width: 600px; height: 600px;
2825
+ background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
2826
+ top: -10%; left: -5%;
2827
+ animation: meshDrift1 18s ease-in-out infinite alternate;
2828
+ }
2829
+ .bg-mesh::after {
2830
+ width: 500px; height: 500px;
2831
+ background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
2832
+ bottom: -10%; right: -5%;
2833
+ animation: meshDrift2 22s ease-in-out infinite alternate;
2834
+ }
2835
+ .bg-mesh-extra {
2836
+ position: absolute; width: 400px; height: 400px;
2837
+ border-radius: 50%; filter: blur(100px); opacity: 0.3;
2838
+ background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
2839
+ top: 50%; left: 60%;
2840
+ animation: meshDrift3 15s ease-in-out infinite alternate;
2841
+ }
2842
+ @keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
2843
+ @keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
2844
+ @keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }
2845
+
2846
+ /* Report container */
2847
+ .report {
2848
+ position: relative; z-index: 1;
2849
+ width: 100%; max-width: 960px;
2850
+ }
2851
+
2852
+ /* Header */
2853
+ .report-header {
2854
+ display: flex; justify-content: space-between; align-items: center;
2855
+ padding: 20px 0; margin-bottom: 8px;
2856
+ }
2857
+ .report-brand {
2858
+ display: flex; align-items: center; gap: 10px;
2859
+ text-decoration: none; color: var(--muted);
2860
+ font-weight: 600; font-size: 14px;
2861
+ letter-spacing: 0.04em; text-transform: uppercase;
2862
+ }
2863
+ .report-brand:hover { color: var(--text); }
2864
+ .brand-icon {
2865
+ width: 32px; height: 32px; border-radius: 8px;
2866
+ background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
2867
+ border: 1px solid rgba(245,158,11,0.3);
2868
+ display: flex; align-items: center; justify-content: center;
2869
+ font-size: 16px;
2870
+ }
2871
+ .report-meta {
2872
+ font-size: 12px; color: var(--muted);
2873
+ text-align: right; line-height: 1.6;
2874
+ }
2875
+
2876
+ /* Task tabs */
2877
+ .task-tabs {
2878
+ display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
2879
+ }
2880
+ .task-tab {
2881
+ padding: 6px 16px; border-radius: 999px;
2882
+ border: 1px solid var(--border);
2883
+ background: transparent; color: var(--muted);
2884
+ font-size: 13px; font-weight: 500; cursor: pointer;
2885
+ transition: all 150ms ease;
2886
+ }
2887
+ .task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
2888
+ .task-tab.active {
2889
+ background: var(--accent-soft);
2890
+ border-color: rgba(245,158,11,0.4);
2891
+ color: var(--accent);
2892
+ }
2893
+
2894
+ /* Task sections */
2895
+ .task-section { display: none; }
2896
+ .task-section.active { display: block; }
2897
+ .task-name {
2898
+ font-size: 18px; font-weight: 600;
2899
+ margin-bottom: 12px; letter-spacing: -0.01em;
2900
+ }
2901
+
2902
+ /* Results table */
2903
+ .results-table {
2904
+ width: 100%; border-collapse: collapse;
2905
+ font-size: 13px; margin-bottom: 16px;
2906
+ border-radius: var(--radius); overflow: hidden;
2907
+ border: 1px solid var(--border);
2908
+ }
2909
+ .results-table th, .results-table td {
2910
+ padding: 10px 14px;
2911
+ text-align: left;
2912
+ border-bottom: 1px solid var(--border);
2913
+ }
2914
+ .results-table th {
2915
+ background: rgba(0,0,0,0.3);
2916
+ font-size: 11px; font-weight: 600;
2917
+ text-transform: uppercase; letter-spacing: 0.05em;
2918
+ color: var(--muted); cursor: pointer;
2919
+ user-select: none; white-space: nowrap;
2920
+ }
2921
+ .results-table th:hover { color: var(--text); }
2922
+ .results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
2923
+ .results-table tbody tr {
2924
+ background: var(--panel);
2925
+ transition: background 120ms ease;
2926
+ }
2927
+ .results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
2928
+ .results-table tbody tr:last-child td { border-bottom: none; }
2929
+
2930
+ /* Score cell with progress bar */
2931
+ .score-cell { position: relative; min-width: 90px; }
2932
+ .score-bar {
2933
+ position: absolute; left: 0; bottom: 0;
2934
+ height: 3px; border-radius: 2px;
2935
+ transition: width 300ms ease;
2936
+ }
2937
+ .score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }
2938
+
2939
+ /* Color ranking */
2940
+ .rank-best { color: var(--green); font-weight: 600; }
2941
+ .rank-worst { color: var(--red); }
2942
+ .rank-mid { color: var(--yellow); }
2943
+ .rank-neutral { color: var(--text); }
2944
+ .rank-error { color: var(--muted); }
2945
+
2946
+ /* Winner banner */
2947
+ .task-winner {
2948
+ display: flex; align-items: center; gap: 10px;
2949
+ padding: 12px 18px; margin-bottom: 20px;
2950
+ border-radius: var(--radius);
2951
+ background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
2952
+ border: 1px solid rgba(34,197,94,0.2);
2953
+ font-size: 14px; font-weight: 500;
2954
+ }
2955
+ .task-winner .trophy { font-size: 20px; }
2956
+ .task-winner .winner-name { color: var(--green); font-weight: 600; }
2957
+ .task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }
2958
+
2959
+ /* Summary cards */
2960
+ .summary-section { margin-top: 32px; }
2961
+ .summary-title {
2962
+ font-size: 16px; font-weight: 600;
2963
+ margin-bottom: 12px; color: var(--text);
2964
+ }
2965
+ .summary-cards {
2966
+ display: grid;
2967
+ grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
2968
+ gap: 12px;
2969
+ }
2970
+ .summary-card {
2971
+ padding: 16px; border-radius: var(--radius);
2972
+ border: 1px solid var(--border);
2973
+ background: var(--panel);
2974
+ }
2975
+ .summary-card .card-label {
2976
+ font-size: 11px; font-weight: 600;
2977
+ text-transform: uppercase; letter-spacing: 0.05em;
2978
+ color: var(--muted); margin-bottom: 6px;
2979
+ }
2980
+ .summary-card .card-value {
2981
+ font-size: 20px; font-weight: 700;
2982
+ color: var(--green); font-family: var(--mono);
2983
+ }
2984
+ .summary-card .card-provider {
2985
+ font-size: 12px; color: var(--muted); margin-top: 4px;
2986
+ }
2987
+
2988
+ /* Errors */
2989
+ .errors-section { margin-top: 24px; }
2990
+ .errors-title {
2991
+ font-size: 16px; font-weight: 600;
2992
+ margin-bottom: 8px; color: var(--red);
2993
+ cursor: pointer;
2994
+ }
2995
+ .errors-list {
2996
+ border-radius: var(--radius);
2997
+ border: 1px solid rgba(239,68,68,0.2);
2998
+ background: rgba(239,68,68,0.04);
2999
+ overflow: hidden;
3000
+ }
3001
+ .error-item {
3002
+ padding: 10px 16px;
3003
+ border-bottom: 1px solid rgba(239,68,68,0.1);
3004
+ font-size: 13px;
3005
+ }
3006
+ .error-item:last-child { border-bottom: none; }
3007
+ .error-provider { font-weight: 600; color: var(--text); }
3008
+ .error-msg { color: var(--muted); margin-left: 8px; }
3009
+ .error-count { color: var(--muted); font-size: 11px; }
3010
+ .error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }
3011
+
3012
+ /* Footer */
3013
+ .report-footer {
3014
+ margin-top: 40px; padding: 20px 0;
3015
+ border-top: 1px solid var(--border);
3016
+ display: flex; justify-content: space-between; align-items: center;
3017
+ flex-wrap: wrap; gap: 12px;
3018
+ }
3019
+ .footer-brand {
3020
+ font-size: 13px; color: var(--muted);
3021
+ }
3022
+ .footer-brand a {
3023
+ color: var(--accent); text-decoration: none; font-weight: 500;
3024
+ }
3025
+ .footer-brand a:hover { text-decoration: underline; }
3026
+ .footer-cta {
3027
+ display: inline-flex; align-items: center; gap: 6px;
3028
+ padding: 6px 14px; border-radius: 8px;
3029
+ background: var(--accent-soft);
3030
+ border: 1px solid rgba(245,158,11,0.3);
3031
+ color: var(--accent); font-size: 12px; font-weight: 500;
3032
+ text-decoration: none;
3033
+ transition: transform 120ms ease, box-shadow 120ms ease;
3034
+ }
3035
+ .footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }
3036
+
3037
+ /* Empty state */
3038
+ .empty-msg {
3039
+ text-align: center; color: var(--muted);
3040
+ padding: 60px 20px; font-size: 16px;
3041
+ }
3042
+
3043
+ /* Responsive */
3044
+ @media (max-width: 640px) {
3045
+ body { padding: 12px; }
3046
+ .report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
3047
+ .report-meta { text-align: left; }
3048
+ .summary-cards { grid-template-columns: 1fr; }
3049
+ .results-table { font-size: 12px; }
3050
+ .results-table th, .results-table td { padding: 8px 10px; }
3051
+ .report-footer { flex-direction: column; align-items: flex-start; }
3052
+ }
3053
+ </style>`;
3054
+ }
3055
/**
 * Renders the report header: a brand link plus a meta line with provider
 * count, task count, runs label and the current time.
 *
 * @param {string} runsLabel - Text describing the number of runs.
 * @param {number} providerCount
 * @param {number} taskCount
 * @returns {string} Header HTML.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  // "YYYY-MM-DD HH:MM:SS" taken from the ISO timestamp.
  const iso = new Date().toISOString();
  const now = `${iso.replace("T", " ").slice(0, 19)} UTC`;
  const providerWord = providerCount !== 1 ? "providers" : "provider";
  const taskWord = taskCount !== 1 ? "tasks" : "task";
  return `<header class="report-header">
<a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">
<div class="brand-icon">&#x2B21;</div>
<span>Agent Duelist</span>
</a>
<div class="report-meta">
${providerCount} ${providerWord} &middot;
${taskCount} ${taskWord} &middot;
${esc(runsLabel)}<br>
${esc(now)}
</div>
</header>`;
}
3070
/**
 * Renders the task tab bar, one button per task; the first is marked active.
 *
 * @param {string[]} tasks - Task names.
 * @returns {string} `<nav>` HTML containing the buttons.
 */
function renderTabs(tasks) {
  const buttons = [];
  tasks.forEach((t, i) => {
    const cls = i === 0 ? "task-tab active" : "task-tab";
    buttons.push(`<button class="${cls}" data-task="${i}">${esc(t)}</button>`);
  });
  const joined = buttons.join("\n ");
  return `<nav class="task-tabs">
${joined}
</nav>`;
}
3078
/**
 * Renders one task's section: heading, results table and optional winner
 * banner.
 *
 * @param {string} task - Task name shown as the heading.
 * @param {Array<object>} providerData - One aggregate per provider (reads
 *   `providerId` and `allErrors` here; the rest is used by renderDataCell).
 * @param {Map} columnStats - Per-column stats forwarded to renderDataCell.
 * @param {Map<string, string>} medals - Medal kind per provider id.
 * @param {string|undefined} winnerId - Provider to show in the winner banner.
 * @param {string[]} scorerNames - Scorers that define the table columns.
 * @param {boolean} _hasCost - Unused.
 * @param {boolean} multi - True when two or more providers are compared.
 * @param {number} index - Section index; section 0 starts active.
 * @returns {string} Section HTML.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  // "latency" gets one column, "cost" expands to Cost + Tokens; every
  // other scorer becomes a score column.
  const scorerCols = scorerNames.flatMap((name) => {
    switch (name) {
      case "latency":
        return [{ label: "Latency", key: "latency", isScore: false }];
      case "cost":
        return [
          { label: "Cost", key: "cost", isScore: false },
          { label: "Tokens", key: "tokens", isScore: false }
        ];
      default:
        return [{ label: scorerLabel(name), key: name, isScore: true }];
    }
  });
  const cols = [{ label: "Provider", key: "provider", isScore: false }, ...scorerCols];

  const ths = cols.map(
    (c) => `<th data-col="${esc(c.key)}">${esc(c.label)}<span class="sort-arrow"></span></th>`
  ).join("");

  const rowHtml = [];
  for (const pd of providerData) {
    const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
    const medalHtml = medal ? `${medal} ` : "";
    const cells = [`<td>${medalHtml}${esc(pd.providerId)}</td>`];
    if (pd.allErrors) {
      // A provider that only produced errors gets a dash in every data column.
      scorerCols.forEach(() => {
        cells.push(`<td class="rank-error">&mdash;</td>`);
      });
    } else {
      scorerCols.forEach((col) => {
        cells.push(renderDataCell(col.key, col.isScore, pd, columnStats, multi));
      });
    }
    rowHtml.push(`<tr>${cells.join("")}</tr>`);
  }
  const rows = rowHtml.join("\n");

  let winnerHtml = "";
  if (winnerId) {
    winnerHtml = `<div class="task-winner">
<span class="trophy">&#x1F3C6;</span>
<span>Winner: <span class="winner-name">${esc(winnerId)}</span>
<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>
</div>`;
  }

  const activeClass = index === 0 ? " active" : "";
  return `<section class="task-section${activeClass}" data-task-idx="${index}">
<h2 class="task-name">${esc(task)}</h2>
<table class="results-table">
<thead><tr>${ths}</tr></thead>
<tbody>${rows}</tbody>
</table>
${winnerHtml}
</section>`;
}
3125
/**
 * Renders one table cell for a provider row.
 *
 * "latency", "cost" and "tokens" are rendered as plain values; any other
 * key is treated as a 0..1 score and rendered as a percentage with a bar.
 * A missing value renders as a dash cell.
 *
 * @param {string} key - Column key.
 * @param {boolean} _isScore - Unused.
 * @param {object} pd - Provider aggregate (`latencyMs`, `avgDetails`, `avgScores`).
 * @param {Map} columnStats - Per-column stats, looked up by `key`.
 * @param {boolean} multi - When true and stats exist, colouring comes from
 *   rankClass_; otherwise plain cells are neutral and score cells use fixed
 *   thresholds (0.8 / 0.5).
 * @returns {string} `<td>` HTML.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const colStats = columnStats.get(key);
  const dashCell = `<td class="rank-error">&mdash;</td>`;
  const relativeRank = multi && colStats
    ? (value) => rankClass_(value, colStats)
    : null;
  const plainCell = (value, text) => {
    const cls = relativeRank ? relativeRank(value) : "rank-neutral";
    return `<td class="${cls}" data-sort-val="${value}">${text}</td>`;
  };

  switch (key) {
    case "latency": {
      const ms = pd.latencyMs;
      return ms === undefined ? dashCell : plainCell(ms, `${Math.round(ms)}ms`);
    }
    case "cost": {
      const cost = pd.avgDetails.costUsd;
      return cost === undefined ? dashCell : plainCell(cost, esc(formatCost(cost)));
    }
    case "tokens": {
      const tokens = pd.avgDetails.totalTokens;
      return tokens === undefined ? dashCell : plainCell(tokens, `${tokens}`);
    }
    default:
      break;
  }

  const val = pd.avgScores[key];
  if (val === undefined) return dashCell;
  const pct = Math.round(val * 100);
  const tier = val >= 0.8 ? "high" : val >= 0.5 ? "mid" : "low";
  const tierClass = { high: "rank-best", mid: "rank-mid", low: "rank-worst" };
  const tierColor = { high: "var(--green)", mid: "var(--yellow)", low: "var(--red)" };
  const rankCls = relativeRank ? relativeRank(val) : tierClass[tier];
  const barColor = tierColor[tier];
  return `<td class="score-cell ${rankCls}" data-sort-val="${val}">
<span class="score-val">${pct}%</span>
<div class="score-bar" style="width:${pct}%;background:${barColor}"></div>
</td>`;
}
3160
/**
 * Maps a value to a CSS rank class relative to its column's best and worst.
 *
 * @param {number} value
 * @param {{best?: number, worst?: number}} colStats
 * @returns {string} "rank-neutral" when best/worst are missing or equal,
 *   otherwise "rank-best", "rank-worst" or "rank-mid".
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  const undetermined = best === undefined || worst === undefined || best === worst;
  if (undetermined) return "rank-neutral";
  switch (value) {
    case best:
      return "rank-best";
    case worst:
      return "rank-worst";
    default:
      return "rank-mid";
  }
}
3167
/**
 * Renders the summary cards: best correctness, latency and cost, plus an
 * overall winner when one is given.
 *
 * With a single provider (`multi` false) the cards are labelled as averages
 * and the provider line is omitted.
 *
 * @param {{id: string, avg: number}|undefined} byCorrectness
 * @param {{id: string, avg: number}|undefined} byLatency - Skipped when avg is Infinity.
 * @param {{id: string, avg: number}|undefined} byCost
 * @param {string|undefined} overallWinner - Provider id of the overall winner.
 * @param {boolean} multi - True when two or more providers are compared.
 * @returns {string} Section HTML, or "" when there is nothing to show.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  const providerLine = (id) =>
    `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  const card = (label, value, provider) => `<div class="summary-card">
<div class="card-label">${label}</div>
<div class="card-value">${value}</div>
${provider}
</div>`;
  const leaderLine = (entry) => (multi ? providerLine(entry.id) : "");

  const cards = [];
  if (byCorrectness) {
    cards.push(card(
      multi ? "Most Correct" : "Avg Correctness",
      `${Math.round(byCorrectness.avg * 100)}%`,
      leaderLine(byCorrectness)
    ));
  }
  if (byLatency && byLatency.avg !== Infinity) {
    cards.push(card(
      multi ? "Fastest" : "Avg Latency",
      `${Math.round(byLatency.avg)}ms`,
      leaderLine(byLatency)
    ));
  }
  if (byCost?.avg !== void 0) {
    cards.push(card(
      multi ? "Cheapest" : "Avg Cost",
      esc(formatCost(byCost.avg)),
      leaderLine(byCost)
    ));
  }
  if (overallWinner) {
    cards.push(card("Overall Winner", "&#x1F3C6;", providerLine(overallWinner)));
  }
  if (cards.length === 0) return "";
  return `<section class="summary-section">
<h2 class="summary-title">Summary</h2>
<div class="summary-cards">
${cards.join("\n ")}
</div>
</section>`;
}
3211
/**
 * Renders the collapsible "Errors" section.
 *
 * Clicking the heading toggles the list between hidden and visible. (The
 * previous inline handler assigned 'block' in both branches, so the list
 * could never be collapsed.)
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   De-duplicated errors; `count` > 1 adds a "(×N)" suffix and a truthy
 *   `hint` adds a hint line.
 * @returns {string} Section HTML.
 */
function renderErrors(errors) {
  const items = errors.map((e) => {
    const suffix = e.count > 1 ? ` <span class="error-count">(&times;${e.count})</span>` : "";
    const hint = e.hint ? `<div class="error-hint">${esc(e.hint)}</div>` : "";
    return `<div class="error-item">
<span class="error-provider">${esc(e.providerId)}:</span>
<span class="error-msg">${esc(e.error)}</span>${suffix}
${hint}
</div>`;
  }).join("\n");
  // The list starts with an empty inline display value, so the first click
  // hides it and the next one shows it again.
  return `<section class="errors-section">
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
<div class="errors-list">
${items}
</div>
</section>`;
}
3228
/**
 * Renders the report footer: a "Powered by" credit and a "Star on GitHub"
 * link, both pointing at the project repository.
 *
 * @returns {string} Footer HTML.
 */
function renderFooter() {
  const repoUrl = "https://github.com/DataGobes/agent-duelist";
  const linkAttrs = `href="${repoUrl}" target="_blank" rel="noopener"`;
  return `<footer class="report-footer">
<div class="footer-brand">
Powered by <a ${linkAttrs}>Agent Duelist</a>
</div>
<a class="footer-cta" ${linkAttrs}>
&#x2B50; Star on GitHub
</a>
</footer>`;
}
3238
/**
 * Renders the inline `<script>` that powers the report's interactivity:
 * task-tab switching (only emitted for more than one task) and click-to-sort
 * table columns.
 *
 * Fixes relative to the previous version:
 * - The sort handler used the header's index across ALL tables in the
 *   document as the column index, so every table after the first read cells
 *   out of range and threw. It now uses the header's own `cellIndex`.
 * - `parseInt` is given an explicit radix.
 * - Rows without a `data-sort-val` in a numeric column sort after rows that
 *   have one, instead of being compared as text against them.
 *
 * @param {number} taskCount - Number of task sections in the report.
 * @returns {string} A `<script>…</script>` element.
 */
function renderScript(taskCount) {
  const tabScript = taskCount > 1 ? `
  var tabs = document.querySelectorAll('.task-tab');
  var sections = document.querySelectorAll('.task-section');
  tabs.forEach(function(tab) {
    tab.addEventListener('click', function() {
      var idx = parseInt(tab.getAttribute('data-task'), 10);
      if (!sections[idx]) return;
      tabs.forEach(function(t) { t.classList.remove('active'); });
      sections.forEach(function(s) { s.classList.remove('active'); });
      tab.classList.add('active');
      sections[idx].classList.add('active');
    });
  });` : "";
  return `<script>
(function() {
  /* Tab switching */
  ${tabScript}

  /* Column sorting */
  document.querySelectorAll('.results-table th').forEach(function(th) {
    var table = th.closest('table');
    var asc = true;
    th.addEventListener('click', function() {
      /* Column position inside this header's own row */
      var colIdx = th.cellIndex;
      var tbody = table.querySelector('tbody');
      var rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort(function(a, b) {
        var aCell = a.children[colIdx];
        var bCell = b.children[colIdx];
        if (!aCell || !bCell) return 0;
        var aVal = aCell.getAttribute('data-sort-val');
        var bVal = bCell.getAttribute('data-sort-val');
        if (aVal !== null && bVal !== null) {
          return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
        }
        /* Rows without a numeric value go last */
        if (aVal !== null) return -1;
        if (bVal !== null) return 1;
        var aText = aCell.textContent || '';
        var bText = bCell.textContent || '';
        return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
      });
      rows.forEach(function(row) { tbody.appendChild(row); });

      /* Update sort arrows */
      table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
      th.querySelector('.sort-arrow').textContent = asc ? ' \\u25B2' : ' \\u25BC';
      asc = !asc;
    });
  });
})();
</script>`;
}
3285
+
2568
3286
  // src/ci.ts
2569
3287
  var import_node_fs = require("fs");
2570
3288
  var import_node_path = require("path");
@@ -2586,10 +3304,11 @@ var T_CRITICAL_95 = {
2586
3304
  25: 2.06,
2587
3305
  30: 2.042
2588
3306
  };
3307
+ var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
2589
3308
  function tCritical(df) {
2590
3309
  if (df <= 0) return 1.96;
2591
3310
  if (T_CRITICAL_95[df] !== void 0) return T_CRITICAL_95[df];
2592
- const keys = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
3311
+ const keys = T_CRITICAL_KEYS;
2593
3312
  if (df > keys[keys.length - 1]) return 1.96;
2594
3313
  for (let i = 0; i < keys.length - 1; i++) {
2595
3314
  if (df > keys[i] && df < keys[i + 1]) {
@@ -2699,7 +3418,7 @@ function compareResults(baselineStats, currentStats, thresholds, budget, current
2699
3418
  if (regressions.length > 0) {
2700
3419
  for (const r of regressions) {
2701
3420
  failureReasons.push(
2702
- `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta2(r.delta)}`
3421
+ `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta(r.delta)}`
2703
3422
  );
2704
3423
  }
2705
3424
  }
@@ -2734,10 +3453,6 @@ function detectImprovement(baseline, current, threshold, lowerIsBetter) {
2734
3453
  }
2735
3454
  return current.ci95Lower - baseline.ci95Upper > threshold;
2736
3455
  }
2737
- function formatDelta2(delta) {
2738
- const sign = delta >= 0 ? "+" : "";
2739
- return `${sign}${delta.toFixed(4)}`;
2740
- }
2741
3456
  function loadBaseline(path) {
2742
3457
  try {
2743
3458
  const raw = (0, import_node_fs.readFileSync)(path, "utf-8");
@@ -2794,18 +3509,20 @@ function detectGitHubContext() {
2794
3509
  return { token, owner, repo, prNumber };
2795
3510
  }
2796
3511
  var API_BASE = "https://api.github.com";
3512
+ function ghHeaders(token, extra) {
3513
+ return {
3514
+ Authorization: `Bearer ${token}`,
3515
+ Accept: "application/vnd.github+json",
3516
+ "X-GitHub-Api-Version": "2022-11-28",
3517
+ ...extra
3518
+ };
3519
+ }
2797
3520
  async function findExistingComment(ctx, marker) {
2798
3521
  let page = 1;
2799
3522
  const perPage = 50;
2800
3523
  while (true) {
2801
3524
  const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
2802
- const res = await fetch(url, {
2803
- headers: {
2804
- Authorization: `Bearer ${ctx.token}`,
2805
- Accept: "application/vnd.github+json",
2806
- "X-GitHub-Api-Version": "2022-11-28"
2807
- }
2808
- });
3525
+ const res = await fetch(url, { headers: ghHeaders(ctx.token) });
2809
3526
  if (!res.ok) return null;
2810
3527
  const comments = await res.json();
2811
3528
  if (comments.length === 0) break;
@@ -2825,12 +3542,7 @@ async function upsertPrComment(ctx, body, marker) {
2825
3542
  const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/comments/${existingId}`;
2826
3543
  const res = await fetch(url, {
2827
3544
  method: "PATCH",
2828
- headers: {
2829
- Authorization: `Bearer ${ctx.token}`,
2830
- Accept: "application/vnd.github+json",
2831
- "Content-Type": "application/json",
2832
- "X-GitHub-Api-Version": "2022-11-28"
2833
- },
3545
+ headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
2834
3546
  body: JSON.stringify({ body })
2835
3547
  });
2836
3548
  if (!res.ok) {
@@ -2841,12 +3553,7 @@ async function upsertPrComment(ctx, body, marker) {
2841
3553
  const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;
2842
3554
  const res = await fetch(url, {
2843
3555
  method: "POST",
2844
- headers: {
2845
- Authorization: `Bearer ${ctx.token}`,
2846
- Accept: "application/vnd.github+json",
2847
- "Content-Type": "application/json",
2848
- "X-GitHub-Api-Version": "2022-11-28"
2849
- },
3556
+ headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
2850
3557
  body: JSON.stringify({ body })
2851
3558
  });
2852
3559
  if (!res.ok) {
@@ -2865,6 +3572,7 @@ async function upsertPrComment(ctx, body, marker) {
2865
3572
  defineArena,
2866
3573
  detectGitHubContext,
2867
3574
  gemini,
3575
+ htmlReporter,
2868
3576
  jsonReporter,
2869
3577
  loadBaseline,
2870
3578
  markdownReporter,