agent-duelist 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -3
- package/dist/cli.js +2754 -2102
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1054 -346
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -7
- package/dist/index.d.ts +8 -7
- package/dist/index.js +1053 -346
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1410,11 +1410,13 @@ var fuzzySimilarityScorer = ({ task, result }) => {
|
|
|
1410
1410
|
}
|
|
1411
1411
|
const a = stringify(task.expected);
|
|
1412
1412
|
const b = stringify(result.output);
|
|
1413
|
-
const
|
|
1413
|
+
const setA = tokenize(a);
|
|
1414
|
+
const setB = tokenize(b);
|
|
1415
|
+
const similarity = jaccardSimilarity(setA, setB);
|
|
1414
1416
|
return {
|
|
1415
1417
|
name: "fuzzy-similarity",
|
|
1416
1418
|
value: Math.round(similarity * 100) / 100,
|
|
1417
|
-
details: { method: "jaccard", expectedTokens:
|
|
1419
|
+
details: { method: "jaccard", expectedTokens: setA.size, actualTokens: setB.size }
|
|
1418
1420
|
};
|
|
1419
1421
|
};
|
|
1420
1422
|
function stringify(value) {
|
|
@@ -1440,6 +1442,19 @@ import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
|
|
|
1440
1442
|
// src/providers/openai.ts
|
|
1441
1443
|
import OpenAI, { AzureOpenAI } from "openai";
|
|
1442
1444
|
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
1445
|
+
|
|
1446
|
+
// src/providers/shared.ts
|
|
1447
|
+
var SCHEMA_SYSTEM_MESSAGE = "Respond with valid JSON matching the requested schema.";
|
|
1448
|
+
function parseSchemaOutput(rawContent, hasSchema) {
|
|
1449
|
+
if (!hasSchema) return rawContent;
|
|
1450
|
+
try {
|
|
1451
|
+
return JSON.parse(rawContent);
|
|
1452
|
+
} catch {
|
|
1453
|
+
return rawContent;
|
|
1454
|
+
}
|
|
1455
|
+
}
|
|
1456
|
+
|
|
1457
|
+
// src/providers/openai.ts
|
|
1443
1458
|
var REQUEST_TIMEOUT_MS = 6e4;
|
|
1444
1459
|
function openai(model, options) {
|
|
1445
1460
|
const client = new OpenAI({
|
|
@@ -1486,7 +1501,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1486
1501
|
if (input.schema) {
|
|
1487
1502
|
params.response_format = { type: "json_object" };
|
|
1488
1503
|
params.messages = [
|
|
1489
|
-
{ role: "system", content:
|
|
1504
|
+
{ role: "system", content: SCHEMA_SYSTEM_MESSAGE },
|
|
1490
1505
|
...params.messages
|
|
1491
1506
|
];
|
|
1492
1507
|
}
|
|
@@ -1539,13 +1554,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1539
1554
|
if (stripThinking) {
|
|
1540
1555
|
rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
|
|
1541
1556
|
}
|
|
1542
|
-
|
|
1543
|
-
if (input.schema) {
|
|
1544
|
-
try {
|
|
1545
|
-
output = JSON.parse(rawContent);
|
|
1546
|
-
} catch {
|
|
1547
|
-
}
|
|
1548
|
-
}
|
|
1557
|
+
const output = parseSchemaOutput(rawContent, !!input.schema);
|
|
1549
1558
|
return {
|
|
1550
1559
|
output,
|
|
1551
1560
|
usage: {
|
|
@@ -1559,6 +1568,20 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1559
1568
|
}
|
|
1560
1569
|
};
|
|
1561
1570
|
}
|
|
1571
|
+
function gemini(model, options) {
|
|
1572
|
+
const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
|
|
1573
|
+
if (!apiKey) {
|
|
1574
|
+
throw new Error(
|
|
1575
|
+
`Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
|
|
1576
|
+
);
|
|
1577
|
+
}
|
|
1578
|
+
const client = new OpenAI({
|
|
1579
|
+
apiKey,
|
|
1580
|
+
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
|
|
1581
|
+
timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
|
|
1582
|
+
});
|
|
1583
|
+
return makeProvider(`google/${model}`, "Google AI", model, client, model);
|
|
1584
|
+
}
|
|
1562
1585
|
function toolDefToOpenAI(tool) {
|
|
1563
1586
|
return {
|
|
1564
1587
|
type: "function",
|
|
@@ -1636,8 +1659,7 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1636
1659
|
const response = await client.chat.completions.create({
|
|
1637
1660
|
model,
|
|
1638
1661
|
messages: [{ role: "user", content: prompt }],
|
|
1639
|
-
|
|
1640
|
-
max_tokens: 2048
|
|
1662
|
+
max_completion_tokens: 2048
|
|
1641
1663
|
});
|
|
1642
1664
|
const content = response.choices[0]?.message?.content?.trim() ?? "";
|
|
1643
1665
|
const parsed = {};
|
|
@@ -1733,118 +1755,173 @@ async function runBenchmarks(options) {
|
|
|
1733
1755
|
const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
|
|
1734
1756
|
const results = [];
|
|
1735
1757
|
for (const task of tasks) {
|
|
1736
|
-
for (
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1758
|
+
for (let run = 1; run <= runs; run++) {
|
|
1759
|
+
const runResults = await Promise.all(
|
|
1760
|
+
providers.map(async (provider) => {
|
|
1761
|
+
let result;
|
|
1762
|
+
try {
|
|
1763
|
+
const taskResult = await withTimeout((signal) => provider.run({
|
|
1764
|
+
prompt: task.prompt,
|
|
1765
|
+
schema: task.schema,
|
|
1766
|
+
tools: task.tools,
|
|
1767
|
+
signal
|
|
1768
|
+
}), timeout);
|
|
1769
|
+
const scores = await Promise.all(
|
|
1770
|
+
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
1771
|
+
);
|
|
1772
|
+
result = {
|
|
1773
|
+
providerId: provider.id,
|
|
1774
|
+
taskName: task.name,
|
|
1775
|
+
run,
|
|
1776
|
+
scores,
|
|
1777
|
+
raw: {
|
|
1778
|
+
output: taskResult.output,
|
|
1779
|
+
latencyMs: taskResult.latencyMs,
|
|
1780
|
+
usage: taskResult.usage,
|
|
1781
|
+
toolCalls: taskResult.toolCalls
|
|
1782
|
+
}
|
|
1783
|
+
};
|
|
1784
|
+
} catch (err) {
|
|
1785
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1786
|
+
result = {
|
|
1787
|
+
providerId: provider.id,
|
|
1788
|
+
taskName: task.name,
|
|
1789
|
+
run,
|
|
1790
|
+
scores: [],
|
|
1791
|
+
error: message,
|
|
1792
|
+
raw: { output: "", latencyMs: 0 }
|
|
1793
|
+
};
|
|
1794
|
+
}
|
|
1795
|
+
onResult?.(result);
|
|
1796
|
+
return result;
|
|
1797
|
+
})
|
|
1798
|
+
);
|
|
1799
|
+
results.push(...runResults);
|
|
1775
1800
|
}
|
|
1776
1801
|
}
|
|
1777
1802
|
return results;
|
|
1778
1803
|
}
|
|
1779
1804
|
|
|
1780
|
-
// src/
|
|
1781
|
-
var
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
1786
|
-
|
|
1787
|
-
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
return `${boldCode}${s}${reset}`;
|
|
1792
|
-
}
|
|
1793
|
-
function dim(s) {
|
|
1794
|
-
return `${dimCode}${s}${reset}`;
|
|
1805
|
+
// src/utils/format.ts
|
|
1806
|
+
var MAX_FRACTION_DIGITS = 100;
|
|
1807
|
+
function formatCost(usd) {
|
|
1808
|
+
if (usd === void 0) return "\u2014";
|
|
1809
|
+
if (usd === 0) return "$0.00";
|
|
1810
|
+
if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
|
|
1811
|
+
const digits = Math.min(
|
|
1812
|
+
MAX_FRACTION_DIGITS,
|
|
1813
|
+
Math.max(4, -Math.floor(Math.log10(Math.abs(usd))) + 1)
|
|
1814
|
+
);
|
|
1815
|
+
return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
|
|
1795
1816
|
}
|
|
1796
|
-
function
|
|
1797
|
-
|
|
1817
|
+
function formatDelta(delta, precision = 4) {
|
|
1818
|
+
const sign = delta >= 0 ? "+" : "";
|
|
1819
|
+
return `${sign}${delta.toFixed(precision)}`;
|
|
1798
1820
|
}
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1821
|
+
|
|
1822
|
+
// src/reporter/shared.ts
|
|
1823
|
+
function groupResults(results) {
|
|
1824
|
+
const taskSet = /* @__PURE__ */ new Set();
|
|
1825
|
+
const providerSet = /* @__PURE__ */ new Set();
|
|
1826
|
+
const scorerSet = /* @__PURE__ */ new Set();
|
|
1827
|
+
const grouped = /* @__PURE__ */ new Map();
|
|
1828
|
+
const byProvider = /* @__PURE__ */ new Map();
|
|
1829
|
+
let hasErrors = false;
|
|
1830
|
+
let maxRun = 0;
|
|
1831
|
+
for (const r of results) {
|
|
1832
|
+
taskSet.add(r.taskName);
|
|
1833
|
+
providerSet.add(r.providerId);
|
|
1834
|
+
for (const s of r.scores) scorerSet.add(s.name);
|
|
1835
|
+
if (r.error) hasErrors = true;
|
|
1836
|
+
if (r.run > maxRun) maxRun = r.run;
|
|
1837
|
+
const key = `${r.taskName}::${r.providerId}`;
|
|
1838
|
+
let group = grouped.get(key);
|
|
1839
|
+
if (!group) {
|
|
1840
|
+
group = [];
|
|
1841
|
+
grouped.set(key, group);
|
|
1842
|
+
}
|
|
1843
|
+
group.push(r);
|
|
1844
|
+
let provGroup = byProvider.get(r.providerId);
|
|
1845
|
+
if (!provGroup) {
|
|
1846
|
+
provGroup = [];
|
|
1847
|
+
byProvider.set(r.providerId, provGroup);
|
|
1848
|
+
}
|
|
1849
|
+
provGroup.push(r);
|
|
1807
1850
|
}
|
|
1808
|
-
return
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
const clamped = Math.max(0, Math.min(1, ratio));
|
|
1818
|
-
const fillLen = Math.round(clamped * width);
|
|
1819
|
-
const fill = "\u2593".repeat(fillLen);
|
|
1820
|
-
const track = "\u2591".repeat(width - fillLen);
|
|
1821
|
-
return { fill, track };
|
|
1851
|
+
return {
|
|
1852
|
+
tasks: [...taskSet],
|
|
1853
|
+
providers: [...providerSet],
|
|
1854
|
+
scorerNames: [...scorerSet],
|
|
1855
|
+
grouped,
|
|
1856
|
+
byProvider,
|
|
1857
|
+
hasErrors,
|
|
1858
|
+
maxRun
|
|
1859
|
+
};
|
|
1822
1860
|
}
|
|
1823
|
-
function
|
|
1824
|
-
const
|
|
1825
|
-
|
|
1826
|
-
|
|
1861
|
+
function aggregateProviderTask(providerId, grouped, task) {
|
|
1862
|
+
const taskResults = grouped.get(`${task}::${providerId}`) ?? [];
|
|
1863
|
+
const errorResults = taskResults.filter((r) => r.error);
|
|
1864
|
+
const successResults = taskResults.filter((r) => !r.error);
|
|
1865
|
+
if (successResults.length === 0) {
|
|
1866
|
+
return {
|
|
1867
|
+
providerId,
|
|
1868
|
+
avgScores: {},
|
|
1869
|
+
avgDetails: { costUsd: void 0, totalTokens: void 0 },
|
|
1870
|
+
latencyMs: void 0,
|
|
1871
|
+
allErrors: errorResults.length > 0,
|
|
1872
|
+
errorCount: errorResults.length
|
|
1873
|
+
};
|
|
1827
1874
|
}
|
|
1828
|
-
|
|
1829
|
-
|
|
1875
|
+
return {
|
|
1876
|
+
providerId,
|
|
1877
|
+
avgScores: averageScores(successResults),
|
|
1878
|
+
avgDetails: averageDetails(successResults),
|
|
1879
|
+
latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
|
|
1880
|
+
allErrors: false,
|
|
1881
|
+
errorCount: errorResults.length
|
|
1882
|
+
};
|
|
1883
|
+
}
|
|
1884
|
+
function averageScores(results) {
|
|
1885
|
+
const sums = {};
|
|
1886
|
+
const counts = {};
|
|
1887
|
+
for (const result of results) {
|
|
1888
|
+
for (const score of result.scores) {
|
|
1889
|
+
if (score.value < 0) continue;
|
|
1890
|
+
sums[score.name] = (sums[score.name] ?? 0) + score.value;
|
|
1891
|
+
counts[score.name] = (counts[score.name] ?? 0) + 1;
|
|
1892
|
+
}
|
|
1830
1893
|
}
|
|
1831
|
-
const
|
|
1832
|
-
|
|
1833
|
-
|
|
1894
|
+
const avgs = {};
|
|
1895
|
+
for (const name of Object.keys(sums)) {
|
|
1896
|
+
avgs[name] = sums[name] / counts[name];
|
|
1834
1897
|
}
|
|
1835
|
-
return
|
|
1898
|
+
return avgs;
|
|
1836
1899
|
}
|
|
1837
|
-
function
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1900
|
+
function averageDetails(results) {
|
|
1901
|
+
let costSum = 0;
|
|
1902
|
+
let costCount = 0;
|
|
1903
|
+
let tokenSum = 0;
|
|
1904
|
+
let tokenCount = 0;
|
|
1905
|
+
for (const result of results) {
|
|
1906
|
+
const costScore = result.scores.find((s) => s.name === "cost");
|
|
1907
|
+
const details = costScore?.details;
|
|
1908
|
+
if (details?.estimatedUsd != null) {
|
|
1909
|
+
costSum += details.estimatedUsd;
|
|
1910
|
+
costCount++;
|
|
1911
|
+
}
|
|
1912
|
+
if (details?.totalTokens != null) {
|
|
1913
|
+
tokenSum += details.totalTokens;
|
|
1914
|
+
tokenCount++;
|
|
1915
|
+
}
|
|
1916
|
+
}
|
|
1917
|
+
return {
|
|
1918
|
+
costUsd: costCount > 0 ? costSum / costCount : void 0,
|
|
1919
|
+
totalTokens: tokenCount > 0 ? Math.round(tokenSum / tokenCount) : void 0
|
|
1920
|
+
};
|
|
1842
1921
|
}
|
|
1843
|
-
function
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
const padding = Math.max(0, totalInner - dw - 1);
|
|
1847
|
-
return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
|
|
1922
|
+
function average(nums) {
|
|
1923
|
+
if (nums.length === 0) return void 0;
|
|
1924
|
+
return nums.reduce((a, b) => a + b, 0) / nums.length;
|
|
1848
1925
|
}
|
|
1849
1926
|
function computeColumnStats(providerData, scorerNames) {
|
|
1850
1927
|
const stats = /* @__PURE__ */ new Map();
|
|
@@ -1896,62 +1973,235 @@ function computeColumnStats(providerData, scorerNames) {
|
|
|
1896
1973
|
}
|
|
1897
1974
|
return stats;
|
|
1898
1975
|
}
|
|
1899
|
-
function colorByRank(text, value, colStats, providerCount) {
|
|
1900
|
-
if (value === void 0) return dim("\u2014");
|
|
1901
|
-
if (providerCount < 2) return text;
|
|
1902
|
-
if (colStats.best === void 0 || colStats.worst === void 0) return text;
|
|
1903
|
-
if (colStats.best === colStats.worst) return text;
|
|
1904
|
-
if (value === colStats.best) return `${brightGreen}${boldCode}${text}${reset}`;
|
|
1905
|
-
if (value === colStats.worst) return `${red}${text}${reset}`;
|
|
1906
|
-
return `${yellow}${text}${reset}`;
|
|
1907
|
-
}
|
|
1908
1976
|
function computeMedals(columnStats, providerIds) {
|
|
1909
1977
|
const medals = /* @__PURE__ */ new Map();
|
|
1910
1978
|
if (providerIds.length < 2) {
|
|
1911
|
-
for (const id of providerIds) medals.set(id, "");
|
|
1979
|
+
for (const id of providerIds) medals.set(id, "none");
|
|
1912
1980
|
return medals;
|
|
1913
1981
|
}
|
|
1914
1982
|
const wins = /* @__PURE__ */ new Map();
|
|
1915
1983
|
for (const id of providerIds) wins.set(id, 0);
|
|
1916
1984
|
for (const [, colStats] of columnStats) {
|
|
1917
1985
|
if (colStats.best === void 0) continue;
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
|
|
1921
|
-
}
|
|
1986
|
+
const bestProviders = [...colStats.values.entries()].filter(([, v]) => v !== void 0 && v === colStats.best);
|
|
1987
|
+
if (bestProviders.length === 1) {
|
|
1988
|
+
wins.set(bestProviders[0][0], (wins.get(bestProviders[0][0]) ?? 0) + 1);
|
|
1922
1989
|
}
|
|
1923
1990
|
}
|
|
1924
1991
|
const totalWins = [...wins.values()].reduce((a, b) => a + b, 0);
|
|
1925
1992
|
if (totalWins === 0) {
|
|
1926
|
-
for (const id of providerIds) medals.set(id, "");
|
|
1993
|
+
for (const id of providerIds) medals.set(id, "none");
|
|
1927
1994
|
return medals;
|
|
1928
1995
|
}
|
|
1929
1996
|
const sorted = [...wins.entries()].sort(
|
|
1930
1997
|
(a, b) => b[1] - a[1] || a[0].localeCompare(b[0])
|
|
1931
1998
|
);
|
|
1932
|
-
const medalList = ["
|
|
1999
|
+
const medalList = ["gold", "silver", "bronze"];
|
|
1933
2000
|
let rank = 0;
|
|
1934
2001
|
for (let i = 0; i < sorted.length; i++) {
|
|
1935
2002
|
if (i > 0 && sorted[i][1] < sorted[i - 1][1]) {
|
|
1936
2003
|
rank = i;
|
|
1937
2004
|
}
|
|
1938
|
-
|
|
2005
|
+
const hasWins = sorted[i][1] > 0;
|
|
2006
|
+
medals.set(sorted[i][0], hasWins && rank < medalList.length ? medalList[rank] : "none");
|
|
1939
2007
|
}
|
|
1940
2008
|
return medals;
|
|
1941
2009
|
}
|
|
2010
|
+
function providerLabel(providerId) {
|
|
2011
|
+
const prefix = providerId.split("/")[0];
|
|
2012
|
+
switch (prefix) {
|
|
2013
|
+
case "azure":
|
|
2014
|
+
return "(OpenAI via Azure)";
|
|
2015
|
+
case "openai":
|
|
2016
|
+
return "(OpenAI)";
|
|
2017
|
+
case "anthropic":
|
|
2018
|
+
return "(Anthropic)";
|
|
2019
|
+
case "google":
|
|
2020
|
+
return "(Google)";
|
|
2021
|
+
case "mistral":
|
|
2022
|
+
return "(Mistral)";
|
|
2023
|
+
case "meta":
|
|
2024
|
+
return "(Meta)";
|
|
2025
|
+
case "deepseek":
|
|
2026
|
+
return "(DeepSeek)";
|
|
2027
|
+
case "cohere":
|
|
2028
|
+
return "(Cohere)";
|
|
2029
|
+
case "qwen":
|
|
2030
|
+
return "(Qwen)";
|
|
2031
|
+
case "xai":
|
|
2032
|
+
return "(xAI)";
|
|
2033
|
+
case "minimax":
|
|
2034
|
+
return "(MiniMax)";
|
|
2035
|
+
case "moonshot":
|
|
2036
|
+
return "(Moonshot / Kimi)";
|
|
2037
|
+
case "perplexity":
|
|
2038
|
+
return "(Perplexity)";
|
|
2039
|
+
case "amazon":
|
|
2040
|
+
return "(Amazon)";
|
|
2041
|
+
case "nvidia":
|
|
2042
|
+
return "(NVIDIA)";
|
|
2043
|
+
case "microsoft":
|
|
2044
|
+
return "(Microsoft)";
|
|
2045
|
+
case "ai21":
|
|
2046
|
+
return "(AI21 Labs)";
|
|
2047
|
+
case "bytedance":
|
|
2048
|
+
return "(ByteDance)";
|
|
2049
|
+
case "together":
|
|
2050
|
+
return "(Together AI)";
|
|
2051
|
+
case "fireworks":
|
|
2052
|
+
return "(Fireworks AI)";
|
|
2053
|
+
case "groq":
|
|
2054
|
+
return "(Groq)";
|
|
2055
|
+
case "cerebras":
|
|
2056
|
+
return "(Cerebras)";
|
|
2057
|
+
default:
|
|
2058
|
+
return `(${prefix})`;
|
|
2059
|
+
}
|
|
2060
|
+
}
|
|
2061
|
+
function apiKeyHint(providerId, error) {
|
|
2062
|
+
const lower = error.toLowerCase();
|
|
2063
|
+
const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
|
|
2064
|
+
if (!isAuthError) return void 0;
|
|
2065
|
+
const prefix = providerId.split("/")[0];
|
|
2066
|
+
switch (prefix) {
|
|
2067
|
+
case "openai":
|
|
2068
|
+
return "Set: export OPENAI_API_KEY=sk-...";
|
|
2069
|
+
case "azure":
|
|
2070
|
+
return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
|
|
2071
|
+
case "anthropic":
|
|
2072
|
+
return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
|
|
2073
|
+
case "google":
|
|
2074
|
+
return "Set: export GOOGLE_API_KEY=...";
|
|
2075
|
+
default:
|
|
2076
|
+
return `Check the API key for ${providerId}`;
|
|
2077
|
+
}
|
|
2078
|
+
}
|
|
2079
|
+
function rankProviders(successByProvider, providers, scorerName) {
|
|
2080
|
+
const ranked = providers.map((id) => {
|
|
2081
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2082
|
+
const scores = runs.flatMap((r) => r.scores.filter((s) => s.name === scorerName && s.value >= 0)).map((s) => s.value);
|
|
2083
|
+
const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
|
|
2084
|
+
return { id, avg };
|
|
2085
|
+
}).filter((p) => p.avg !== void 0).sort((a, b) => b.avg - a.avg);
|
|
2086
|
+
return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
|
|
2087
|
+
}
|
|
2088
|
+
function scorerLabel(name) {
|
|
2089
|
+
switch (name) {
|
|
2090
|
+
case "correctness":
|
|
2091
|
+
return "Match";
|
|
2092
|
+
case "schema-correctness":
|
|
2093
|
+
return "Schema";
|
|
2094
|
+
case "fuzzy-similarity":
|
|
2095
|
+
return "Fuzzy";
|
|
2096
|
+
case "llm-judge-correctness":
|
|
2097
|
+
return "Judge";
|
|
2098
|
+
case "tool-usage":
|
|
2099
|
+
return "Tool";
|
|
2100
|
+
default:
|
|
2101
|
+
return name;
|
|
2102
|
+
}
|
|
2103
|
+
}
|
|
2104
|
+
function medalEmoji(medal) {
|
|
2105
|
+
switch (medal) {
|
|
2106
|
+
case "gold":
|
|
2107
|
+
return "\u{1F947}";
|
|
2108
|
+
case "silver":
|
|
2109
|
+
return "\u{1F948}";
|
|
2110
|
+
case "bronze":
|
|
2111
|
+
return "\u{1F949}";
|
|
2112
|
+
case "none":
|
|
2113
|
+
return "";
|
|
2114
|
+
}
|
|
2115
|
+
}
|
|
2116
|
+
|
|
2117
|
+
// src/reporter/console.ts
|
|
2118
|
+
var reset = "\x1B[0m";
|
|
2119
|
+
var boldCode = "\x1B[1m";
|
|
2120
|
+
var dimCode = "\x1B[2m";
|
|
2121
|
+
var green = "\x1B[32m";
|
|
2122
|
+
var red = "\x1B[31m";
|
|
2123
|
+
var yellow = "\x1B[33m";
|
|
2124
|
+
var cyan = "\x1B[36m";
|
|
2125
|
+
var brightGreen = "\x1B[92m";
|
|
2126
|
+
var brightWhite = "\x1B[97m";
|
|
2127
|
+
function bold(s) {
|
|
2128
|
+
return `${boldCode}${s}${reset}`;
|
|
2129
|
+
}
|
|
2130
|
+
function dim(s) {
|
|
2131
|
+
return `${dimCode}${s}${reset}`;
|
|
2132
|
+
}
|
|
2133
|
+
function stripAnsi(s) {
|
|
2134
|
+
return s.replace(/\x1b\[[0-9;]*m/g, "");
|
|
2135
|
+
}
|
|
2136
|
+
function displayWidth(s) {
|
|
2137
|
+
const stripped = stripAnsi(s);
|
|
2138
|
+
let width = 0;
|
|
2139
|
+
for (const ch of stripped) {
|
|
2140
|
+
const code = ch.codePointAt(0) ?? 0;
|
|
2141
|
+
if (code >= 126976) width += 2;
|
|
2142
|
+
else if (code >= 9728 && code <= 10175) width += 2;
|
|
2143
|
+
else width += 1;
|
|
2144
|
+
}
|
|
2145
|
+
return width;
|
|
2146
|
+
}
|
|
2147
|
+
function padCell(str, targetWidth, align) {
|
|
2148
|
+
const dw = displayWidth(str);
|
|
2149
|
+
const padding = Math.max(0, targetWidth - dw);
|
|
2150
|
+
if (align === "right") return " ".repeat(padding) + str;
|
|
2151
|
+
return str + " ".repeat(padding);
|
|
2152
|
+
}
|
|
2153
|
+
function sparkBar(ratio, width = 8) {
|
|
2154
|
+
const clamped = Math.max(0, Math.min(1, ratio));
|
|
2155
|
+
const fillLen = Math.round(clamped * width);
|
|
2156
|
+
const fill = "\u2593".repeat(fillLen);
|
|
2157
|
+
const track = "\u2591".repeat(width - fillLen);
|
|
2158
|
+
return { fill, track };
|
|
2159
|
+
}
|
|
2160
|
+
function drawTableLine(widths, position) {
|
|
2161
|
+
const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
|
|
2162
|
+
if (position === "bottom") {
|
|
2163
|
+
return dim(`\u2514${"\u2500".repeat(totalInner)}\u2518`);
|
|
2164
|
+
}
|
|
2165
|
+
if (position === "merge") {
|
|
2166
|
+
return dim(`\u251C${"\u2500".repeat(totalInner)}\u2524`);
|
|
2167
|
+
}
|
|
2168
|
+
const segments = widths.map((w) => "\u2500".repeat(w + 2));
|
|
2169
|
+
if (position === "top") {
|
|
2170
|
+
return dim(`\u250C${segments.join("\u252C")}\u2510`);
|
|
2171
|
+
}
|
|
2172
|
+
return dim(`\u251C${segments.join("\u253C")}\u2524`);
|
|
2173
|
+
}
|
|
2174
|
+
function drawTableRow(cells, widths, aligns) {
|
|
2175
|
+
const parts = cells.map(
|
|
2176
|
+
(cell, i) => " " + padCell(cell, widths[i], aligns[i]) + " "
|
|
2177
|
+
);
|
|
2178
|
+
return dim("\u2502") + parts.join(dim("\u2502")) + dim("\u2502");
|
|
2179
|
+
}
|
|
2180
|
+
function drawSpanRow(content, widths) {
|
|
2181
|
+
const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
|
|
2182
|
+
const dw = displayWidth(content);
|
|
2183
|
+
const padding = Math.max(0, totalInner - dw - 1);
|
|
2184
|
+
return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
|
|
2185
|
+
}
|
|
2186
|
+
function colorByRank(text, value, colStats, providerCount) {
|
|
2187
|
+
if (value === void 0) return dim("\u2014");
|
|
2188
|
+
if (providerCount < 2) return text;
|
|
2189
|
+
if (colStats.best === void 0 || colStats.worst === void 0) return text;
|
|
2190
|
+
if (colStats.best === colStats.worst) return text;
|
|
2191
|
+
if (value === colStats.best) return `${brightGreen}${boldCode}${text}${reset}`;
|
|
2192
|
+
if (value === colStats.worst) return `${red}${text}${reset}`;
|
|
2193
|
+
return `${yellow}${text}${reset}`;
|
|
2194
|
+
}
|
|
1942
2195
|
function consoleReporter(results, options) {
|
|
1943
2196
|
const showSparklines = options?.sparklines ?? true;
|
|
1944
2197
|
if (results.length === 0) {
|
|
1945
2198
|
console.log("\nNo results to display.\n");
|
|
1946
2199
|
return;
|
|
1947
2200
|
}
|
|
1948
|
-
const tasks =
|
|
1949
|
-
const providers = [...new Set(results.map((r) => r.providerId))];
|
|
1950
|
-
const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
|
|
2201
|
+
const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
|
|
1951
2202
|
const hasCost = scorerNames.includes("cost");
|
|
1952
|
-
const hasErrors = results.some((r) => r.error);
|
|
1953
2203
|
const multi = providers.length >= 2;
|
|
1954
|
-
const runsPerCell =
|
|
2204
|
+
const runsPerCell = maxRun;
|
|
1955
2205
|
const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
|
|
1956
2206
|
console.log("");
|
|
1957
2207
|
console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
|
|
@@ -1960,29 +2210,9 @@ function consoleReporter(results, options) {
|
|
|
1960
2210
|
for (const task of tasks) {
|
|
1961
2211
|
console.log(` ${bold(`Task: ${task}`)}`);
|
|
1962
2212
|
console.log("");
|
|
1963
|
-
const providerData = providers.map(
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
const successResults = taskResults.filter((r) => !r.error);
|
|
1967
|
-
if (successResults.length === 0) {
|
|
1968
|
-
return {
|
|
1969
|
-
providerId,
|
|
1970
|
-
avgScores: {},
|
|
1971
|
-
avgDetails: { costUsd: void 0, totalTokens: void 0 },
|
|
1972
|
-
latencyMs: void 0,
|
|
1973
|
-
allErrors: errorResults2.length > 0,
|
|
1974
|
-
errorCount: errorResults2.length
|
|
1975
|
-
};
|
|
1976
|
-
}
|
|
1977
|
-
return {
|
|
1978
|
-
providerId,
|
|
1979
|
-
avgScores: averageScores(successResults),
|
|
1980
|
-
avgDetails: averageDetails(successResults),
|
|
1981
|
-
latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
|
|
1982
|
-
allErrors: false,
|
|
1983
|
-
errorCount: errorResults2.length
|
|
1984
|
-
};
|
|
1985
|
-
});
|
|
2213
|
+
const providerData = providers.map(
|
|
2214
|
+
(providerId) => aggregateProviderTask(providerId, grouped, task)
|
|
2215
|
+
);
|
|
1986
2216
|
const columnStats = computeColumnStats(providerData, scorerNames);
|
|
1987
2217
|
const medals = computeMedals(columnStats, providers);
|
|
1988
2218
|
const maxProviderLen = Math.max(...providers.map((id) => id.length));
|
|
@@ -1997,8 +2227,7 @@ function consoleReporter(results, options) {
|
|
|
1997
2227
|
cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
|
|
1998
2228
|
cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
|
|
1999
2229
|
} else {
|
|
2000
|
-
|
|
2001
|
-
cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
2230
|
+
cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
2002
2231
|
}
|
|
2003
2232
|
}
|
|
2004
2233
|
if (hasErrors) {
|
|
@@ -2011,7 +2240,7 @@ function consoleReporter(results, options) {
|
|
|
2011
2240
|
console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
|
|
2012
2241
|
console.log(` ${drawTableLine(widths, "header")}`);
|
|
2013
2242
|
for (const pd of providerData) {
|
|
2014
|
-
const medal = medals.get(pd.providerId) ?? "";
|
|
2243
|
+
const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
|
|
2015
2244
|
const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
|
|
2016
2245
|
const cells = [providerCell];
|
|
2017
2246
|
if (pd.allErrors) {
|
|
@@ -2084,7 +2313,7 @@ function consoleReporter(results, options) {
|
|
|
2084
2313
|
console.log(` ${drawTableRow(cells, widths, aligns)}`);
|
|
2085
2314
|
}
|
|
2086
2315
|
if (multi && providerData.some((p) => !p.allErrors)) {
|
|
2087
|
-
const winnerId = [...medals.entries()].find(([, m]) => m === "
|
|
2316
|
+
const winnerId = [...medals.entries()].find(([, m]) => m === "gold")?.[0];
|
|
2088
2317
|
if (winnerId) {
|
|
2089
2318
|
console.log(` ${drawTableLine(widths, "merge")}`);
|
|
2090
2319
|
const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
|
|
@@ -2094,7 +2323,7 @@ function consoleReporter(results, options) {
|
|
|
2094
2323
|
console.log(` ${drawTableLine(widths, "bottom")}`);
|
|
2095
2324
|
console.log("");
|
|
2096
2325
|
}
|
|
2097
|
-
printSummary(results, providers);
|
|
2326
|
+
printSummary(results, providers, byProvider);
|
|
2098
2327
|
const errorResults = results.filter((r) => r.error);
|
|
2099
2328
|
if (errorResults.length > 0) {
|
|
2100
2329
|
console.log(` ${bold("Errors")}`);
|
|
@@ -2117,15 +2346,19 @@ function consoleReporter(results, options) {
|
|
|
2117
2346
|
console.log("");
|
|
2118
2347
|
}
|
|
2119
2348
|
}
|
|
2120
|
-
function printSummary(results, providers) {
|
|
2349
|
+
function printSummary(results, providers, byProvider) {
|
|
2121
2350
|
const successResults = results.filter((r) => !r.error);
|
|
2122
2351
|
if (successResults.length === 0) return;
|
|
2352
|
+
const successByProvider = /* @__PURE__ */ new Map();
|
|
2353
|
+
for (const id of providers) {
|
|
2354
|
+
successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
|
|
2355
|
+
}
|
|
2123
2356
|
console.log(` ${bold("Summary")}`);
|
|
2124
2357
|
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
2125
2358
|
console.log("");
|
|
2126
2359
|
const single = providers.length === 1;
|
|
2127
2360
|
const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
|
|
2128
|
-
const byCorrectness = rankProviders(
|
|
2361
|
+
const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
|
|
2129
2362
|
if (byCorrectness) {
|
|
2130
2363
|
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2131
2364
|
const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
@@ -2136,7 +2369,7 @@ function printSummary(results, providers) {
|
|
|
2136
2369
|
}
|
|
2137
2370
|
}
|
|
2138
2371
|
const byLatency = providers.map((id) => {
|
|
2139
|
-
const runs =
|
|
2372
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2140
2373
|
const avg = average(runs.map((r) => r.raw.latencyMs));
|
|
2141
2374
|
return { id, avg: avg ?? Infinity };
|
|
2142
2375
|
}).sort((a, b) => a.avg - b.avg)[0];
|
|
@@ -2150,7 +2383,7 @@ function printSummary(results, providers) {
|
|
|
2150
2383
|
}
|
|
2151
2384
|
}
|
|
2152
2385
|
const byCost = providers.map((id) => {
|
|
2153
|
-
const runs =
|
|
2386
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2154
2387
|
const costs = runs.map((r) => {
|
|
2155
2388
|
const s = r.scores.find((s2) => s2.name === "cost");
|
|
2156
2389
|
return s && s.value >= 0 ? s.value : void 0;
|
|
@@ -2188,133 +2421,6 @@ function printSummary(results, providers) {
|
|
|
2188
2421
|
}
|
|
2189
2422
|
console.log("");
|
|
2190
2423
|
}
|
|
2191
|
-
function rankProviders(results, providers, scorerName) {
|
|
2192
|
-
const ranked = providers.map((id) => {
|
|
2193
|
-
const runs = results.filter((r) => r.providerId === id);
|
|
2194
|
-
const scores = runs.flatMap((r) => r.scores.filter((s) => s.name === scorerName && s.value >= 0)).map((s) => s.value);
|
|
2195
|
-
const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
|
|
2196
|
-
return { id, avg };
|
|
2197
|
-
}).filter((p) => p.avg !== void 0).sort((a, b) => b.avg - a.avg);
|
|
2198
|
-
return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
|
|
2199
|
-
}
|
|
2200
|
-
function averageScores(results) {
|
|
2201
|
-
const sums = {};
|
|
2202
|
-
const counts = {};
|
|
2203
|
-
for (const result of results) {
|
|
2204
|
-
for (const score of result.scores) {
|
|
2205
|
-
if (score.value < 0) continue;
|
|
2206
|
-
sums[score.name] = (sums[score.name] ?? 0) + score.value;
|
|
2207
|
-
counts[score.name] = (counts[score.name] ?? 0) + 1;
|
|
2208
|
-
}
|
|
2209
|
-
}
|
|
2210
|
-
const avgs = {};
|
|
2211
|
-
for (const name of Object.keys(sums)) {
|
|
2212
|
-
avgs[name] = sums[name] / counts[name];
|
|
2213
|
-
}
|
|
2214
|
-
return avgs;
|
|
2215
|
-
}
|
|
2216
|
-
/**
 * Average the cost and token figures attached to each result's "cost" score.
 *
 * Reads `details.estimatedUsd` and `details.totalTokens` from the first score
 * named "cost" in every result; missing (null/undefined) fields are ignored.
 *
 * @param {Array<{scores: Array<{name: string, details?: object}>}>} results
 * @returns {{costUsd: number | undefined, totalTokens: number | undefined}}
 *   Mean cost and rounded mean token count; each is `undefined` when no
 *   result carried that field.
 */
function averageDetails(results) {
  let costSum = 0;
  let costCount = 0;
  let tokenSum = 0;
  let tokenCount = 0;
  for (const result of results) {
    const details = result.scores.find((s) => s.name === "cost")?.details;
    if (details?.estimatedUsd != null) {
      costSum += details.estimatedUsd;
      costCount++;
    }
    if (details?.totalTokens != null) {
      tokenSum += details.totalTokens;
      tokenCount++;
    }
  }
  return {
    costUsd: costCount > 0 ? costSum / costCount : void 0,
    totalTokens: tokenCount > 0 ? Math.round(tokenSum / tokenCount) : void 0
  };
}
|
|
2238
|
-
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} nums
 * @returns {number | undefined} The mean, or `undefined` for an empty list.
 */
function average(nums) {
  if (nums.length === 0) return void 0;
  return nums.reduce((a, b) => a + b, 0) / nums.length;
}
|
|
2242
|
-
/**
 * Format a USD amount for display.
 *
 * - `undefined` (and any non-finite or negative input) -> em dash placeholder
 * - exactly 0 -> "$0.00"
 * - >= 0.01 -> "~$" with two decimals
 * - smaller amounts -> "~$" with enough decimals to show two significant
 *   digits (at least four decimals), trailing zeros stripped
 *
 * @param {number | undefined} usd
 * @returns {string}
 */
function formatCost(usd) {
  if (usd === void 0) return "\u2014";
  // Math.log10 of a negative number is NaN, which previously produced output
  // such as "~$-" or "~$NaN"; treat such inputs as "no value".
  if (!Number.isFinite(usd) || usd < 0) return "\u2014";
  if (usd === 0) return "$0.00";
  if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
  const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
  return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
}
|
|
2249
|
-
/**
 * Suggest how to configure credentials when an error looks auth-related.
 *
 * The error text is matched case-insensitively against a few substrings
 * ("api key", "apikey", "401", "unauthorized", "authentication"). When none
 * matches, no hint is produced.
 *
 * @param {string} providerId - Provider id; the part before the first "/" selects the hint.
 * @param {string} error - Error message text.
 * @returns {string | undefined} A hint, or `undefined` for non-auth errors.
 */
function apiKeyHint(providerId, error) {
  // "incorrect api key" needs no separate entry: it already contains "api key".
  const authMarkers = ["api key", "apikey", "401", "unauthorized", "authentication"];
  const lower = error.toLowerCase();
  if (!authMarkers.some((marker) => lower.includes(marker))) return void 0;

  const hints = new Map([
    ["openai", "Set: export OPENAI_API_KEY=sk-..."],
    ["azure", "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=..."],
    ["anthropic", "Set: export ANTHROPIC_API_KEY=sk-ant-..."],
    ["google", "Set: export GOOGLE_API_KEY=..."]
  ]);
  const prefix = providerId.split("/")[0];
  return hints.get(prefix) ?? `Check the API key for ${providerId}`;
}
|
|
2267
|
-
/**
 * Human-readable vendor label for a provider id.
 *
 * The part of `providerId` before the first "/" is looked up in a fixed table;
 * unknown prefixes are echoed back in parentheses.
 *
 * @param {string} providerId - e.g. "openai/gpt-4o".
 * @returns {string} A parenthesised label such as "(OpenAI)".
 */
function providerLabel(providerId) {
  // A Map (not a plain object) so prefixes like "constructor" fall through to the default.
  const labels = new Map([
    ["azure", "(OpenAI via Azure)"],
    ["openai", "(OpenAI)"],
    ["anthropic", "(Anthropic)"],
    ["google", "(Google)"],
    ["mistral", "(Mistral)"],
    ["meta", "(Meta)"],
    ["deepseek", "(DeepSeek)"],
    ["cohere", "(Cohere)"],
    ["qwen", "(Qwen)"],
    ["xai", "(xAI)"],
    ["minimax", "(MiniMax)"],
    ["moonshot", "(Moonshot / Kimi)"],
    ["perplexity", "(Perplexity)"],
    ["amazon", "(Amazon)"],
    ["nvidia", "(NVIDIA)"],
    ["microsoft", "(Microsoft)"],
    ["ai21", "(AI21 Labs)"],
    ["bytedance", "(ByteDance)"],
    ["together", "(Together AI)"],
    ["fireworks", "(Fireworks AI)"],
    ["groq", "(Groq)"],
    ["cerebras", "(Cerebras)"]
  ]);
  const prefix = providerId.split("/")[0];
  return labels.get(prefix) ?? `(${prefix})`;
}
|
|
2318
2424
|
|
|
2319
2425
|
// src/reporter/json.ts
|
|
2320
2426
|
function jsonReporter(results) {
|
|
@@ -2379,7 +2485,7 @@ function anthropic(model, options) {
|
|
|
2379
2485
|
model,
|
|
2380
2486
|
async run(input) {
|
|
2381
2487
|
const start = Date.now();
|
|
2382
|
-
const systemMessage = input.schema ?
|
|
2488
|
+
const systemMessage = input.schema ? SCHEMA_SYSTEM_MESSAGE : void 0;
|
|
2383
2489
|
const response = await client.messages.create({
|
|
2384
2490
|
model,
|
|
2385
2491
|
max_tokens: maxTokens,
|
|
@@ -2389,13 +2495,7 @@ function anthropic(model, options) {
|
|
|
2389
2495
|
const latencyMs = Date.now() - start;
|
|
2390
2496
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2391
2497
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
2392
|
-
|
|
2393
|
-
if (input.schema) {
|
|
2394
|
-
try {
|
|
2395
|
-
output = JSON.parse(rawContent);
|
|
2396
|
-
} catch {
|
|
2397
|
-
}
|
|
2398
|
-
}
|
|
2498
|
+
const output = parseSchemaOutput(rawContent, !!input.schema);
|
|
2399
2499
|
return {
|
|
2400
2500
|
output,
|
|
2401
2501
|
usage: {
|
|
@@ -2409,23 +2509,6 @@ function anthropic(model, options) {
|
|
|
2409
2509
|
};
|
|
2410
2510
|
}
|
|
2411
2511
|
|
|
2412
|
-
// src/providers/gemini.ts
|
|
2413
|
-
import OpenAI3 from "openai";
|
|
2414
|
-
/**
 * Create a provider for a Google model via the OpenAI-compatible endpoint at
 * generativelanguage.googleapis.com.
 *
 * @param {string} model - Model name; the provider id becomes `google/<model>`.
 * @param {{apiKey?: string, timeoutMs?: number}} [options] - `apiKey` falls
 *   back to the GOOGLE_API_KEY environment variable; `timeoutMs` falls back to
 *   REQUEST_TIMEOUT_MS.
 * @returns The provider object built by `makeProvider`.
 * @throws {Error} When no API key is available from either source.
 */
function gemini(model, options) {
  const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
  if (!apiKey) {
    throw new Error(
      `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
    );
  }
  const client = new OpenAI3({
    apiKey,
    baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  });
  return makeProvider(`google/${model}`, "Google AI", model, client, model);
}
|
|
2428
|
-
|
|
2429
2512
|
// src/reporter/markdown.ts
|
|
2430
2513
|
var COMMENT_MARKER = "<!-- duelist-ci-report -->";
|
|
2431
2514
|
function markdownReporter(report, _current) {
|
|
@@ -2470,7 +2553,7 @@ function markdownComparisonTable(comparisons) {
|
|
|
2470
2553
|
for (const c of comparisons) {
|
|
2471
2554
|
const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
|
|
2472
2555
|
const currentStr = formatStats(c.current);
|
|
2473
|
-
const deltaStr = c.delta !== null ? formatDelta(c.delta) : "\u2014";
|
|
2556
|
+
const deltaStr = c.delta !== null ? formatDelta(c.delta, 3) : "\u2014";
|
|
2474
2557
|
const status = statusIndicator(c);
|
|
2475
2558
|
lines.push(`| ${c.providerId} | ${c.taskName} | ${c.scorerName} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
|
|
2476
2559
|
}
|
|
@@ -2503,10 +2586,6 @@ function formatStats(stats) {
|
|
|
2503
2586
|
}
|
|
2504
2587
|
return stats.mean.toFixed(3);
|
|
2505
2588
|
}
|
|
2506
|
-
/**
 * Format a signed difference with an explicit "+" for non-negative values.
 *
 * @param {number} delta - The difference to format.
 * @param {number} [digits=3] - Number of decimal places; the default keeps
 *   the original three-decimal output for one-argument callers.
 * @returns {string} e.g. "+0.500" or "-0.250".
 */
function formatDelta(delta, digits = 3) {
  const sign = delta >= 0 ? "+" : "";
  return `${sign}${delta.toFixed(digits)}`;
}
|
|
2510
2589
|
function statusIndicator(c) {
|
|
2511
2590
|
if (c.regressed) return "\u{1F534} regressed";
|
|
2512
2591
|
if (c.improved) return "\u{1F7E2} improved";
|
|
@@ -2514,6 +2593,644 @@ function statusIndicator(c) {
|
|
|
2514
2593
|
return "\u26AA unchanged";
|
|
2515
2594
|
}
|
|
2516
2595
|
|
|
2596
|
+
// src/reporter/html.ts
|
|
2597
|
+
/**
 * Escape text for safe inclusion in HTML element content and quoted
 * attribute values.
 *
 * Replaces the five significant characters & < > " ' with entities. The
 * ampersand must be replaced first so the entities produced by the later
 * replacements are not escaped a second time.
 *
 * @param {string} s - Raw text (coerced with String() for safety).
 * @returns {string} HTML-escaped text.
 */
function esc(s) {
  return String(s)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
|
|
2600
|
+
/**
 * Render benchmark results as a single self-contained HTML document.
 *
 * Steps:
 *  1. Group results (tasks, providers, scorer names) via `groupResults`.
 *  2. Build one section per task with per-provider aggregates, column stats
 *     and medals; with two or more providers the "gold" medal holder is the
 *     task winner.
 *  3. From successful (non-error) runs, determine the most correct, fastest
 *     and cheapest providers. Correctness uses "llm-judge-correctness" when
 *     any successful run has a non-negative score of that name, otherwise
 *     "correctness".
 *  4. With two or more providers, the overall winner is the provider that
 *     wins the most of those three categories; a tie yields no winner.
 *  5. Deduplicate error results and assemble the page.
 *
 * @param {Array<object>} results - Benchmark results; may be empty.
 * @returns {string} The complete HTML document.
 */
function htmlReporter(results) {
  if (results.length === 0) {
    return emptyReport();
  }
  const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";

  const taskSections = tasks.map((task) => {
    const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    const winnerId = multi ? [...medals.entries()].find(([, m]) => m === "gold")?.[0] : void 0;
    return { task, providerData, columnStats, medals, winnerId };
  });

  // Only successful runs take part in the summary rankings.
  const successResults = results.filter((r) => !r.error);
  const successByProvider = /* @__PURE__ */ new Map();
  for (const id of providers) {
    successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
  }
  const correctnessKey = successResults.some(
    (r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
  ) ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);

  // Providers without successful runs get Infinity so they sort last.
  const byLatency = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const avg = average(runs.map((r) => r.raw.latencyMs));
    return { id, avg: avg ?? Infinity };
  }).sort((a, b) => a.avg - b.avg)[0];

  const byCost = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const costs = runs.map((r) => {
      const s = r.scores.find((s2) => s2.name === "cost");
      return s && s.value >= 0 ? s.value : void 0;
    }).filter((c) => c !== void 0);
    const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
    return { id, avg };
  }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];

  let overallWinner;
  if (multi) {
    const wins = /* @__PURE__ */ new Map();
    for (const id of providers) wins.set(id, 0);
    if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
    if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
    if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
      // A tie for the most category wins means no overall winner is declared.
      if (tops.length === 1) overallWinner = tops[0][0];
    }
  }

  const errorResults = results.filter((r) => r.error);
  const deduped = dedupeErrors(errorResults);

  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
<meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${providers.length !== 1 ? "s" : ""}, ${tasks.length} task${tasks.length !== 1 ? "s" : ""}">
<meta property="og:title" content="Agent Duelist Report">
<meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
<meta property="og:type" content="website">
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">

${renderHeader(runsLabel, providers.length, tasks.length)}

${tasks.length > 1 ? renderTabs(tasks) : ""}

<main>
${taskSections.map((s, i) => renderTaskSection(
    s.task,
    s.providerData,
    s.columnStats,
    s.medals,
    s.winnerId,
    scorerNames,
    hasCost,
    multi,
    i
  )).join("\n")}
</main>

${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}

${deduped.length > 0 ? renderErrors(deduped) : ""}

${renderFooter()}

</div>
${renderScript(tasks.length)}
</body>
</html>`;
}
|
|
2698
|
+
/**
 * Render the HTML document shown when there are no results at all.
 *
 * Uses the same style block, header (with zero counts) and footer as the
 * full report, with a single "No results to display." message as the body.
 *
 * @returns {string} The complete HTML document.
 */
function emptyReport() {
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">
${renderHeader("0 runs", 0, 0)}
<main><p class="empty-msg">No results to display.</p></main>
${renderFooter()}
</div>
</body>
</html>`;
}
|
|
2717
|
+
/**
 * Collapse repeated errors into one entry per (provider, message) pair.
 *
 * @param {Array<{providerId: string, error?: string}>} errorResults - Results that carry an error.
 * @returns {Array<{providerId: string, error: string, count: number, hint: string | undefined}>}
 *   One entry per distinct pair, in first-seen order, with the number of
 *   occurrences and the hint returned by `apiKeyHint`.
 */
function dedupeErrors(errorResults) {
  const seen = /* @__PURE__ */ new Map();
  for (const r of errorResults) {
    const key = `${r.providerId}::${r.error}`;
    const existing = seen.get(key);
    if (existing) {
      existing.count++;
      continue;
    }
    seen.set(key, {
      providerId: r.providerId,
      error: r.error ?? "Unknown error",
      count: 1,
      hint: apiKeyHint(r.providerId, r.error ?? "")
    });
  }
  return [...seen.values()];
}
|
|
2735
|
+
/**
 * Return the report's inline stylesheet as a `<style>` element.
 *
 * The CSS is static: theme variables, the animated background mesh, header,
 * task tabs, results table, score bars, rank colours, winner banner, summary
 * cards, error list, footer, empty state and a small-screen media query.
 *
 * @returns {string} A `<style>…</style>` string.
 */
function renderStyle() {
  return `<style>
  :root {
    --bg: #0f172a;
    --bg-deep: #020617;
    --panel: rgba(15, 23, 42, 0.85);
    --accent: #f59e0b;
    --accent-soft: rgba(245, 158, 11, 0.15);
    --text: #e2e8f0;
    --muted: #94a3b8;
    --border: rgba(148, 163, 184, 0.15);
    --green: #22c55e;
    --red: #ef4444;
    --yellow: #eab308;
    --radius: 12px;
    --mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
    --sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
  }
  *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
  html, body {
    font-family: var(--sans);
    background: var(--bg);
    color: var(--text);
    min-height: 100vh;
  }
  body { padding: 24px; display: flex; justify-content: center; }

  /* Animated gradient mesh */
  .bg-mesh {
    position: fixed; inset: 0; z-index: 0;
    overflow: hidden; pointer-events: none;
  }
  .bg-mesh::before, .bg-mesh::after {
    content: ""; position: absolute; border-radius: 50%;
    filter: blur(120px); opacity: 0.4;
  }
  .bg-mesh::before {
    width: 600px; height: 600px;
    background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
    top: -10%; left: -5%;
    animation: meshDrift1 18s ease-in-out infinite alternate;
  }
  .bg-mesh::after {
    width: 500px; height: 500px;
    background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
    bottom: -10%; right: -5%;
    animation: meshDrift2 22s ease-in-out infinite alternate;
  }
  .bg-mesh-extra {
    position: absolute; width: 400px; height: 400px;
    border-radius: 50%; filter: blur(100px); opacity: 0.3;
    background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
    top: 50%; left: 60%;
    animation: meshDrift3 15s ease-in-out infinite alternate;
  }
  @keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
  @keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
  @keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }

  /* Report container */
  .report {
    position: relative; z-index: 1;
    width: 100%; max-width: 960px;
  }

  /* Header */
  .report-header {
    display: flex; justify-content: space-between; align-items: center;
    padding: 20px 0; margin-bottom: 8px;
  }
  .report-brand {
    display: flex; align-items: center; gap: 10px;
    text-decoration: none; color: var(--muted);
    font-weight: 600; font-size: 14px;
    letter-spacing: 0.04em; text-transform: uppercase;
  }
  .report-brand:hover { color: var(--text); }
  .brand-icon {
    width: 32px; height: 32px; border-radius: 8px;
    background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
    border: 1px solid rgba(245,158,11,0.3);
    display: flex; align-items: center; justify-content: center;
    font-size: 16px;
  }
  .report-meta {
    font-size: 12px; color: var(--muted);
    text-align: right; line-height: 1.6;
  }

  /* Task tabs */
  .task-tabs {
    display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
  }
  .task-tab {
    padding: 6px 16px; border-radius: 999px;
    border: 1px solid var(--border);
    background: transparent; color: var(--muted);
    font-size: 13px; font-weight: 500; cursor: pointer;
    transition: all 150ms ease;
  }
  .task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
  .task-tab.active {
    background: var(--accent-soft);
    border-color: rgba(245,158,11,0.4);
    color: var(--accent);
  }

  /* Task sections */
  .task-section { display: none; }
  .task-section.active { display: block; }
  .task-name {
    font-size: 18px; font-weight: 600;
    margin-bottom: 12px; letter-spacing: -0.01em;
  }

  /* Results table */
  .results-table {
    width: 100%; border-collapse: collapse;
    font-size: 13px; margin-bottom: 16px;
    border-radius: var(--radius); overflow: hidden;
    border: 1px solid var(--border);
  }
  .results-table th, .results-table td {
    padding: 10px 14px;
    text-align: left;
    border-bottom: 1px solid var(--border);
  }
  .results-table th {
    background: rgba(0,0,0,0.3);
    font-size: 11px; font-weight: 600;
    text-transform: uppercase; letter-spacing: 0.05em;
    color: var(--muted); cursor: pointer;
    user-select: none; white-space: nowrap;
  }
  .results-table th:hover { color: var(--text); }
  .results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
  .results-table tbody tr {
    background: var(--panel);
    transition: background 120ms ease;
  }
  .results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
  .results-table tbody tr:last-child td { border-bottom: none; }

  /* Score cell with progress bar */
  .score-cell { position: relative; min-width: 90px; }
  .score-bar {
    position: absolute; left: 0; bottom: 0;
    height: 3px; border-radius: 2px;
    transition: width 300ms ease;
  }
  .score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }

  /* Color ranking */
  .rank-best { color: var(--green); font-weight: 600; }
  .rank-worst { color: var(--red); }
  .rank-mid { color: var(--yellow); }
  .rank-neutral { color: var(--text); }
  .rank-error { color: var(--muted); }

  /* Winner banner */
  .task-winner {
    display: flex; align-items: center; gap: 10px;
    padding: 12px 18px; margin-bottom: 20px;
    border-radius: var(--radius);
    background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
    border: 1px solid rgba(34,197,94,0.2);
    font-size: 14px; font-weight: 500;
  }
  .task-winner .trophy { font-size: 20px; }
  .task-winner .winner-name { color: var(--green); font-weight: 600; }
  .task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }

  /* Summary cards */
  .summary-section { margin-top: 32px; }
  .summary-title {
    font-size: 16px; font-weight: 600;
    margin-bottom: 12px; color: var(--text);
  }
  .summary-cards {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
    gap: 12px;
  }
  .summary-card {
    padding: 16px; border-radius: var(--radius);
    border: 1px solid var(--border);
    background: var(--panel);
  }
  .summary-card .card-label {
    font-size: 11px; font-weight: 600;
    text-transform: uppercase; letter-spacing: 0.05em;
    color: var(--muted); margin-bottom: 6px;
  }
  .summary-card .card-value {
    font-size: 20px; font-weight: 700;
    color: var(--green); font-family: var(--mono);
  }
  .summary-card .card-provider {
    font-size: 12px; color: var(--muted); margin-top: 4px;
  }

  /* Errors */
  .errors-section { margin-top: 24px; }
  .errors-title {
    font-size: 16px; font-weight: 600;
    margin-bottom: 8px; color: var(--red);
    cursor: pointer;
  }
  .errors-list {
    border-radius: var(--radius);
    border: 1px solid rgba(239,68,68,0.2);
    background: rgba(239,68,68,0.04);
    overflow: hidden;
  }
  .error-item {
    padding: 10px 16px;
    border-bottom: 1px solid rgba(239,68,68,0.1);
    font-size: 13px;
  }
  .error-item:last-child { border-bottom: none; }
  .error-provider { font-weight: 600; color: var(--text); }
  .error-msg { color: var(--muted); margin-left: 8px; }
  .error-count { color: var(--muted); font-size: 11px; }
  .error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }

  /* Footer */
  .report-footer {
    margin-top: 40px; padding: 20px 0;
    border-top: 1px solid var(--border);
    display: flex; justify-content: space-between; align-items: center;
    flex-wrap: wrap; gap: 12px;
  }
  .footer-brand {
    font-size: 13px; color: var(--muted);
  }
  .footer-brand a {
    color: var(--accent); text-decoration: none; font-weight: 500;
  }
  .footer-brand a:hover { text-decoration: underline; }
  .footer-cta {
    display: inline-flex; align-items: center; gap: 6px;
    padding: 6px 14px; border-radius: 8px;
    background: var(--accent-soft);
    border: 1px solid rgba(245,158,11,0.3);
    color: var(--accent); font-size: 12px; font-weight: 500;
    text-decoration: none;
    transition: transform 120ms ease, box-shadow 120ms ease;
  }
  .footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }

  /* Empty state */
  .empty-msg {
    text-align: center; color: var(--muted);
    padding: 60px 20px; font-size: 16px;
  }

  /* Responsive */
  @media (max-width: 640px) {
    body { padding: 12px; }
    .report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
    .report-meta { text-align: left; }
    .summary-cards { grid-template-columns: 1fr; }
    .results-table { font-size: 12px; }
    .results-table th, .results-table td { padding: 8px 10px; }
    .report-footer { flex-direction: column; align-items: flex-start; }
  }
</style>`;
}
|
|
3003
|
+
/**
 * Render the report header: brand link plus a metadata line.
 *
 * The metadata shows provider and task counts (pluralised), the runs label
 * and the current time formatted as "YYYY-MM-DD HH:MM:SS UTC".
 *
 * @param {string} runsLabel - Text describing the number of runs, e.g. "3 runs each".
 * @param {number} providerCount
 * @param {number} taskCount
 * @returns {string} A `<header>` element.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  // toISOString() is always UTC; keep date and time down to the second.
  const now = (/* @__PURE__ */ new Date()).toISOString().replace("T", " ").slice(0, 19) + " UTC";
  return `<header class="report-header">
  <a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">
    <div class="brand-icon">⬡</div>
    <span>Agent Duelist</span>
  </a>
  <div class="report-meta">
    ${providerCount} provider${providerCount !== 1 ? "s" : ""} ·
    ${taskCount} task${taskCount !== 1 ? "s" : ""} ·
    ${esc(runsLabel)}<br>
    ${esc(now)}
  </div>
</header>`;
}
|
|
3018
|
+
/**
 * Render one tab button per task; the first tab starts active.
 *
 * Each button carries its task index in `data-task`.
 *
 * @param {string[]} tasks - Task names, escaped before output.
 * @returns {string} A `<nav class="task-tabs">` element.
 */
function renderTabs(tasks) {
  const buttons = tasks.map(
    (t, i) => `<button class="task-tab${i === 0 ? " active" : ""}" data-task="${i}">${esc(t)}</button>`
  ).join("\n  ");
  return `<nav class="task-tabs">
  ${buttons}
</nav>`;
}
|
|
3026
|
+
/**
 * Render one task's results table and optional winner banner.
 *
 * Columns are derived from `scorerNames`: "latency" becomes a Latency column,
 * "cost" becomes Cost and Tokens columns, and any other scorer becomes a score
 * column labelled via `scorerLabel`. A provider whose runs all errored gets a
 * dash in every data column.
 *
 * @param {string} task - Task name.
 * @param {Array<object>} providerData - Per-provider aggregates for this task.
 * @param {Map<string, object>} columnStats - Per-column stats, passed through to `renderDataCell`.
 * @param {Map<string, string>} medals - Medal per provider id ("none" is assumed when absent).
 * @param {string | undefined} winnerId - Provider shown in the winner banner, if any.
 * @param {string[]} scorerNames
 * @param {boolean} _hasCost - Unused; kept for signature compatibility.
 * @param {boolean} multi - Whether more than one provider is being compared.
 * @param {number} index - Task index; section 0 starts active.
 * @returns {string} A `<section>` element.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  const cols = [
    { label: "Provider", key: "provider", isScore: false }
  ];
  for (const name of scorerNames) {
    if (name === "latency") {
      cols.push({ label: "Latency", key: "latency", isScore: false });
    } else if (name === "cost") {
      // The cost scorer contributes two columns: dollars and tokens.
      cols.push({ label: "Cost", key: "cost", isScore: false });
      cols.push({ label: "Tokens", key: "tokens", isScore: false });
    } else {
      cols.push({ label: scorerLabel(name), key: name, isScore: true });
    }
  }

  const ths = cols.map(
    (c) => `<th data-col="${esc(c.key)}">${esc(c.label)}<span class="sort-arrow"></span></th>`
  ).join("");

  const rows = providerData.map((pd) => {
    const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
    const medalHtml = medal ? `${medal} ` : "";
    const cells = [`<td>${medalHtml}${esc(pd.providerId)}</td>`];
    // Skip the first column: it is the provider name rendered above.
    for (const col of cols.slice(1)) {
      cells.push(
        pd.allErrors
          ? `<td class="rank-error">—</td>`
          : renderDataCell(col.key, col.isScore, pd, columnStats, multi)
      );
    }
    return `<tr>${cells.join("")}</tr>`;
  }).join("\n");

  const winnerHtml = winnerId ? `<div class="task-winner">
    <span class="trophy">🏆</span>
    <span>Winner: <span class="winner-name">${esc(winnerId)}</span>
    <span class="winner-label">${esc(providerLabel(winnerId))}</span></span>
  </div>` : "";

  return `<section class="task-section${index === 0 ? " active" : ""}" data-task-idx="${index}">
  <h2 class="task-name">${esc(task)}</h2>
  <table class="results-table">
    <thead><tr>${ths}</tr></thead>
    <tbody>${rows}</tbody>
  </table>
  ${winnerHtml}
</section>`;
}
|
|
3073
|
+
/**
 * Render one `<td>` for a provider row.
 *
 * - "latency": `pd.latencyMs`, shown rounded with an "ms" suffix
 * - "cost": `pd.avgDetails.costUsd`, shown via `formatCost`
 * - "tokens": `pd.avgDetails.totalTokens`
 * - anything else: `pd.avgScores[key]`, shown as a percentage with a bar
 *
 * A missing value renders a dash. With several providers and column stats
 * available, the cell colour comes from `rankClass_`; otherwise metric cells
 * are neutral and score cells use fixed thresholds (>= 0.8 best, >= 0.5 mid).
 *
 * @param {string} key - Column key.
 * @param {boolean} _isScore - Unused; kept for signature compatibility.
 * @param {object} pd - Per-provider aggregate for the task.
 * @param {Map<string, object>} columnStats - Stats per column key.
 * @param {boolean} multi - Whether more than one provider is being compared.
 * @returns {string} A `<td>` element.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const colStats = columnStats.get(key);
  const missing = `<td class="rank-error">—</td>`;
  const metricRank = (value) => (multi && colStats ? rankClass_(value, colStats) : "rank-neutral");

  if (key === "latency") {
    const ms = pd.latencyMs;
    if (ms === void 0) return missing;
    return `<td class="${metricRank(ms)}" data-sort-val="${ms}">${Math.round(ms)}ms</td>`;
  }
  if (key === "cost") {
    const cost = pd.avgDetails.costUsd;
    if (cost === void 0) return missing;
    return `<td class="${metricRank(cost)}" data-sort-val="${cost}">${esc(formatCost(cost))}</td>`;
  }
  if (key === "tokens") {
    const tokens = pd.avgDetails.totalTokens;
    if (tokens === void 0) return missing;
    return `<td class="${metricRank(tokens)}" data-sort-val="${tokens}">${tokens}</td>`;
  }

  const val = pd.avgScores[key];
  if (val === void 0) return missing;
  const pct = Math.round(val * 100);
  let rankCls;
  if (multi && colStats) {
    rankCls = rankClass_(val, colStats);
  } else {
    rankCls = val >= 0.8 ? "rank-best" : val >= 0.5 ? "rank-mid" : "rank-worst";
  }
  const barColor = val >= 0.8 ? "var(--green)" : val >= 0.5 ? "var(--yellow)" : "var(--red)";
  // Keep the bar inside the cell: clamp to 0-100 and fall back to 0 for NaN.
  // The printed percentage is left unclamped so unusual scores stay visible.
  const barWidth = Number.isFinite(pct) ? Math.min(100, Math.max(0, pct)) : 0;
  return `<td class="score-cell ${rankCls}" data-sort-val="${val}">
    <span class="score-val">${pct}%</span>
    <div class="score-bar" style="width:${barWidth}%;background:${barColor}"></div>
  </td>`;
}
|
|
3108
|
+
/**
 * Map a value to a CSS rank class using a column's best and worst values.
 *
 * Returns "rank-neutral" when either bound is missing or when best and worst
 * are equal; otherwise "rank-best" / "rank-worst" on an exact match with the
 * corresponding bound, and "rank-mid" for everything else.
 *
 * @param {number} value
 * @param {{best?: number, worst?: number}} colStats
 * @returns {"rank-neutral" | "rank-best" | "rank-worst" | "rank-mid"}
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  if (best === void 0 || worst === void 0) return "rank-neutral";
  // All providers share the same value: no ranking to show.
  if (best === worst) return "rank-neutral";
  if (value === best) return "rank-best";
  if (value === worst) return "rank-worst";
  return "rank-mid";
}
|
|
3115
|
+
/**
 * Render the summary cards: correctness, latency, cost and overall winner.
 *
 * A card is emitted only when its input is usable: `byCorrectness` is set,
 * `byLatency.avg` is not Infinity, `byCost.avg` is defined, `overallWinner`
 * is set. With several providers the labels read "Most Correct" / "Fastest" /
 * "Cheapest" and name the provider; with one provider they read
 * "Avg Correctness" / "Avg Latency" / "Avg Cost" and omit it.
 *
 * @param {{id: string, avg: number} | undefined} byCorrectness
 * @param {{id: string, avg: number} | undefined} byLatency
 * @param {{id: string, avg: number} | undefined} byCost
 * @param {string | undefined} overallWinner - Provider id of the overall winner.
 * @param {boolean} multi - Whether more than one provider is being compared.
 * @returns {string} A `<section>` element, or "" when there are no cards.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  // Shared markup for every card; `providerHtml` may be an empty string.
  const card = (label, value, providerHtml) => `<div class="summary-card">
      <div class="card-label">${label}</div>
      <div class="card-value">${value}</div>
      ${providerHtml}
    </div>`;
  const providerLine = (id) => `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;

  const cards = [];
  if (byCorrectness) {
    cards.push(card(
      multi ? "Most Correct" : "Avg Correctness",
      `${Math.round(byCorrectness.avg * 100)}%`,
      multi ? providerLine(byCorrectness.id) : ""
    ));
  }
  if (byLatency && byLatency.avg !== Infinity) {
    cards.push(card(
      multi ? "Fastest" : "Avg Latency",
      `${Math.round(byLatency.avg)}ms`,
      multi ? providerLine(byLatency.id) : ""
    ));
  }
  if (byCost?.avg !== void 0) {
    cards.push(card(
      multi ? "Cheapest" : "Avg Cost",
      esc(formatCost(byCost.avg)),
      multi ? providerLine(byCost.id) : ""
    ));
  }
  if (overallWinner) {
    cards.push(card("Overall Winner", "🏆", providerLine(overallWinner)));
  }
  if (cards.length === 0) return "";

  return `<section class="summary-section">
  <h2 class="summary-title">Summary</h2>
  <div class="summary-cards">
    ${cards.join("\n    ")}
  </div>
</section>`;
}
|
|
3159
|
+
/**
 * Build the collapsible "Errors" section of the HTML report.
 *
 * Each entry is rendered as the provider id, the escaped error message, an
 * optional repeat count (shown only when `count > 1`) and an optional hint.
 * Clicking the section title toggles the visibility of the list.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   Error entries to render.
 * @returns {string} The section markup.
 */
function renderErrors(errors) {
  const items = errors.map((e) => {
    const suffix = e.count > 1 ? ` <span class="error-count">(×${e.count})</span>` : "";
    const hint = e.hint ? `<div class="error-hint">${esc(e.hint)}</div>` : "";
    return `<div class="error-item">
<span class="error-provider">${esc(e.providerId)}:</span>
<span class="error-msg">${esc(e.error)}</span>${suffix}
${hint}
</div>`;
  }).join("\n");
  // The toggle must alternate between 'none' and 'block'; an initial empty
  // inline display counts as visible, so the first click hides the list.
  return `<section class="errors-section">
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
<div class="errors-list">
${items}
</div>
</section>`;
}
|
|
3176
|
+
/**
 * Build the static report footer: a "Powered by" credit and a call-to-action
 * link, both pointing at the project's GitHub repository.
 *
 * @returns {string} The footer markup.
 */
function renderFooter() {
  const repoUrl = "https://github.com/DataGobes/agent-duelist";
  const linkAttrs = `href="${repoUrl}" target="_blank" rel="noopener"`;
  const lines = [
    `<footer class="report-footer">`,
    `<div class="footer-brand">`,
    `Powered by <a ${linkAttrs}>Agent Duelist</a>`,
    `</div>`,
    `<a class="footer-cta" ${linkAttrs}>`,
    `⭐ Star on GitHub`,
    `</a>`,
    `</footer>`
  ];
  return lines.join("\n");
}
|
|
3186
|
+
/**
 * Build the inline `<script>` element that makes the HTML report interactive.
 *
 * The emitted browser code (kept in ES5 style) provides:
 *  - tab switching between `.task-section` elements, only when there is more
 *    than one task;
 *  - click-to-sort on every `.results-table` header cell, comparing the
 *    `data-sort-val` attributes numerically when both cells carry one and
 *    falling back to a locale-aware text comparison otherwise.
 *
 * @param {number} taskCount - Number of tasks in the report; tab switching is
 *   emitted only when this is greater than 1.
 * @returns {string} The complete `<script>…</script>` markup.
 */
function renderScript(taskCount) {
  const tabSwitching = taskCount > 1 ? `
  var tabs = document.querySelectorAll('.task-tab');
  var sections = document.querySelectorAll('.task-section');
  tabs.forEach(function(tab) {
    tab.addEventListener('click', function() {
      var idx = parseInt(tab.getAttribute('data-task'), 10);
      tabs.forEach(function(t) { t.classList.remove('active'); });
      sections.forEach(function(s) { s.classList.remove('active'); });
      tab.classList.add('active');
      if (sections[idx]) { sections[idx].classList.add('active'); }
    });
  });` : "";
  return `<script>
(function() {
  /* Tab switching */
  ${tabSwitching}

  /* Column sorting */
  document.querySelectorAll('.results-table th').forEach(function(th) {
    var table = th.closest('table');
    var asc = true;
    th.addEventListener('click', function() {
      /* Column position inside this header's own row. The index over all
         headers in the document would be wrong for every table but the first. */
      var colIdx = th.cellIndex;
      var tbody = table.querySelector('tbody');
      var rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort(function(a, b) {
        var aCell = a.children[colIdx];
        var bCell = b.children[colIdx];
        if (!aCell || !bCell) { return 0; }
        var aVal = aCell.getAttribute('data-sort-val');
        var bVal = bCell.getAttribute('data-sort-val');
        if (aVal !== null && bVal !== null) {
          return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
        }
        var aText = aCell.textContent || '';
        var bText = bCell.textContent || '';
        return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
      });
      rows.forEach(function(row) { tbody.appendChild(row); });

      /* Update sort arrows */
      table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
      var arrow = th.querySelector('.sort-arrow');
      if (arrow) { arrow.textContent = asc ? ' \\u25B2' : ' \\u25BC'; }
      asc = !asc;
    });
  });
})();
</script>`;
}
|
|
3233
|
+
|
|
2517
3234
|
// src/ci.ts
|
|
2518
3235
|
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
2519
3236
|
import { dirname } from "path";
|
|
@@ -2535,10 +3252,11 @@ var T_CRITICAL_95 = {
|
|
|
2535
3252
|
25: 2.06,
|
|
2536
3253
|
30: 2.042
|
|
2537
3254
|
};
|
|
3255
|
+
// Keys of T_CRITICAL_95 converted to numbers and sorted ascending, computed
// once at module load and reused by tCritical() on each call.
var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
|
|
2538
3256
|
function tCritical(df) {
|
|
2539
3257
|
if (df <= 0) return 1.96;
|
|
2540
3258
|
if (T_CRITICAL_95[df] !== void 0) return T_CRITICAL_95[df];
|
|
2541
|
-
const keys =
|
|
3259
|
+
const keys = T_CRITICAL_KEYS;
|
|
2542
3260
|
if (df > keys[keys.length - 1]) return 1.96;
|
|
2543
3261
|
for (let i = 0; i < keys.length - 1; i++) {
|
|
2544
3262
|
if (df > keys[i] && df < keys[i + 1]) {
|
|
@@ -2648,7 +3366,7 @@ function compareResults(baselineStats, currentStats, thresholds, budget, current
|
|
|
2648
3366
|
if (regressions.length > 0) {
|
|
2649
3367
|
for (const r of regressions) {
|
|
2650
3368
|
failureReasons.push(
|
|
2651
|
-
`${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${
|
|
3369
|
+
`${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta(r.delta)}`
|
|
2652
3370
|
);
|
|
2653
3371
|
}
|
|
2654
3372
|
}
|
|
@@ -2683,10 +3401,6 @@ function detectImprovement(baseline, current, threshold, lowerIsBetter) {
|
|
|
2683
3401
|
}
|
|
2684
3402
|
return current.ci95Lower - baseline.ci95Upper > threshold;
|
|
2685
3403
|
}
|
|
2686
|
-
function formatDelta2(delta) {
|
|
2687
|
-
const sign = delta >= 0 ? "+" : "";
|
|
2688
|
-
return `${sign}${delta.toFixed(4)}`;
|
|
2689
|
-
}
|
|
2690
3404
|
function loadBaseline(path) {
|
|
2691
3405
|
try {
|
|
2692
3406
|
const raw = readFileSync(path, "utf-8");
|
|
@@ -2743,18 +3457,20 @@ function detectGitHubContext() {
|
|
|
2743
3457
|
return { token, owner, repo, prNumber };
|
|
2744
3458
|
}
|
|
2745
3459
|
var API_BASE = "https://api.github.com";
|
|
3460
|
+
/**
 * Build the HTTP headers sent with GitHub REST API requests.
 *
 * @param {string} token - Token placed in the `Authorization: Bearer …` header.
 * @param {Object<string, string>} [extra] - Additional headers merged last, so
 *   they override a default header of the same name.
 * @returns {Object<string, string>} A fresh headers object.
 */
function ghHeaders(token, extra) {
  const base = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };
  return { ...base, ...extra };
}
|
|
2746
3468
|
async function findExistingComment(ctx, marker) {
|
|
2747
3469
|
let page = 1;
|
|
2748
3470
|
const perPage = 50;
|
|
2749
3471
|
while (true) {
|
|
2750
3472
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
|
|
2751
|
-
const res = await fetch(url, {
|
|
2752
|
-
headers: {
|
|
2753
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2754
|
-
Accept: "application/vnd.github+json",
|
|
2755
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2756
|
-
}
|
|
2757
|
-
});
|
|
3473
|
+
const res = await fetch(url, { headers: ghHeaders(ctx.token) });
|
|
2758
3474
|
if (!res.ok) return null;
|
|
2759
3475
|
const comments = await res.json();
|
|
2760
3476
|
if (comments.length === 0) break;
|
|
@@ -2774,12 +3490,7 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
2774
3490
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/comments/${existingId}`;
|
|
2775
3491
|
const res = await fetch(url, {
|
|
2776
3492
|
method: "PATCH",
|
|
2777
|
-
headers: {
|
|
2778
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2779
|
-
Accept: "application/vnd.github+json",
|
|
2780
|
-
"Content-Type": "application/json",
|
|
2781
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2782
|
-
},
|
|
3493
|
+
headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
|
|
2783
3494
|
body: JSON.stringify({ body })
|
|
2784
3495
|
});
|
|
2785
3496
|
if (!res.ok) {
|
|
@@ -2790,12 +3501,7 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
2790
3501
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;
|
|
2791
3502
|
const res = await fetch(url, {
|
|
2792
3503
|
method: "POST",
|
|
2793
|
-
headers: {
|
|
2794
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2795
|
-
Accept: "application/vnd.github+json",
|
|
2796
|
-
"Content-Type": "application/json",
|
|
2797
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2798
|
-
},
|
|
3504
|
+
headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
|
|
2799
3505
|
body: JSON.stringify({ body })
|
|
2800
3506
|
});
|
|
2801
3507
|
if (!res.ok) {
|
|
@@ -2813,6 +3519,7 @@ export {
|
|
|
2813
3519
|
defineArena,
|
|
2814
3520
|
detectGitHubContext,
|
|
2815
3521
|
gemini,
|
|
3522
|
+
htmlReporter,
|
|
2816
3523
|
jsonReporter,
|
|
2817
3524
|
loadBaseline,
|
|
2818
3525
|
markdownReporter,
|