agent-duelist 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -3
- package/dist/cli.js +2754 -2102
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1054 -346
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +8 -7
- package/dist/index.d.ts +8 -7
- package/dist/index.js +1053 -346
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -38,6 +38,7 @@ __export(index_exports, {
|
|
|
38
38
|
defineArena: () => defineArena,
|
|
39
39
|
detectGitHubContext: () => detectGitHubContext,
|
|
40
40
|
gemini: () => gemini,
|
|
41
|
+
htmlReporter: () => htmlReporter,
|
|
41
42
|
jsonReporter: () => jsonReporter,
|
|
42
43
|
loadBaseline: () => loadBaseline,
|
|
43
44
|
markdownReporter: () => markdownReporter,
|
|
@@ -1461,11 +1462,13 @@ var fuzzySimilarityScorer = ({ task, result }) => {
|
|
|
1461
1462
|
}
|
|
1462
1463
|
const a = stringify(task.expected);
|
|
1463
1464
|
const b = stringify(result.output);
|
|
1464
|
-
const
|
|
1465
|
+
const setA = tokenize(a);
|
|
1466
|
+
const setB = tokenize(b);
|
|
1467
|
+
const similarity = jaccardSimilarity(setA, setB);
|
|
1465
1468
|
return {
|
|
1466
1469
|
name: "fuzzy-similarity",
|
|
1467
1470
|
value: Math.round(similarity * 100) / 100,
|
|
1468
|
-
details: { method: "jaccard", expectedTokens:
|
|
1471
|
+
details: { method: "jaccard", expectedTokens: setA.size, actualTokens: setB.size }
|
|
1469
1472
|
};
|
|
1470
1473
|
};
|
|
1471
1474
|
function stringify(value) {
|
|
@@ -1491,6 +1494,19 @@ var import_openai2 = __toESM(require("openai"), 1);
|
|
|
1491
1494
|
// src/providers/openai.ts
|
|
1492
1495
|
var import_openai = __toESM(require("openai"), 1);
|
|
1493
1496
|
var import_zod_to_json_schema = require("zod-to-json-schema");
|
|
1497
|
+
|
|
1498
|
+
// src/providers/shared.ts
|
|
1499
|
+
var SCHEMA_SYSTEM_MESSAGE = "Respond with valid JSON matching the requested schema.";
|
|
1500
|
+
/**
 * Turn a provider's raw text response into the task output.
 *
 * @param {string} rawContent - Text returned by the model.
 * @param {boolean} hasSchema - Whether the task requested structured (JSON) output.
 * @returns {*} The parsed JSON value when a schema was requested and the text
 *   is valid JSON; otherwise the raw text unchanged.
 */
function parseSchemaOutput(rawContent, hasSchema) {
  let output = rawContent;
  if (hasSchema) {
    try {
      output = JSON.parse(rawContent);
    } catch {
      // Best effort: malformed JSON falls through as the raw text.
    }
  }
  return output;
}
|
|
1508
|
+
|
|
1509
|
+
// src/providers/openai.ts
|
|
1494
1510
|
var REQUEST_TIMEOUT_MS = 6e4;
|
|
1495
1511
|
function openai(model, options) {
|
|
1496
1512
|
const client = new import_openai.default({
|
|
@@ -1537,7 +1553,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1537
1553
|
if (input.schema) {
|
|
1538
1554
|
params.response_format = { type: "json_object" };
|
|
1539
1555
|
params.messages = [
|
|
1540
|
-
{ role: "system", content:
|
|
1556
|
+
{ role: "system", content: SCHEMA_SYSTEM_MESSAGE },
|
|
1541
1557
|
...params.messages
|
|
1542
1558
|
];
|
|
1543
1559
|
}
|
|
@@ -1590,13 +1606,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1590
1606
|
if (stripThinking) {
|
|
1591
1607
|
rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
|
|
1592
1608
|
}
|
|
1593
|
-
|
|
1594
|
-
if (input.schema) {
|
|
1595
|
-
try {
|
|
1596
|
-
output = JSON.parse(rawContent);
|
|
1597
|
-
} catch {
|
|
1598
|
-
}
|
|
1599
|
-
}
|
|
1609
|
+
const output = parseSchemaOutput(rawContent, !!input.schema);
|
|
1600
1610
|
return {
|
|
1601
1611
|
output,
|
|
1602
1612
|
usage: {
|
|
@@ -1610,6 +1620,20 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1610
1620
|
}
|
|
1611
1621
|
};
|
|
1612
1622
|
}
|
|
1623
|
+
/**
 * Build a provider for a Google model, reached through the OpenAI SDK client
 * pointed at Google's OpenAI-compatible endpoint.
 *
 * @param {string} model - Model name; also used as the request model.
 * @param {{ apiKey?: string, timeoutMs?: number }} [options] - Optional overrides.
 *   `apiKey` falls back to the GOOGLE_API_KEY environment variable and
 *   `timeoutMs` falls back to REQUEST_TIMEOUT_MS.
 * @returns {*} Whatever `makeProvider` returns for id `google/<model>`.
 * @throws {Error} When no API key is supplied and GOOGLE_API_KEY is unset or empty.
 */
function gemini(model, options) {
  const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
  if (apiKey) {
    const clientConfig = {
      apiKey,
      baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
      timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
    };
    const client = new import_openai.default(clientConfig);
    return makeProvider(`google/${model}`, "Google AI", model, client, model);
  }
  throw new Error(
    `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
  );
}
|
|
1613
1637
|
function toolDefToOpenAI(tool) {
|
|
1614
1638
|
return {
|
|
1615
1639
|
type: "function",
|
|
@@ -1687,8 +1711,7 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1687
1711
|
const response = await client.chat.completions.create({
|
|
1688
1712
|
model,
|
|
1689
1713
|
messages: [{ role: "user", content: prompt }],
|
|
1690
|
-
|
|
1691
|
-
max_tokens: 2048
|
|
1714
|
+
max_completion_tokens: 2048
|
|
1692
1715
|
});
|
|
1693
1716
|
const content = response.choices[0]?.message?.content?.trim() ?? "";
|
|
1694
1717
|
const parsed = {};
|
|
@@ -1784,118 +1807,173 @@ async function runBenchmarks(options) {
|
|
|
1784
1807
|
const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
|
|
1785
1808
|
const results = [];
|
|
1786
1809
|
for (const task of tasks) {
|
|
1787
|
-
for (
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1807
|
-
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
|
|
1810
|
+
for (let run = 1; run <= runs; run++) {
|
|
1811
|
+
const runResults = await Promise.all(
|
|
1812
|
+
providers.map(async (provider) => {
|
|
1813
|
+
let result;
|
|
1814
|
+
try {
|
|
1815
|
+
const taskResult = await withTimeout((signal) => provider.run({
|
|
1816
|
+
prompt: task.prompt,
|
|
1817
|
+
schema: task.schema,
|
|
1818
|
+
tools: task.tools,
|
|
1819
|
+
signal
|
|
1820
|
+
}), timeout);
|
|
1821
|
+
const scores = await Promise.all(
|
|
1822
|
+
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
1823
|
+
);
|
|
1824
|
+
result = {
|
|
1825
|
+
providerId: provider.id,
|
|
1826
|
+
taskName: task.name,
|
|
1827
|
+
run,
|
|
1828
|
+
scores,
|
|
1829
|
+
raw: {
|
|
1830
|
+
output: taskResult.output,
|
|
1831
|
+
latencyMs: taskResult.latencyMs,
|
|
1832
|
+
usage: taskResult.usage,
|
|
1833
|
+
toolCalls: taskResult.toolCalls
|
|
1834
|
+
}
|
|
1835
|
+
};
|
|
1836
|
+
} catch (err) {
|
|
1837
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1838
|
+
result = {
|
|
1839
|
+
providerId: provider.id,
|
|
1840
|
+
taskName: task.name,
|
|
1841
|
+
run,
|
|
1842
|
+
scores: [],
|
|
1843
|
+
error: message,
|
|
1844
|
+
raw: { output: "", latencyMs: 0 }
|
|
1845
|
+
};
|
|
1846
|
+
}
|
|
1847
|
+
onResult?.(result);
|
|
1848
|
+
return result;
|
|
1849
|
+
})
|
|
1850
|
+
);
|
|
1851
|
+
results.push(...runResults);
|
|
1826
1852
|
}
|
|
1827
1853
|
}
|
|
1828
1854
|
return results;
|
|
1829
1855
|
}
|
|
1830
1856
|
|
|
1831
|
-
// src/
|
|
1832
|
-
var
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
return `${boldCode}${s}${reset}`;
|
|
1843
|
-
}
|
|
1844
|
-
function dim(s) {
|
|
1845
|
-
return `${dimCode}${s}${reset}`;
|
|
1857
|
+
// src/utils/format.ts
|
|
1858
|
+
// Upper bound accepted by Number.prototype.toFixed.
var MAX_FRACTION_DIGITS = 100;

/**
 * Format a USD amount for display.
 *
 * - `undefined`, `null`, NaN and +/-Infinity render as an em dash (no data).
 * - Exactly zero renders as "$0.00".
 * - Amounts of one cent or more render as "~$" plus two decimals.
 * - Smaller amounts keep enough decimals to show their leading significant
 *   digits (at least four), with trailing zeros removed.
 *
 * @param {number | null | undefined} usd - Amount in US dollars.
 * @returns {string} Display string.
 */
function formatCost(usd) {
  // Non-numeric input used to either crash on .toFixed (null) or print
  // "~$NaN" / "~$Infinity"; treat all of it as "no data".
  if (usd == null || !Number.isFinite(usd)) return "\u2014";
  if (usd === 0) return "$0.00";
  if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
  const digits = Math.min(
    MAX_FRACTION_DIGITS,
    Math.max(4, -Math.floor(Math.log10(Math.abs(usd))) + 1)
  );
  // Strip trailing zeros and, if nothing is left after the point, the point
  // itself, so a value that rounds to zero prints "~$0" instead of "~$0.".
  return `~$${usd.toFixed(digits).replace(/\.?0+$/, "")}`;
}
|
|
1847
|
-
function
|
|
1848
|
-
|
|
1869
|
+
/**
 * Format a signed difference with a fixed number of decimals.
 *
 * @param {number} delta - The difference to display.
 * @param {number} [precision=4] - Number of decimal places.
 * @returns {string} The fixed-point text, prefixed with "+" when `delta >= 0`
 *   (negative values carry their own "-").
 */
function formatDelta(delta, precision = 4) {
  const fixed = delta.toFixed(precision);
  if (delta >= 0) {
    return "+".concat(fixed);
  }
  return fixed;
}
|
|
1850
|
-
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1873
|
+
|
|
1874
|
+
// src/reporter/shared.ts
|
|
1875
|
+
/**
 * Index a flat list of benchmark results in a single pass.
 *
 * @param {Iterable<object>} results - Result records; each is read for
 *   `taskName`, `providerId`, `scores` (entries with a `name`), `error` and `run`.
 * @returns {{
 *   tasks: string[], providers: string[], scorerNames: string[],
 *   grouped: Map<string, object[]>, byProvider: Map<string, object[]>,
 *   hasErrors: boolean, maxRun: number
 * }} Distinct task / provider / scorer names in first-seen order, results
 *   bucketed by "<taskName>::<providerId>" and by provider id, whether any
 *   result carries an error, and the largest `run` value seen (0 if none).
 */
function groupResults(results) {
  const seenTasks = /* @__PURE__ */ new Set();
  const seenProviders = /* @__PURE__ */ new Set();
  const seenScorers = /* @__PURE__ */ new Set();
  const grouped = /* @__PURE__ */ new Map();
  const byProvider = /* @__PURE__ */ new Map();
  let hasErrors = false;
  let maxRun = 0;

  // Append `entry` to the bucket stored under `bucketKey`, creating it on first use.
  const appendTo = (buckets, bucketKey, entry) => {
    const bucket = buckets.get(bucketKey);
    if (bucket) {
      bucket.push(entry);
    } else {
      buckets.set(bucketKey, [entry]);
    }
  };

  for (const entry of results) {
    seenTasks.add(entry.taskName);
    seenProviders.add(entry.providerId);
    for (const score of entry.scores) {
      seenScorers.add(score.name);
    }
    if (entry.error) {
      hasErrors = true;
    }
    if (entry.run > maxRun) {
      maxRun = entry.run;
    }
    appendTo(grouped, `${entry.taskName}::${entry.providerId}`, entry);
    appendTo(byProvider, entry.providerId, entry);
  }

  return {
    tasks: Array.from(seenTasks),
    providers: Array.from(seenProviders),
    scorerNames: Array.from(seenScorers),
    grouped,
    byProvider,
    hasErrors,
    maxRun
  };
}
|
|
1874
|
-
function
|
|
1875
|
-
const
|
|
1876
|
-
|
|
1877
|
-
|
|
1913
|
+
/**
 * Summarise one provider's runs of one task.
 *
 * @param {string} providerId - Provider to summarise.
 * @param {Map<string, object[]>} grouped - Results keyed by "<task>::<providerId>".
 * @param {string} task - Task name.
 * @returns {{
 *   providerId: string, avgScores: object,
 *   avgDetails: { costUsd: (number|undefined), totalTokens: (number|undefined) },
 *   latencyMs: (number|undefined), allErrors: boolean, errorCount: number
 * }} Averages over the successful runs. When there are no successful runs the
 *   averages are empty/undefined and `allErrors` is true only if at least one
 *   run failed.
 */
function aggregateProviderTask(providerId, grouped, task) {
  const runs = grouped.get(`${task}::${providerId}`) ?? [];

  // Partition the runs into failures and successes in one pass.
  const failed = [];
  const succeeded = [];
  for (const run of runs) {
    (run.error ? failed : succeeded).push(run);
  }
  const errorCount = failed.length;

  if (succeeded.length > 0) {
    return {
      providerId,
      avgScores: averageScores(succeeded),
      avgDetails: averageDetails(succeeded),
      latencyMs: average(succeeded.map((run) => run.raw.latencyMs)),
      allErrors: false,
      errorCount
    };
  }

  return {
    providerId,
    avgScores: {},
    avgDetails: { costUsd: void 0, totalTokens: void 0 },
    latencyMs: void 0,
    allErrors: errorCount > 0,
    errorCount
  };
}
|
|
1936
|
+
/**
 * Average each scorer's value across a set of results.
 *
 * Scores with a negative value are skipped, so they do not affect either the
 * sum or the count for that scorer.
 *
 * @param {Iterable<{ scores: Array<{ name: string, value: number }> }>} results
 * @returns {Object<string, number>} Mean value per scorer name; scorers with
 *   no counted values are absent.
 */
function averageScores(results) {
  const totals = {};
  const tallies = {};
  for (const { scores } of results) {
    for (const { name, value } of scores) {
      if (value < 0) continue;
      totals[name] = (totals[name] ?? 0) + value;
      tallies[name] = (tallies[name] ?? 0) + 1;
    }
  }
  const means = {};
  for (const [name, total] of Object.entries(totals)) {
    means[name] = total / tallies[name];
  }
  return means;
}
|
|
1888
|
-
function
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
1892
|
-
|
|
1952
|
+
/**
 * Average the cost details attached to each result's "cost" score.
 *
 * Only results whose "cost" score has a non-null `details.estimatedUsd` /
 * `details.totalTokens` contribute to the respective average.
 *
 * @param {Iterable<{ scores: Array<{ name: string, details?: object }> }>} results
 * @returns {{ costUsd: (number|undefined), totalTokens: (number|undefined) }}
 *   Mean estimated USD and mean token count rounded to an integer; each is
 *   undefined when no result supplied that field.
 */
function averageDetails(results) {
  const usdSamples = [];
  const tokenSamples = [];
  for (const { scores } of results) {
    const details = scores.find((score) => score.name === "cost")?.details;
    if (details?.estimatedUsd != null) {
      usdSamples.push(details.estimatedUsd);
    }
    if (details?.totalTokens != null) {
      tokenSamples.push(details.totalTokens);
    }
  }
  const sum = (samples) => samples.reduce((acc, x) => acc + x, 0);
  return {
    costUsd: usdSamples.length > 0 ? sum(usdSamples) / usdSamples.length : void 0,
    totalTokens: tokenSamples.length > 0 ? Math.round(sum(tokenSamples) / tokenSamples.length) : void 0
  };
}
|
|
1894
|
-
function
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
const padding = Math.max(0, totalInner - dw - 1);
|
|
1898
|
-
return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
|
|
1974
|
+
/**
 * Arithmetic mean of an array of numbers.
 *
 * @param {number[]} nums - Values to average.
 * @returns {number | undefined} The mean, or undefined for an empty array.
 */
function average(nums) {
  const count = nums.length;
  if (count === 0) {
    return void 0;
  }
  let total = 0;
  nums.forEach((n) => {
    total += n;
  });
  return total / count;
}
|
|
1900
1978
|
function computeColumnStats(providerData, scorerNames) {
|
|
1901
1979
|
const stats = /* @__PURE__ */ new Map();
|
|
@@ -1947,62 +2025,235 @@ function computeColumnStats(providerData, scorerNames) {
|
|
|
1947
2025
|
}
|
|
1948
2026
|
return stats;
|
|
1949
2027
|
}
|
|
1950
|
-
function colorByRank(text, value, colStats, providerCount) {
|
|
1951
|
-
if (value === void 0) return dim("\u2014");
|
|
1952
|
-
if (providerCount < 2) return text;
|
|
1953
|
-
if (colStats.best === void 0 || colStats.worst === void 0) return text;
|
|
1954
|
-
if (colStats.best === colStats.worst) return text;
|
|
1955
|
-
if (value === colStats.best) return `${brightGreen}${boldCode}${text}${reset}`;
|
|
1956
|
-
if (value === colStats.worst) return `${red}${text}${reset}`;
|
|
1957
|
-
return `${yellow}${text}${reset}`;
|
|
1958
|
-
}
|
|
1959
2028
|
/**
 * Award "gold" / "silver" / "bronze" / "none" to providers by column wins.
 *
 * A provider wins a column when it is the only provider whose value equals
 * that column's `best`; columns with no `best` or with a shared best award no
 * win. Providers are then ordered by win count (descending, ties broken by id)
 * and providers with equal win counts share a rank. A provider with zero wins
 * never gets a medal.
 *
 * @param {Iterable<[string, { best: (number|undefined), values: Map<string, (number|undefined)> }]>} columnStats
 *   Per-column statistics.
 * @param {string[]} providerIds - Providers being compared.
 * @returns {Map<string, "gold"|"silver"|"bronze"|"none">} Medal per provider.
 *   Everyone gets "none" when fewer than two providers compete or nobody wins
 *   any column.
 */
function computeMedals(columnStats, providerIds) {
  const medals = /* @__PURE__ */ new Map();
  const awardNoneToAll = () => {
    for (const id of providerIds) medals.set(id, "none");
    return medals;
  };
  if (providerIds.length < 2) {
    return awardNoneToAll();
  }

  const wins = /* @__PURE__ */ new Map();
  for (const id of providerIds) wins.set(id, 0);

  let totalWins = 0;
  for (const column of columnStats) {
    const stats = column[1];
    if (stats.best === void 0) continue;
    // Count providers sitting on the best value; only a sole leader scores.
    let leaderCount = 0;
    let soleLeader;
    for (const [id, value] of stats.values.entries()) {
      if (value !== void 0 && value === stats.best) {
        leaderCount += 1;
        soleLeader = id;
      }
    }
    if (leaderCount === 1) {
      wins.set(soleLeader, (wins.get(soleLeader) ?? 0) + 1);
      totalWins += 1;
    }
  }
  if (totalWins === 0) {
    return awardNoneToAll();
  }

  const standings = [...wins.entries()].sort(
    ([idA, winsA], [idB, winsB]) => winsB - winsA || idA.localeCompare(idB)
  );
  const medalList = ["gold", "silver", "bronze"];
  let rank = 0;
  standings.forEach(([id, count], position) => {
    // A strictly lower win count drops the rank to the current position.
    if (position > 0 && count < standings[position - 1][1]) {
      rank = position;
    }
    const earned = count > 0 && rank < medalList.length;
    medals.set(id, earned ? medalList[rank] : "none");
  });
  return medals;
}
|
|
2062
|
+
// Display label for each known provider-id prefix (the part before the first "/").
var PROVIDER_DISPLAY_LABELS = /* @__PURE__ */ new Map([
  ["azure", "(OpenAI via Azure)"],
  ["openai", "(OpenAI)"],
  ["anthropic", "(Anthropic)"],
  ["google", "(Google)"],
  ["mistral", "(Mistral)"],
  ["meta", "(Meta)"],
  ["deepseek", "(DeepSeek)"],
  ["cohere", "(Cohere)"],
  ["qwen", "(Qwen)"],
  ["xai", "(xAI)"],
  ["minimax", "(MiniMax)"],
  ["moonshot", "(Moonshot / Kimi)"],
  ["perplexity", "(Perplexity)"],
  ["amazon", "(Amazon)"],
  ["nvidia", "(NVIDIA)"],
  ["microsoft", "(Microsoft)"],
  ["ai21", "(AI21 Labs)"],
  ["bytedance", "(ByteDance)"],
  ["together", "(Together AI)"],
  ["fireworks", "(Fireworks AI)"],
  ["groq", "(Groq)"],
  ["cerebras", "(Cerebras)"]
]);

/**
 * Human-readable vendor label for a provider id.
 *
 * @param {string} providerId - Id of the form "<prefix>/<model>".
 * @returns {string} The parenthesised vendor name for a known prefix, or the
 *   raw prefix in parentheses otherwise.
 */
function providerLabel(providerId) {
  const [prefix] = providerId.split("/");
  // A Map (rather than a plain object) keeps prefixes such as "constructor"
  // from resolving to inherited properties.
  const known = PROVIDER_DISPLAY_LABELS.get(prefix);
  return known !== void 0 ? known : `(${prefix})`;
}
|
|
2113
|
+
/**
 * Suggest how to fix an authentication failure for a provider.
 *
 * The error text is matched case-insensitively against authentication
 * markers: "api key" in any common spelling (space, hyphen, underscore or
 * none), a standalone "401", "unauthorized" or "authentication".
 *
 * @param {string} providerId - Id of the form "<prefix>/<model>".
 * @param {string} error - Error message reported for the run.
 * @returns {string | undefined} A hint naming the environment variable(s) to
 *   set for openai / azure / anthropic / google, a generic hint for other
 *   providers, or undefined when the error does not look auth-related.
 */
function apiKeyHint(providerId, error) {
  // Coerce defensively so a missing or non-string error cannot throw here.
  const lower = String(error ?? "").toLowerCase();
  const isAuthError =
    /api[\s_-]?key/.test(lower) ||
    // Require "401" to stand apart from other digits; a bare substring test
    // also fired on unrelated numbers such as "timed out after 24013ms".
    /(?<!\d)401(?!\d)/.test(lower) ||
    lower.includes("unauthorized") ||
    lower.includes("authentication");
  if (!isAuthError) return void 0;
  const prefix = providerId.split("/")[0];
  switch (prefix) {
    case "openai":
      return "Set: export OPENAI_API_KEY=sk-...";
    case "azure":
      return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
    case "anthropic":
      return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
    case "google":
      return "Set: export GOOGLE_API_KEY=...";
    default:
      return `Check the API key for ${providerId}`;
  }
}
|
|
2131
|
+
/**
 * Find the provider with the highest average for one scorer.
 *
 * Only scores named `scorerName` with a value >= 0 are averaged; a provider
 * with no such scores is ignored. On equal averages the provider that appears
 * first in `providers` wins.
 *
 * @param {Map<string, Array<{ scores: Array<{ name: string, value: number }> }>>} successByProvider
 *   Successful runs per provider id.
 * @param {string[]} providers - Provider ids to consider, in priority order.
 * @param {string} scorerName - Scorer to rank by.
 * @returns {{ id: string, avg: number } | undefined} The leader, or undefined
 *   when no provider has a usable score.
 */
function rankProviders(successByProvider, providers, scorerName) {
  let leader;
  for (const id of providers) {
    let total = 0;
    let count = 0;
    for (const run of successByProvider.get(id) ?? []) {
      for (const score of run.scores) {
        if (score.name === scorerName && score.value >= 0) {
          total += score.value;
          count += 1;
        }
      }
    }
    if (count === 0) continue;
    const avg = total / count;
    // Strictly-greater keeps the earliest provider on ties.
    if (leader === void 0 || avg > leader.avg) {
      leader = { id, avg };
    }
  }
  return leader;
}
|
|
2140
|
+
// Short column headings for the built-in scorers.
var SCORER_COLUMN_LABELS = /* @__PURE__ */ new Map([
  ["correctness", "Match"],
  ["schema-correctness", "Schema"],
  ["fuzzy-similarity", "Fuzzy"],
  ["llm-judge-correctness", "Judge"],
  ["tool-usage", "Tool"]
]);

/**
 * Column heading for a scorer.
 *
 * @param {string} name - Scorer name.
 * @returns {string} The short heading for a built-in scorer, or `name` itself
 *   for any other scorer.
 */
function scorerLabel(name) {
  return SCORER_COLUMN_LABELS.has(name) ? SCORER_COLUMN_LABELS.get(name) : name;
}
|
|
2156
|
+
// Emoji shown next to a provider for each medal tier; "none" shows nothing.
var MEDAL_EMOJI = /* @__PURE__ */ new Map([
  ["gold", "\u{1F947}"],
  ["silver", "\u{1F948}"],
  ["bronze", "\u{1F949}"],
  ["none", ""]
]);

/**
 * Emoji for a medal tier.
 *
 * @param {"gold"|"silver"|"bronze"|"none"} medal - Medal tier.
 * @returns {string | undefined} The medal emoji, an empty string for "none",
 *   or undefined for any other input.
 */
function medalEmoji(medal) {
  return MEDAL_EMOJI.get(medal);
}
|
|
2168
|
+
|
|
2169
|
+
// src/reporter/console.ts
|
|
2170
|
+
// ANSI SGR escape sequences used to style console output.
// Each is ESC "[" <code> "m"; `reset` clears any styling applied before it.
var reset = "\x1B[0m";
var boldCode = "\x1B[1m"; // bold / increased intensity
var dimCode = "\x1B[2m"; // dim / decreased intensity
var green = "\x1B[32m";
var red = "\x1B[31m";
var yellow = "\x1B[33m";
var cyan = "\x1B[36m";
var brightGreen = "\x1B[92m";
var brightWhite = "\x1B[97m";
|
|
2179
|
+
/**
 * Wrap text in the bold escape sequence, followed by a reset.
 *
 * @param {string} s - Text to emphasise.
 * @returns {string} `s` surrounded by `boldCode` and `reset`.
 */
function bold(s) {
  return boldCode.concat(s, reset);
}
|
|
2182
|
+
/**
 * Wrap text in the dim escape sequence, followed by a reset.
 *
 * @param {string} s - Text to de-emphasise.
 * @returns {string} `s` surrounded by `dimCode` and `reset`.
 */
function dim(s) {
  return dimCode.concat(s, reset);
}
|
|
2185
|
+
/**
 * Remove ANSI colour/style escape sequences (ESC "[" digits-and-semicolons "m").
 *
 * @param {string} s - Text that may contain escape sequences.
 * @returns {string} The text with every such sequence removed.
 */
function stripAnsi(s) {
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  return s.replace(sgrSequence, () => "");
}
|
|
2188
|
+
/**
 * Estimate how many terminal columns a string occupies.
 *
 * ANSI style sequences are removed first. Each remaining code point counts as
 * two columns when it is at or above U+1F000, or within U+2600..U+27BF, and
 * as one column otherwise.
 *
 * @param {string} s - Text, possibly containing ANSI style sequences.
 * @returns {number} Estimated column count.
 */
function displayWidth(s) {
  const isWide = (codePoint) =>
    codePoint >= 0x1f000 || (codePoint >= 0x2600 && codePoint <= 0x27bf);
  // Array.from splits by code point, so surrogate pairs count once.
  return Array.from(stripAnsi(s)).reduce(
    (columns, ch) => columns + (isWide(ch.codePointAt(0) ?? 0) ? 2 : 1),
    0
  );
}
|
|
2199
|
+
/**
 * Pad a cell with spaces up to a target display width.
 *
 * @param {string} str - Cell content (may contain ANSI style sequences).
 * @param {number} targetWidth - Desired width in terminal columns.
 * @param {string} align - "right" pads on the left; anything else pads on the right.
 * @returns {string} The padded cell; content wider than the target is
 *   returned unpadded and untruncated.
 */
function padCell(str, targetWidth, align) {
  const filler = " ".repeat(Math.max(0, targetWidth - displayWidth(str)));
  return align === "right" ? filler + str : str + filler;
}
|
|
2205
|
+
/**
 * Build the two halves of a fixed-width bar for a ratio.
 *
 * @param {number} ratio - Fraction to display; clamped to the range 0..1.
 *   A value that is not a number (NaN, undefined) is drawn as an empty bar.
 * @param {number} [width=8] - Total bar width in characters.
 * @returns {{ fill: string, track: string }} `fill` is the filled part and
 *   `track` the remainder; together they are `width` characters long.
 */
function sparkBar(ratio, width = 8) {
  const numeric = Number(ratio);
  // Without this guard a NaN ratio produced an empty fill AND an empty track,
  // so the bar collapsed to zero width instead of keeping its fixed size.
  const clamped = Number.isNaN(numeric) ? 0 : Math.max(0, Math.min(1, numeric));
  const fillLen = Math.round(clamped * width);
  return {
    fill: "\u2593".repeat(fillLen),
    track: "\u2591".repeat(width - fillLen)
  };
}
|
|
2212
|
+
/**
 * Draw one horizontal rule of a box-drawing table, dimmed.
 *
 * @param {number[]} widths - Content width of each column (each cell also
 *   gets one space of padding on either side).
 * @param {string} position - "top" draws the top border with column
 *   junctions; "bottom" and "merge" draw an unbroken rule spanning all
 *   columns (closing corners and side tees respectively); any other value
 *   draws a separator with crossing junctions.
 * @returns {string} The styled rule.
 */
function drawTableLine(widths, position) {
  const rule = "\u2500";
  // Unbroken rule across every padded column plus the separators between them.
  const spanRule = () =>
    rule.repeat(widths.reduce((acc, w) => acc + w + 2, 0) + widths.length - 1);
  // One rule per padded column, joined by the given junction character.
  const cellRules = (junction) => widths.map((w) => rule.repeat(w + 2)).join(junction);

  switch (position) {
    case "bottom":
      return dim(`\u2514${spanRule()}\u2518`);
    case "merge":
      return dim(`\u251C${spanRule()}\u2524`);
    case "top":
      return dim(`\u250C${cellRules("\u252C")}\u2510`);
    default:
      return dim(`\u251C${cellRules("\u253C")}\u2524`);
  }
}
|
|
2226
|
+
/**
 * Draw one row of a box-drawing table.
 *
 * @param {string[]} cells - Cell contents, one per column.
 * @param {number[]} widths - Content width of each column.
 * @param {string[]} aligns - Alignment of each column, as accepted by `padCell`.
 * @returns {string} The cells, each padded and wrapped in single spaces,
 *   separated and enclosed by dimmed vertical bars.
 */
function drawTableRow(cells, widths, aligns) {
  const bar = dim("\u2502");
  const body = cells
    .map((cell, column) => ` ${padCell(cell, widths[column], aligns[column])} `)
    .join(bar);
  return bar + body + bar;
}
|
|
2232
|
+
/**
 * Draw a row whose content spans the full table width.
 *
 * @param {string} content - Text to show (may contain ANSI style sequences).
 * @param {number[]} widths - Content width of each column of the table.
 * @returns {string} A dimmed bar, one space, the content, padding out to the
 *   table's inner width, and a closing dimmed bar. Over-long content is not
 *   truncated.
 */
function drawSpanRow(content, widths) {
  // Inner width = every padded column plus the separators between them.
  const innerWidth = widths.reduce((acc, w) => acc + w + 2, 0) + widths.length - 1;
  // One column is already taken by the leading space.
  const fill = " ".repeat(Math.max(0, innerWidth - displayWidth(content) - 1));
  const edge = dim("\u2502");
  return `${edge} ${content}${fill}${edge}`;
}
|
|
2238
|
+
/**
 * Colour a table cell according to how its value ranks within its column.
 *
 * @param {string} text - Already-formatted cell text.
 * @param {number | undefined} value - The value the text represents.
 * @param {{ best: (number|undefined), worst: (number|undefined) }} colStats
 *   Best and worst values of the column.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} A dimmed em dash when `value` is undefined; the text
 *   unchanged when there is nothing to rank (fewer than two providers, missing
 *   best/worst, or best equal to worst); otherwise the text in bold bright
 *   green for the best value, red for the worst and yellow for anything else.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === void 0) {
    return dim("\u2014");
  }
  if (providerCount < 2) {
    return text;
  }
  const { best, worst } = colStats;
  const unranked = best === void 0 || worst === void 0 || best === worst;
  if (unranked) {
    return text;
  }
  let style = yellow;
  if (value === best) {
    style = brightGreen + boldCode;
  } else if (value === worst) {
    style = red;
  }
  return `${style}${text}${reset}`;
}
|
|
1993
2247
|
function consoleReporter(results, options) {
|
|
1994
2248
|
const showSparklines = options?.sparklines ?? true;
|
|
1995
2249
|
if (results.length === 0) {
|
|
1996
2250
|
console.log("\nNo results to display.\n");
|
|
1997
2251
|
return;
|
|
1998
2252
|
}
|
|
1999
|
-
const tasks =
|
|
2000
|
-
const providers = [...new Set(results.map((r) => r.providerId))];
|
|
2001
|
-
const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
|
|
2253
|
+
const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
|
|
2002
2254
|
const hasCost = scorerNames.includes("cost");
|
|
2003
|
-
const hasErrors = results.some((r) => r.error);
|
|
2004
2255
|
const multi = providers.length >= 2;
|
|
2005
|
-
const runsPerCell =
|
|
2256
|
+
const runsPerCell = maxRun;
|
|
2006
2257
|
const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
|
|
2007
2258
|
console.log("");
|
|
2008
2259
|
console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
|
|
@@ -2011,29 +2262,9 @@ function consoleReporter(results, options) {
|
|
|
2011
2262
|
for (const task of tasks) {
|
|
2012
2263
|
console.log(` ${bold(`Task: ${task}`)}`);
|
|
2013
2264
|
console.log("");
|
|
2014
|
-
const providerData = providers.map(
|
|
2015
|
-
|
|
2016
|
-
|
|
2017
|
-
const successResults = taskResults.filter((r) => !r.error);
|
|
2018
|
-
if (successResults.length === 0) {
|
|
2019
|
-
return {
|
|
2020
|
-
providerId,
|
|
2021
|
-
avgScores: {},
|
|
2022
|
-
avgDetails: { costUsd: void 0, totalTokens: void 0 },
|
|
2023
|
-
latencyMs: void 0,
|
|
2024
|
-
allErrors: errorResults2.length > 0,
|
|
2025
|
-
errorCount: errorResults2.length
|
|
2026
|
-
};
|
|
2027
|
-
}
|
|
2028
|
-
return {
|
|
2029
|
-
providerId,
|
|
2030
|
-
avgScores: averageScores(successResults),
|
|
2031
|
-
avgDetails: averageDetails(successResults),
|
|
2032
|
-
latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
|
|
2033
|
-
allErrors: false,
|
|
2034
|
-
errorCount: errorResults2.length
|
|
2035
|
-
};
|
|
2036
|
-
});
|
|
2265
|
+
const providerData = providers.map(
|
|
2266
|
+
(providerId) => aggregateProviderTask(providerId, grouped, task)
|
|
2267
|
+
);
|
|
2037
2268
|
const columnStats = computeColumnStats(providerData, scorerNames);
|
|
2038
2269
|
const medals = computeMedals(columnStats, providers);
|
|
2039
2270
|
const maxProviderLen = Math.max(...providers.map((id) => id.length));
|
|
@@ -2048,8 +2279,7 @@ function consoleReporter(results, options) {
|
|
|
2048
2279
|
cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
|
|
2049
2280
|
cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
|
|
2050
2281
|
} else {
|
|
2051
|
-
|
|
2052
|
-
cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
2282
|
+
cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
2053
2283
|
}
|
|
2054
2284
|
}
|
|
2055
2285
|
if (hasErrors) {
|
|
@@ -2062,7 +2292,7 @@ function consoleReporter(results, options) {
|
|
|
2062
2292
|
console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
|
|
2063
2293
|
console.log(` ${drawTableLine(widths, "header")}`);
|
|
2064
2294
|
for (const pd of providerData) {
|
|
2065
|
-
const medal = medals.get(pd.providerId) ?? "";
|
|
2295
|
+
const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
|
|
2066
2296
|
const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
|
|
2067
2297
|
const cells = [providerCell];
|
|
2068
2298
|
if (pd.allErrors) {
|
|
@@ -2135,7 +2365,7 @@ function consoleReporter(results, options) {
|
|
|
2135
2365
|
console.log(` ${drawTableRow(cells, widths, aligns)}`);
|
|
2136
2366
|
}
|
|
2137
2367
|
if (multi && providerData.some((p) => !p.allErrors)) {
|
|
2138
|
-
const winnerId = [...medals.entries()].find(([, m]) => m === "
|
|
2368
|
+
const winnerId = [...medals.entries()].find(([, m]) => m === "gold")?.[0];
|
|
2139
2369
|
if (winnerId) {
|
|
2140
2370
|
console.log(` ${drawTableLine(widths, "merge")}`);
|
|
2141
2371
|
const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
|
|
@@ -2145,7 +2375,7 @@ function consoleReporter(results, options) {
|
|
|
2145
2375
|
console.log(` ${drawTableLine(widths, "bottom")}`);
|
|
2146
2376
|
console.log("");
|
|
2147
2377
|
}
|
|
2148
|
-
printSummary(results, providers);
|
|
2378
|
+
printSummary(results, providers, byProvider);
|
|
2149
2379
|
const errorResults = results.filter((r) => r.error);
|
|
2150
2380
|
if (errorResults.length > 0) {
|
|
2151
2381
|
console.log(` ${bold("Errors")}`);
|
|
@@ -2168,15 +2398,19 @@ function consoleReporter(results, options) {
|
|
|
2168
2398
|
console.log("");
|
|
2169
2399
|
}
|
|
2170
2400
|
}
|
|
2171
|
-
function printSummary(results, providers) {
|
|
2401
|
+
function printSummary(results, providers, byProvider) {
|
|
2172
2402
|
const successResults = results.filter((r) => !r.error);
|
|
2173
2403
|
if (successResults.length === 0) return;
|
|
2404
|
+
const successByProvider = /* @__PURE__ */ new Map();
|
|
2405
|
+
for (const id of providers) {
|
|
2406
|
+
successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
|
|
2407
|
+
}
|
|
2174
2408
|
console.log(` ${bold("Summary")}`);
|
|
2175
2409
|
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
2176
2410
|
console.log("");
|
|
2177
2411
|
const single = providers.length === 1;
|
|
2178
2412
|
const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
|
|
2179
|
-
const byCorrectness = rankProviders(
|
|
2413
|
+
const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
|
|
2180
2414
|
if (byCorrectness) {
|
|
2181
2415
|
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2182
2416
|
const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
@@ -2187,7 +2421,7 @@ function printSummary(results, providers) {
|
|
|
2187
2421
|
}
|
|
2188
2422
|
}
|
|
2189
2423
|
const byLatency = providers.map((id) => {
|
|
2190
|
-
const runs =
|
|
2424
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2191
2425
|
const avg = average(runs.map((r) => r.raw.latencyMs));
|
|
2192
2426
|
return { id, avg: avg ?? Infinity };
|
|
2193
2427
|
}).sort((a, b) => a.avg - b.avg)[0];
|
|
@@ -2201,7 +2435,7 @@ function printSummary(results, providers) {
|
|
|
2201
2435
|
}
|
|
2202
2436
|
}
|
|
2203
2437
|
const byCost = providers.map((id) => {
|
|
2204
|
-
const runs =
|
|
2438
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2205
2439
|
const costs = runs.map((r) => {
|
|
2206
2440
|
const s = r.scores.find((s2) => s2.name === "cost");
|
|
2207
2441
|
return s && s.value >= 0 ? s.value : void 0;
|
|
@@ -2239,133 +2473,6 @@ function printSummary(results, providers) {
|
|
|
2239
2473
|
}
|
|
2240
2474
|
console.log("");
|
|
2241
2475
|
}
|
|
2242
|
-
function rankProviders(results, providers, scorerName) {
|
|
2243
|
-
const ranked = providers.map((id) => {
|
|
2244
|
-
const runs = results.filter((r) => r.providerId === id);
|
|
2245
|
-
const scores = runs.flatMap((r) => r.scores.filter((s) => s.name === scorerName && s.value >= 0)).map((s) => s.value);
|
|
2246
|
-
const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
|
|
2247
|
-
return { id, avg };
|
|
2248
|
-
}).filter((p) => p.avg !== void 0).sort((a, b) => b.avg - a.avg);
|
|
2249
|
-
return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
|
|
2250
|
-
}
|
|
2251
|
-
function averageScores(results) {
|
|
2252
|
-
const sums = {};
|
|
2253
|
-
const counts = {};
|
|
2254
|
-
for (const result of results) {
|
|
2255
|
-
for (const score of result.scores) {
|
|
2256
|
-
if (score.value < 0) continue;
|
|
2257
|
-
sums[score.name] = (sums[score.name] ?? 0) + score.value;
|
|
2258
|
-
counts[score.name] = (counts[score.name] ?? 0) + 1;
|
|
2259
|
-
}
|
|
2260
|
-
}
|
|
2261
|
-
const avgs = {};
|
|
2262
|
-
for (const name of Object.keys(sums)) {
|
|
2263
|
-
avgs[name] = sums[name] / counts[name];
|
|
2264
|
-
}
|
|
2265
|
-
return avgs;
|
|
2266
|
-
}
|
|
2267
|
-
function averageDetails(results) {
|
|
2268
|
-
let costSum = 0;
|
|
2269
|
-
let costCount = 0;
|
|
2270
|
-
let tokenSum = 0;
|
|
2271
|
-
let tokenCount = 0;
|
|
2272
|
-
for (const result of results) {
|
|
2273
|
-
const costScore = result.scores.find((s) => s.name === "cost");
|
|
2274
|
-
const details = costScore?.details;
|
|
2275
|
-
if (details?.estimatedUsd != null) {
|
|
2276
|
-
costSum += details.estimatedUsd;
|
|
2277
|
-
costCount++;
|
|
2278
|
-
}
|
|
2279
|
-
if (details?.totalTokens != null) {
|
|
2280
|
-
tokenSum += details.totalTokens;
|
|
2281
|
-
tokenCount++;
|
|
2282
|
-
}
|
|
2283
|
-
}
|
|
2284
|
-
return {
|
|
2285
|
-
costUsd: costCount > 0 ? costSum / costCount : void 0,
|
|
2286
|
-
totalTokens: tokenCount > 0 ? Math.round(tokenSum / tokenCount) : void 0
|
|
2287
|
-
};
|
|
2288
|
-
}
|
|
2289
|
-
function average(nums) {
|
|
2290
|
-
if (nums.length === 0) return void 0;
|
|
2291
|
-
return nums.reduce((a, b) => a + b, 0) / nums.length;
|
|
2292
|
-
}
|
|
2293
|
-
function formatCost(usd) {
|
|
2294
|
-
if (usd === void 0) return "\u2014";
|
|
2295
|
-
if (usd === 0) return "$0.00";
|
|
2296
|
-
if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
|
|
2297
|
-
const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
|
|
2298
|
-
return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
|
|
2299
|
-
}
|
|
2300
|
-
function apiKeyHint(providerId, error) {
|
|
2301
|
-
const lower = error.toLowerCase();
|
|
2302
|
-
const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
|
|
2303
|
-
if (!isAuthError) return void 0;
|
|
2304
|
-
const prefix = providerId.split("/")[0];
|
|
2305
|
-
switch (prefix) {
|
|
2306
|
-
case "openai":
|
|
2307
|
-
return "Set: export OPENAI_API_KEY=sk-...";
|
|
2308
|
-
case "azure":
|
|
2309
|
-
return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
|
|
2310
|
-
case "anthropic":
|
|
2311
|
-
return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
|
|
2312
|
-
case "google":
|
|
2313
|
-
return "Set: export GOOGLE_API_KEY=...";
|
|
2314
|
-
default:
|
|
2315
|
-
return `Check the API key for ${providerId}`;
|
|
2316
|
-
}
|
|
2317
|
-
}
|
|
2318
|
-
function providerLabel(providerId) {
|
|
2319
|
-
const prefix = providerId.split("/")[0];
|
|
2320
|
-
switch (prefix) {
|
|
2321
|
-
case "azure":
|
|
2322
|
-
return "(OpenAI via Azure)";
|
|
2323
|
-
case "openai":
|
|
2324
|
-
return "(OpenAI)";
|
|
2325
|
-
case "anthropic":
|
|
2326
|
-
return "(Anthropic)";
|
|
2327
|
-
case "google":
|
|
2328
|
-
return "(Google)";
|
|
2329
|
-
case "mistral":
|
|
2330
|
-
return "(Mistral)";
|
|
2331
|
-
case "meta":
|
|
2332
|
-
return "(Meta)";
|
|
2333
|
-
case "deepseek":
|
|
2334
|
-
return "(DeepSeek)";
|
|
2335
|
-
case "cohere":
|
|
2336
|
-
return "(Cohere)";
|
|
2337
|
-
case "qwen":
|
|
2338
|
-
return "(Qwen)";
|
|
2339
|
-
case "xai":
|
|
2340
|
-
return "(xAI)";
|
|
2341
|
-
case "minimax":
|
|
2342
|
-
return "(MiniMax)";
|
|
2343
|
-
case "moonshot":
|
|
2344
|
-
return "(Moonshot / Kimi)";
|
|
2345
|
-
case "perplexity":
|
|
2346
|
-
return "(Perplexity)";
|
|
2347
|
-
case "amazon":
|
|
2348
|
-
return "(Amazon)";
|
|
2349
|
-
case "nvidia":
|
|
2350
|
-
return "(NVIDIA)";
|
|
2351
|
-
case "microsoft":
|
|
2352
|
-
return "(Microsoft)";
|
|
2353
|
-
case "ai21":
|
|
2354
|
-
return "(AI21 Labs)";
|
|
2355
|
-
case "bytedance":
|
|
2356
|
-
return "(ByteDance)";
|
|
2357
|
-
case "together":
|
|
2358
|
-
return "(Together AI)";
|
|
2359
|
-
case "fireworks":
|
|
2360
|
-
return "(Fireworks AI)";
|
|
2361
|
-
case "groq":
|
|
2362
|
-
return "(Groq)";
|
|
2363
|
-
case "cerebras":
|
|
2364
|
-
return "(Cerebras)";
|
|
2365
|
-
default:
|
|
2366
|
-
return `(${prefix})`;
|
|
2367
|
-
}
|
|
2368
|
-
}
|
|
2369
2476
|
|
|
2370
2477
|
// src/reporter/json.ts
|
|
2371
2478
|
function jsonReporter(results) {
|
|
@@ -2430,7 +2537,7 @@ function anthropic(model, options) {
|
|
|
2430
2537
|
model,
|
|
2431
2538
|
async run(input) {
|
|
2432
2539
|
const start = Date.now();
|
|
2433
|
-
const systemMessage = input.schema ?
|
|
2540
|
+
const systemMessage = input.schema ? SCHEMA_SYSTEM_MESSAGE : void 0;
|
|
2434
2541
|
const response = await client.messages.create({
|
|
2435
2542
|
model,
|
|
2436
2543
|
max_tokens: maxTokens,
|
|
@@ -2440,13 +2547,7 @@ function anthropic(model, options) {
|
|
|
2440
2547
|
const latencyMs = Date.now() - start;
|
|
2441
2548
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2442
2549
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
2443
|
-
|
|
2444
|
-
if (input.schema) {
|
|
2445
|
-
try {
|
|
2446
|
-
output = JSON.parse(rawContent);
|
|
2447
|
-
} catch {
|
|
2448
|
-
}
|
|
2449
|
-
}
|
|
2550
|
+
const output = parseSchemaOutput(rawContent, !!input.schema);
|
|
2450
2551
|
return {
|
|
2451
2552
|
output,
|
|
2452
2553
|
usage: {
|
|
@@ -2460,23 +2561,6 @@ function anthropic(model, options) {
|
|
|
2460
2561
|
};
|
|
2461
2562
|
}
|
|
2462
2563
|
|
|
2463
|
-
// src/providers/gemini.ts
|
|
2464
|
-
var import_openai4 = __toESM(require("openai"), 1);
|
|
2465
|
-
function gemini(model, options) {
|
|
2466
|
-
const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
|
|
2467
|
-
if (!apiKey) {
|
|
2468
|
-
throw new Error(
|
|
2469
|
-
`Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
|
|
2470
|
-
);
|
|
2471
|
-
}
|
|
2472
|
-
const client = new import_openai4.default({
|
|
2473
|
-
apiKey,
|
|
2474
|
-
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
|
|
2475
|
-
timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
|
|
2476
|
-
});
|
|
2477
|
-
return makeProvider(`google/${model}`, "Google AI", model, client, model);
|
|
2478
|
-
}
|
|
2479
|
-
|
|
2480
2564
|
// src/reporter/markdown.ts
|
|
2481
2565
|
var COMMENT_MARKER = "<!-- duelist-ci-report -->";
|
|
2482
2566
|
function markdownReporter(report, _current) {
|
|
@@ -2521,7 +2605,7 @@ function markdownComparisonTable(comparisons) {
|
|
|
2521
2605
|
for (const c of comparisons) {
|
|
2522
2606
|
const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
|
|
2523
2607
|
const currentStr = formatStats(c.current);
|
|
2524
|
-
const deltaStr = c.delta !== null ? formatDelta(c.delta) : "\u2014";
|
|
2608
|
+
const deltaStr = c.delta !== null ? formatDelta(c.delta, 3) : "\u2014";
|
|
2525
2609
|
const status = statusIndicator(c);
|
|
2526
2610
|
lines.push(`| ${c.providerId} | ${c.taskName} | ${c.scorerName} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
|
|
2527
2611
|
}
|
|
@@ -2554,10 +2638,6 @@ function formatStats(stats) {
|
|
|
2554
2638
|
}
|
|
2555
2639
|
return stats.mean.toFixed(3);
|
|
2556
2640
|
}
|
|
2557
|
-
function formatDelta(delta) {
|
|
2558
|
-
const sign = delta >= 0 ? "+" : "";
|
|
2559
|
-
return `${sign}${delta.toFixed(3)}`;
|
|
2560
|
-
}
|
|
2561
2641
|
function statusIndicator(c) {
|
|
2562
2642
|
if (c.regressed) return "\u{1F534} regressed";
|
|
2563
2643
|
if (c.improved) return "\u{1F7E2} improved";
|
|
@@ -2565,6 +2645,644 @@ function statusIndicator(c) {
|
|
|
2565
2645
|
return "\u26AA unchanged";
|
|
2566
2646
|
}
|
|
2567
2647
|
|
|
2648
|
+
// src/reporter/html.ts
|
|
2649
|
+
function esc(s) {
|
|
2650
|
+
return s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#39;");
|
|
2651
|
+
}
|
|
2652
|
+
function htmlReporter(results) {
|
|
2653
|
+
if (results.length === 0) {
|
|
2654
|
+
return emptyReport();
|
|
2655
|
+
}
|
|
2656
|
+
const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
|
|
2657
|
+
const hasCost = scorerNames.includes("cost");
|
|
2658
|
+
const multi = providers.length >= 2;
|
|
2659
|
+
const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";
|
|
2660
|
+
const taskSections = tasks.map((task) => {
|
|
2661
|
+
const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
|
|
2662
|
+
const columnStats = computeColumnStats(providerData, scorerNames);
|
|
2663
|
+
const medals = computeMedals(columnStats, providers);
|
|
2664
|
+
const winnerId = multi ? [...medals.entries()].find(([, m]) => m === "gold")?.[0] : void 0;
|
|
2665
|
+
return { task, providerData, columnStats, medals, winnerId };
|
|
2666
|
+
});
|
|
2667
|
+
const successResults = results.filter((r) => !r.error);
|
|
2668
|
+
const successByProvider = /* @__PURE__ */ new Map();
|
|
2669
|
+
for (const id of providers) {
|
|
2670
|
+
successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
|
|
2671
|
+
}
|
|
2672
|
+
const correctnessKey = successResults.some(
|
|
2673
|
+
(r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
|
|
2674
|
+
) ? "llm-judge-correctness" : "correctness";
|
|
2675
|
+
const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
|
|
2676
|
+
const byLatency = providers.map((id) => {
|
|
2677
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2678
|
+
const avg = average(runs.map((r) => r.raw.latencyMs));
|
|
2679
|
+
return { id, avg: avg ?? Infinity };
|
|
2680
|
+
}).sort((a, b) => a.avg - b.avg)[0];
|
|
2681
|
+
const byCost = providers.map((id) => {
|
|
2682
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2683
|
+
const costs = runs.map((r) => {
|
|
2684
|
+
const s = r.scores.find((s2) => s2.name === "cost");
|
|
2685
|
+
return s && s.value >= 0 ? s.value : void 0;
|
|
2686
|
+
}).filter((c) => c !== void 0);
|
|
2687
|
+
const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
|
|
2688
|
+
return { id, avg };
|
|
2689
|
+
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
2690
|
+
let overallWinner;
|
|
2691
|
+
if (multi) {
|
|
2692
|
+
const wins = /* @__PURE__ */ new Map();
|
|
2693
|
+
for (const id of providers) wins.set(id, 0);
|
|
2694
|
+
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
2695
|
+
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2696
|
+
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2697
|
+
const maxWins = Math.max(...wins.values());
|
|
2698
|
+
if (maxWins > 0) {
|
|
2699
|
+
const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2700
|
+
if (tops.length === 1) overallWinner = tops[0][0];
|
|
2701
|
+
}
|
|
2702
|
+
}
|
|
2703
|
+
const errorResults = results.filter((r) => r.error);
|
|
2704
|
+
const deduped = dedupeErrors(errorResults);
|
|
2705
|
+
return `<!DOCTYPE html>
|
|
2706
|
+
<html lang="en">
|
|
2707
|
+
<head>
|
|
2708
|
+
<meta charset="UTF-8">
|
|
2709
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
2710
|
+
<title>Agent Duelist Report</title>
|
|
2711
|
+
<meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${providers.length !== 1 ? "s" : ""}, ${tasks.length} task${tasks.length !== 1 ? "s" : ""}">
|
|
2712
|
+
<meta property="og:title" content="Agent Duelist Report">
|
|
2713
|
+
<meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
|
|
2714
|
+
<meta property="og:type" content="website">
|
|
2715
|
+
${renderStyle()}
|
|
2716
|
+
</head>
|
|
2717
|
+
<body>
|
|
2718
|
+
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
|
|
2719
|
+
<div class="report">
|
|
2720
|
+
|
|
2721
|
+
${renderHeader(runsLabel, providers.length, tasks.length)}
|
|
2722
|
+
|
|
2723
|
+
${tasks.length > 1 ? renderTabs(tasks) : ""}
|
|
2724
|
+
|
|
2725
|
+
<main>
|
|
2726
|
+
${taskSections.map((s, i) => renderTaskSection(
|
|
2727
|
+
s.task,
|
|
2728
|
+
s.providerData,
|
|
2729
|
+
s.columnStats,
|
|
2730
|
+
s.medals,
|
|
2731
|
+
s.winnerId,
|
|
2732
|
+
scorerNames,
|
|
2733
|
+
hasCost,
|
|
2734
|
+
multi,
|
|
2735
|
+
i
|
|
2736
|
+
)).join("\n")}
|
|
2737
|
+
</main>
|
|
2738
|
+
|
|
2739
|
+
${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}
|
|
2740
|
+
|
|
2741
|
+
${deduped.length > 0 ? renderErrors(deduped) : ""}
|
|
2742
|
+
|
|
2743
|
+
${renderFooter()}
|
|
2744
|
+
|
|
2745
|
+
</div>
|
|
2746
|
+
${renderScript(tasks.length)}
|
|
2747
|
+
</body>
|
|
2748
|
+
</html>`;
|
|
2749
|
+
}
|
|
2750
|
+
function emptyReport() {
|
|
2751
|
+
return `<!DOCTYPE html>
|
|
2752
|
+
<html lang="en">
|
|
2753
|
+
<head>
|
|
2754
|
+
<meta charset="UTF-8">
|
|
2755
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
2756
|
+
<title>Agent Duelist Report</title>
|
|
2757
|
+
${renderStyle()}
|
|
2758
|
+
</head>
|
|
2759
|
+
<body>
|
|
2760
|
+
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
|
|
2761
|
+
<div class="report">
|
|
2762
|
+
${renderHeader("0 runs", 0, 0)}
|
|
2763
|
+
<main><p class="empty-msg">No results to display.</p></main>
|
|
2764
|
+
${renderFooter()}
|
|
2765
|
+
</div>
|
|
2766
|
+
</body>
|
|
2767
|
+
</html>`;
|
|
2768
|
+
}
|
|
2769
|
+
function dedupeErrors(errorResults) {
|
|
2770
|
+
const seen = /* @__PURE__ */ new Map();
|
|
2771
|
+
for (const r of errorResults) {
|
|
2772
|
+
const key = `${r.providerId}::${r.error}`;
|
|
2773
|
+
const existing = seen.get(key);
|
|
2774
|
+
if (existing) {
|
|
2775
|
+
existing.count++;
|
|
2776
|
+
} else {
|
|
2777
|
+
seen.set(key, {
|
|
2778
|
+
providerId: r.providerId,
|
|
2779
|
+
error: r.error ?? "Unknown error",
|
|
2780
|
+
count: 1,
|
|
2781
|
+
hint: apiKeyHint(r.providerId, r.error ?? "")
|
|
2782
|
+
});
|
|
2783
|
+
}
|
|
2784
|
+
}
|
|
2785
|
+
return [...seen.values()];
|
|
2786
|
+
}
|
|
2787
|
+
function renderStyle() {
|
|
2788
|
+
return `<style>
|
|
2789
|
+
:root {
|
|
2790
|
+
--bg: #0f172a;
|
|
2791
|
+
--bg-deep: #020617;
|
|
2792
|
+
--panel: rgba(15, 23, 42, 0.85);
|
|
2793
|
+
--accent: #f59e0b;
|
|
2794
|
+
--accent-soft: rgba(245, 158, 11, 0.15);
|
|
2795
|
+
--text: #e2e8f0;
|
|
2796
|
+
--muted: #94a3b8;
|
|
2797
|
+
--border: rgba(148, 163, 184, 0.15);
|
|
2798
|
+
--green: #22c55e;
|
|
2799
|
+
--red: #ef4444;
|
|
2800
|
+
--yellow: #eab308;
|
|
2801
|
+
--radius: 12px;
|
|
2802
|
+
--mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
|
|
2803
|
+
--sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
|
2804
|
+
}
|
|
2805
|
+
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
|
2806
|
+
html, body {
|
|
2807
|
+
font-family: var(--sans);
|
|
2808
|
+
background: var(--bg);
|
|
2809
|
+
color: var(--text);
|
|
2810
|
+
min-height: 100vh;
|
|
2811
|
+
}
|
|
2812
|
+
body { padding: 24px; display: flex; justify-content: center; }
|
|
2813
|
+
|
|
2814
|
+
/* Animated gradient mesh */
|
|
2815
|
+
.bg-mesh {
|
|
2816
|
+
position: fixed; inset: 0; z-index: 0;
|
|
2817
|
+
overflow: hidden; pointer-events: none;
|
|
2818
|
+
}
|
|
2819
|
+
.bg-mesh::before, .bg-mesh::after {
|
|
2820
|
+
content: ""; position: absolute; border-radius: 50%;
|
|
2821
|
+
filter: blur(120px); opacity: 0.4;
|
|
2822
|
+
}
|
|
2823
|
+
.bg-mesh::before {
|
|
2824
|
+
width: 600px; height: 600px;
|
|
2825
|
+
background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
|
|
2826
|
+
top: -10%; left: -5%;
|
|
2827
|
+
animation: meshDrift1 18s ease-in-out infinite alternate;
|
|
2828
|
+
}
|
|
2829
|
+
.bg-mesh::after {
|
|
2830
|
+
width: 500px; height: 500px;
|
|
2831
|
+
background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
|
|
2832
|
+
bottom: -10%; right: -5%;
|
|
2833
|
+
animation: meshDrift2 22s ease-in-out infinite alternate;
|
|
2834
|
+
}
|
|
2835
|
+
.bg-mesh-extra {
|
|
2836
|
+
position: absolute; width: 400px; height: 400px;
|
|
2837
|
+
border-radius: 50%; filter: blur(100px); opacity: 0.3;
|
|
2838
|
+
background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
|
|
2839
|
+
top: 50%; left: 60%;
|
|
2840
|
+
animation: meshDrift3 15s ease-in-out infinite alternate;
|
|
2841
|
+
}
|
|
2842
|
+
@keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
|
|
2843
|
+
@keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
|
|
2844
|
+
@keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }
|
|
2845
|
+
|
|
2846
|
+
/* Report container */
|
|
2847
|
+
.report {
|
|
2848
|
+
position: relative; z-index: 1;
|
|
2849
|
+
width: 100%; max-width: 960px;
|
|
2850
|
+
}
|
|
2851
|
+
|
|
2852
|
+
/* Header */
|
|
2853
|
+
.report-header {
|
|
2854
|
+
display: flex; justify-content: space-between; align-items: center;
|
|
2855
|
+
padding: 20px 0; margin-bottom: 8px;
|
|
2856
|
+
}
|
|
2857
|
+
.report-brand {
|
|
2858
|
+
display: flex; align-items: center; gap: 10px;
|
|
2859
|
+
text-decoration: none; color: var(--muted);
|
|
2860
|
+
font-weight: 600; font-size: 14px;
|
|
2861
|
+
letter-spacing: 0.04em; text-transform: uppercase;
|
|
2862
|
+
}
|
|
2863
|
+
.report-brand:hover { color: var(--text); }
|
|
2864
|
+
.brand-icon {
|
|
2865
|
+
width: 32px; height: 32px; border-radius: 8px;
|
|
2866
|
+
background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
|
|
2867
|
+
border: 1px solid rgba(245,158,11,0.3);
|
|
2868
|
+
display: flex; align-items: center; justify-content: center;
|
|
2869
|
+
font-size: 16px;
|
|
2870
|
+
}
|
|
2871
|
+
.report-meta {
|
|
2872
|
+
font-size: 12px; color: var(--muted);
|
|
2873
|
+
text-align: right; line-height: 1.6;
|
|
2874
|
+
}
|
|
2875
|
+
|
|
2876
|
+
/* Task tabs */
|
|
2877
|
+
.task-tabs {
|
|
2878
|
+
display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
|
|
2879
|
+
}
|
|
2880
|
+
.task-tab {
|
|
2881
|
+
padding: 6px 16px; border-radius: 999px;
|
|
2882
|
+
border: 1px solid var(--border);
|
|
2883
|
+
background: transparent; color: var(--muted);
|
|
2884
|
+
font-size: 13px; font-weight: 500; cursor: pointer;
|
|
2885
|
+
transition: all 150ms ease;
|
|
2886
|
+
}
|
|
2887
|
+
.task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
|
|
2888
|
+
.task-tab.active {
|
|
2889
|
+
background: var(--accent-soft);
|
|
2890
|
+
border-color: rgba(245,158,11,0.4);
|
|
2891
|
+
color: var(--accent);
|
|
2892
|
+
}
|
|
2893
|
+
|
|
2894
|
+
/* Task sections */
|
|
2895
|
+
.task-section { display: none; }
|
|
2896
|
+
.task-section.active { display: block; }
|
|
2897
|
+
.task-name {
|
|
2898
|
+
font-size: 18px; font-weight: 600;
|
|
2899
|
+
margin-bottom: 12px; letter-spacing: -0.01em;
|
|
2900
|
+
}
|
|
2901
|
+
|
|
2902
|
+
/* Results table */
|
|
2903
|
+
.results-table {
|
|
2904
|
+
width: 100%; border-collapse: collapse;
|
|
2905
|
+
font-size: 13px; margin-bottom: 16px;
|
|
2906
|
+
border-radius: var(--radius); overflow: hidden;
|
|
2907
|
+
border: 1px solid var(--border);
|
|
2908
|
+
}
|
|
2909
|
+
.results-table th, .results-table td {
|
|
2910
|
+
padding: 10px 14px;
|
|
2911
|
+
text-align: left;
|
|
2912
|
+
border-bottom: 1px solid var(--border);
|
|
2913
|
+
}
|
|
2914
|
+
.results-table th {
|
|
2915
|
+
background: rgba(0,0,0,0.3);
|
|
2916
|
+
font-size: 11px; font-weight: 600;
|
|
2917
|
+
text-transform: uppercase; letter-spacing: 0.05em;
|
|
2918
|
+
color: var(--muted); cursor: pointer;
|
|
2919
|
+
user-select: none; white-space: nowrap;
|
|
2920
|
+
}
|
|
2921
|
+
.results-table th:hover { color: var(--text); }
|
|
2922
|
+
.results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
|
|
2923
|
+
.results-table tbody tr {
|
|
2924
|
+
background: var(--panel);
|
|
2925
|
+
transition: background 120ms ease;
|
|
2926
|
+
}
|
|
2927
|
+
.results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
|
|
2928
|
+
.results-table tbody tr:last-child td { border-bottom: none; }
|
|
2929
|
+
|
|
2930
|
+
/* Score cell with progress bar */
|
|
2931
|
+
.score-cell { position: relative; min-width: 90px; }
|
|
2932
|
+
.score-bar {
|
|
2933
|
+
position: absolute; left: 0; bottom: 0;
|
|
2934
|
+
height: 3px; border-radius: 2px;
|
|
2935
|
+
transition: width 300ms ease;
|
|
2936
|
+
}
|
|
2937
|
+
.score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }
|
|
2938
|
+
|
|
2939
|
+
/* Color ranking */
|
|
2940
|
+
.rank-best { color: var(--green); font-weight: 600; }
|
|
2941
|
+
.rank-worst { color: var(--red); }
|
|
2942
|
+
.rank-mid { color: var(--yellow); }
|
|
2943
|
+
.rank-neutral { color: var(--text); }
|
|
2944
|
+
.rank-error { color: var(--muted); }
|
|
2945
|
+
|
|
2946
|
+
/* Winner banner */
|
|
2947
|
+
.task-winner {
|
|
2948
|
+
display: flex; align-items: center; gap: 10px;
|
|
2949
|
+
padding: 12px 18px; margin-bottom: 20px;
|
|
2950
|
+
border-radius: var(--radius);
|
|
2951
|
+
background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
|
|
2952
|
+
border: 1px solid rgba(34,197,94,0.2);
|
|
2953
|
+
font-size: 14px; font-weight: 500;
|
|
2954
|
+
}
|
|
2955
|
+
.task-winner .trophy { font-size: 20px; }
|
|
2956
|
+
.task-winner .winner-name { color: var(--green); font-weight: 600; }
|
|
2957
|
+
.task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }
|
|
2958
|
+
|
|
2959
|
+
/* Summary cards */
|
|
2960
|
+
.summary-section { margin-top: 32px; }
|
|
2961
|
+
.summary-title {
|
|
2962
|
+
font-size: 16px; font-weight: 600;
|
|
2963
|
+
margin-bottom: 12px; color: var(--text);
|
|
2964
|
+
}
|
|
2965
|
+
.summary-cards {
|
|
2966
|
+
display: grid;
|
|
2967
|
+
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
|
|
2968
|
+
gap: 12px;
|
|
2969
|
+
}
|
|
2970
|
+
.summary-card {
|
|
2971
|
+
padding: 16px; border-radius: var(--radius);
|
|
2972
|
+
border: 1px solid var(--border);
|
|
2973
|
+
background: var(--panel);
|
|
2974
|
+
}
|
|
2975
|
+
.summary-card .card-label {
|
|
2976
|
+
font-size: 11px; font-weight: 600;
|
|
2977
|
+
text-transform: uppercase; letter-spacing: 0.05em;
|
|
2978
|
+
color: var(--muted); margin-bottom: 6px;
|
|
2979
|
+
}
|
|
2980
|
+
.summary-card .card-value {
|
|
2981
|
+
font-size: 20px; font-weight: 700;
|
|
2982
|
+
color: var(--green); font-family: var(--mono);
|
|
2983
|
+
}
|
|
2984
|
+
.summary-card .card-provider {
|
|
2985
|
+
font-size: 12px; color: var(--muted); margin-top: 4px;
|
|
2986
|
+
}
|
|
2987
|
+
|
|
2988
|
+
/* Errors */
|
|
2989
|
+
.errors-section { margin-top: 24px; }
|
|
2990
|
+
.errors-title {
|
|
2991
|
+
font-size: 16px; font-weight: 600;
|
|
2992
|
+
margin-bottom: 8px; color: var(--red);
|
|
2993
|
+
cursor: pointer;
|
|
2994
|
+
}
|
|
2995
|
+
.errors-list {
|
|
2996
|
+
border-radius: var(--radius);
|
|
2997
|
+
border: 1px solid rgba(239,68,68,0.2);
|
|
2998
|
+
background: rgba(239,68,68,0.04);
|
|
2999
|
+
overflow: hidden;
|
|
3000
|
+
}
|
|
3001
|
+
.error-item {
|
|
3002
|
+
padding: 10px 16px;
|
|
3003
|
+
border-bottom: 1px solid rgba(239,68,68,0.1);
|
|
3004
|
+
font-size: 13px;
|
|
3005
|
+
}
|
|
3006
|
+
.error-item:last-child { border-bottom: none; }
|
|
3007
|
+
.error-provider { font-weight: 600; color: var(--text); }
|
|
3008
|
+
.error-msg { color: var(--muted); margin-left: 8px; }
|
|
3009
|
+
.error-count { color: var(--muted); font-size: 11px; }
|
|
3010
|
+
.error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }
|
|
3011
|
+
|
|
3012
|
+
/* Footer */
|
|
3013
|
+
.report-footer {
|
|
3014
|
+
margin-top: 40px; padding: 20px 0;
|
|
3015
|
+
border-top: 1px solid var(--border);
|
|
3016
|
+
display: flex; justify-content: space-between; align-items: center;
|
|
3017
|
+
flex-wrap: wrap; gap: 12px;
|
|
3018
|
+
}
|
|
3019
|
+
.footer-brand {
|
|
3020
|
+
font-size: 13px; color: var(--muted);
|
|
3021
|
+
}
|
|
3022
|
+
.footer-brand a {
|
|
3023
|
+
color: var(--accent); text-decoration: none; font-weight: 500;
|
|
3024
|
+
}
|
|
3025
|
+
.footer-brand a:hover { text-decoration: underline; }
|
|
3026
|
+
.footer-cta {
|
|
3027
|
+
display: inline-flex; align-items: center; gap: 6px;
|
|
3028
|
+
padding: 6px 14px; border-radius: 8px;
|
|
3029
|
+
background: var(--accent-soft);
|
|
3030
|
+
border: 1px solid rgba(245,158,11,0.3);
|
|
3031
|
+
color: var(--accent); font-size: 12px; font-weight: 500;
|
|
3032
|
+
text-decoration: none;
|
|
3033
|
+
transition: transform 120ms ease, box-shadow 120ms ease;
|
|
3034
|
+
}
|
|
3035
|
+
.footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }
|
|
3036
|
+
|
|
3037
|
+
/* Empty state */
|
|
3038
|
+
.empty-msg {
|
|
3039
|
+
text-align: center; color: var(--muted);
|
|
3040
|
+
padding: 60px 20px; font-size: 16px;
|
|
3041
|
+
}
|
|
3042
|
+
|
|
3043
|
+
/* Responsive */
|
|
3044
|
+
@media (max-width: 640px) {
|
|
3045
|
+
body { padding: 12px; }
|
|
3046
|
+
.report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
|
|
3047
|
+
.report-meta { text-align: left; }
|
|
3048
|
+
.summary-cards { grid-template-columns: 1fr; }
|
|
3049
|
+
.results-table { font-size: 12px; }
|
|
3050
|
+
.results-table th, .results-table td { padding: 8px 10px; }
|
|
3051
|
+
.report-footer { flex-direction: column; align-items: flex-start; }
|
|
3052
|
+
}
|
|
3053
|
+
</style>`;
|
|
3054
|
+
}
|
|
3055
|
+
function renderHeader(runsLabel, providerCount, taskCount) {
|
|
3056
|
+
const now = (/* @__PURE__ */ new Date()).toISOString().replace("T", " ").slice(0, 19) + " UTC";
|
|
3057
|
+
return `<header class="report-header">
|
|
3058
|
+
<a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">
|
|
3059
|
+
<div class="brand-icon">⬡</div>
|
|
3060
|
+
<span>Agent Duelist</span>
|
|
3061
|
+
</a>
|
|
3062
|
+
<div class="report-meta">
|
|
3063
|
+
${providerCount} provider${providerCount !== 1 ? "s" : ""} ·
|
|
3064
|
+
${taskCount} task${taskCount !== 1 ? "s" : ""} ·
|
|
3065
|
+
${esc(runsLabel)}<br>
|
|
3066
|
+
${esc(now)}
|
|
3067
|
+
</div>
|
|
3068
|
+
</header>`;
|
|
3069
|
+
}
|
|
3070
|
+
function renderTabs(tasks) {
|
|
3071
|
+
const buttons = tasks.map(
|
|
3072
|
+
(t, i) => `<button class="task-tab${i === 0 ? " active" : ""}" data-task="${i}">${esc(t)}</button>`
|
|
3073
|
+
).join("\n ");
|
|
3074
|
+
return `<nav class="task-tabs">
|
|
3075
|
+
${buttons}
|
|
3076
|
+
</nav>`;
|
|
3077
|
+
}
|
|
3078
|
+
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
|
|
3079
|
+
const cols = [
|
|
3080
|
+
{ label: "Provider", key: "provider", isScore: false }
|
|
3081
|
+
];
|
|
3082
|
+
for (const name of scorerNames) {
|
|
3083
|
+
if (name === "latency") {
|
|
3084
|
+
cols.push({ label: "Latency", key: "latency", isScore: false });
|
|
3085
|
+
} else if (name === "cost") {
|
|
3086
|
+
cols.push({ label: "Cost", key: "cost", isScore: false });
|
|
3087
|
+
cols.push({ label: "Tokens", key: "tokens", isScore: false });
|
|
3088
|
+
} else {
|
|
3089
|
+
cols.push({ label: scorerLabel(name), key: name, isScore: true });
|
|
3090
|
+
}
|
|
3091
|
+
}
|
|
3092
|
+
const ths = cols.map(
|
|
3093
|
+
(c) => `<th data-col="${esc(c.key)}">${esc(c.label)}<span class="sort-arrow"></span></th>`
|
|
3094
|
+
).join("");
|
|
3095
|
+
const rows = providerData.map((pd) => {
|
|
3096
|
+
const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
|
|
3097
|
+
const cells = [];
|
|
3098
|
+
const medalHtml = medal ? `${medal} ` : "";
|
|
3099
|
+
cells.push(`<td>${medalHtml}${esc(pd.providerId)}</td>`);
|
|
3100
|
+
if (pd.allErrors) {
|
|
3101
|
+
for (let ci = 1; ci < cols.length; ci++) {
|
|
3102
|
+
cells.push(`<td class="rank-error">—</td>`);
|
|
3103
|
+
}
|
|
3104
|
+
} else {
|
|
3105
|
+
for (const col of cols.slice(1)) {
|
|
3106
|
+
cells.push(renderDataCell(col.key, col.isScore, pd, columnStats, multi));
|
|
3107
|
+
}
|
|
3108
|
+
}
|
|
3109
|
+
return `<tr>${cells.join("")}</tr>`;
|
|
3110
|
+
}).join("\n");
|
|
3111
|
+
const winnerHtml = winnerId ? `<div class="task-winner">
|
|
3112
|
+
<span class="trophy">🏆</span>
|
|
3113
|
+
<span>Winner: <span class="winner-name">${esc(winnerId)}</span>
|
|
3114
|
+
<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>
|
|
3115
|
+
</div>` : "";
|
|
3116
|
+
return `<section class="task-section${index === 0 ? " active" : ""}" data-task-idx="${index}">
|
|
3117
|
+
<h2 class="task-name">${esc(task)}</h2>
|
|
3118
|
+
<table class="results-table">
|
|
3119
|
+
<thead><tr>${ths}</tr></thead>
|
|
3120
|
+
<tbody>${rows}</tbody>
|
|
3121
|
+
</table>
|
|
3122
|
+
${winnerHtml}
|
|
3123
|
+
</section>`;
|
|
3124
|
+
}
|
|
3125
|
+
function renderDataCell(key, _isScore, pd, columnStats, multi) {
|
|
3126
|
+
const colStats = columnStats.get(key);
|
|
3127
|
+
if (key === "latency") {
|
|
3128
|
+
const ms = pd.latencyMs;
|
|
3129
|
+
if (ms === void 0) return `<td class="rank-error">—</td>`;
|
|
3130
|
+
const rankClass = multi && colStats ? rankClass_(ms, colStats) : "rank-neutral";
|
|
3131
|
+
return `<td class="${rankClass}" data-sort-val="${ms}">${Math.round(ms)}ms</td>`;
|
|
3132
|
+
}
|
|
3133
|
+
if (key === "cost") {
|
|
3134
|
+
const cost = pd.avgDetails.costUsd;
|
|
3135
|
+
if (cost === void 0) return `<td class="rank-error">—</td>`;
|
|
3136
|
+
const rankClass = multi && colStats ? rankClass_(cost, colStats) : "rank-neutral";
|
|
3137
|
+
return `<td class="${rankClass}" data-sort-val="${cost}">${esc(formatCost(cost))}</td>`;
|
|
3138
|
+
}
|
|
3139
|
+
if (key === "tokens") {
|
|
3140
|
+
const tokens = pd.avgDetails.totalTokens;
|
|
3141
|
+
if (tokens === void 0) return `<td class="rank-error">—</td>`;
|
|
3142
|
+
const rankClass = multi && colStats ? rankClass_(tokens, colStats) : "rank-neutral";
|
|
3143
|
+
return `<td class="${rankClass}" data-sort-val="${tokens}">${tokens}</td>`;
|
|
3144
|
+
}
|
|
3145
|
+
const val = pd.avgScores[key];
|
|
3146
|
+
if (val === void 0) return `<td class="rank-error">—</td>`;
|
|
3147
|
+
const pct = Math.round(val * 100);
|
|
3148
|
+
let rankCls;
|
|
3149
|
+
if (multi && colStats) {
|
|
3150
|
+
rankCls = rankClass_(val, colStats);
|
|
3151
|
+
} else {
|
|
3152
|
+
rankCls = val >= 0.8 ? "rank-best" : val >= 0.5 ? "rank-mid" : "rank-worst";
|
|
3153
|
+
}
|
|
3154
|
+
const barColor = val >= 0.8 ? "var(--green)" : val >= 0.5 ? "var(--yellow)" : "var(--red)";
|
|
3155
|
+
return `<td class="score-cell ${rankCls}" data-sort-val="${val}">
|
|
3156
|
+
<span class="score-val">${pct}%</span>
|
|
3157
|
+
<div class="score-bar" style="width:${pct}%;background:${barColor}"></div>
|
|
3158
|
+
</td>`;
|
|
3159
|
+
}
|
|
3160
|
+
function rankClass_(value, colStats) {
|
|
3161
|
+
if (colStats.best === void 0 || colStats.worst === void 0) return "rank-neutral";
|
|
3162
|
+
if (colStats.best === colStats.worst) return "rank-neutral";
|
|
3163
|
+
if (value === colStats.best) return "rank-best";
|
|
3164
|
+
if (value === colStats.worst) return "rank-worst";
|
|
3165
|
+
return "rank-mid";
|
|
3166
|
+
}
|
|
3167
|
+
/**
 * Builds the HTML "Summary" section: up to four cards (correctness, latency,
 * cost, overall winner). Returns an empty string when no card applies.
 *
 * @param {{id: string, avg: number}|undefined} byCorrectness - Correctness
 *   entry; `avg` is multiplied by 100 and shown as a percentage.
 * @param {{id: string, avg: number}|undefined} byLatency - Latency entry;
 *   skipped when `avg` is Infinity. `avg` is rendered with an "ms" suffix.
 * @param {{id: string, avg?: number}|undefined} byCost - Cost entry; skipped
 *   when `avg` is undefined.
 * @param {string|undefined} overallWinner - Provider id of the overall winner;
 *   any falsy value omits the card.
 * @param {boolean} multi - When truthy, cards use the comparative labels
 *   ("Most Correct", "Fastest", "Cheapest") and name the provider.
 * @returns {string} The section markup, or "" if there are no cards.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  // Every card shares one shape: label, headline value, optional provider line.
  const card = (label, value, providerHtml) => [
    `<div class="summary-card">`,
    `<div class="card-label">${label}</div>`,
    `<div class="card-value">${value}</div>`,
    providerHtml,
    `</div>`
  ].join("\n");
  const providerLine = (id) => `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  // The provider is only named when several providers are being compared.
  const attribution = (entry) => (multi ? providerLine(entry.id) : "");

  const cards = [];
  if (byCorrectness) {
    cards.push(card(
      multi ? "Most Correct" : "Avg Correctness",
      `${Math.round(byCorrectness.avg * 100)}%`,
      attribution(byCorrectness)
    ));
  }
  if (byLatency && byLatency.avg !== Infinity) {
    cards.push(card(
      multi ? "Fastest" : "Avg Latency",
      `${Math.round(byLatency.avg)}ms`,
      attribution(byLatency)
    ));
  }
  if (byCost?.avg !== void 0) {
    cards.push(card(
      multi ? "Cheapest" : "Avg Cost",
      esc(formatCost(byCost.avg)),
      attribution(byCost)
    ));
  }
  if (overallWinner) {
    // The winner card always names its provider, regardless of `multi`.
    cards.push(card("Overall Winner", "🏆", providerLine(overallWinner)));
  }

  if (cards.length === 0) {
    return "";
  }
  return [
    `<section class="summary-section">`,
    `<h2 class="summary-title">Summary</h2>`,
    `<div class="summary-cards">`,
    cards.join("\n "),
    `</div>`,
    `</section>`
  ].join("\n");
}
|
|
3211
|
+
/**
 * Builds the HTML "Errors" section: one item per error entry, under a heading
 * that toggles the list's visibility when clicked.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   Error entries. `count` is shown as "(×N)" only when greater than 1; `hint`
 *   is rendered on its own line when truthy.
 * @returns {string} The section markup (the list is empty for an empty array).
 */
function renderErrors(errors) {
  const items = errors.map((e) => {
    const suffix = e.count > 1 ? ` <span class="error-count">(×${e.count})</span>` : "";
    const hint = e.hint ? `<div class="error-hint">${esc(e.hint)}</div>` : "";
    return [
      `<div class="error-item">`,
      `<span class="error-provider">${esc(e.providerId)}:</span>`,
      `<span class="error-msg">${esc(e.error)}</span>${suffix}`,
      hint,
      `</div>`
    ].join("\n");
  }).join("\n");
  // Inline handler: show the list when hidden, hide it otherwise. The previous
  // handler set 'block' in both branches, so the list could never be collapsed.
  const toggle = "this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'";
  return [
    `<section class="errors-section">`,
    `<h2 class="errors-title" onclick="${toggle}">Errors</h2>`,
    `<div class="errors-list">`,
    items,
    `</div>`,
    `</section>`
  ].join("\n");
}
|
|
3228
|
+
/**
 * Builds the static report footer: a "Powered by" credit and a call-to-action
 * link, both pointing at the project's GitHub repository.
 *
 * @returns {string} The footer markup.
 */
function renderFooter() {
  const repoUrl = "https://github.com/DataGobes/agent-duelist";
  // Both links open in a new tab.
  const linkAttrs = `href="${repoUrl}" target="_blank" rel="noopener"`;
  const lines = [
    `<footer class="report-footer">`,
    `<div class="footer-brand">`,
    `Powered by <a ${linkAttrs}>Agent Duelist</a>`,
    `</div>`,
    `<a class="footer-cta" ${linkAttrs}>`,
    `⭐ Star on GitHub`,
    `</a>`,
    `</footer>`
  ];
  return lines.join("\n");
}
|
|
3238
|
+
/**
 * Builds the inline <script> element that makes the HTML report interactive:
 * task-tab switching (only when there is more than one task) and click-to-sort
 * on the headers of every `.results-table`.
 *
 * The emitted code is deliberately plain `var`/`function` JavaScript wrapped in
 * an IIFE so it leaks no globals into the page.
 *
 * @param {number} taskCount - Number of tasks in the report; tab handlers are
 *   emitted only when this is greater than 1.
 * @returns {string} A complete `<script>…</script>` element.
 */
function renderScript(taskCount) {
  // Tab handlers are pointless with a single task, so they are omitted.
  const tabScript = taskCount > 1 ? `
  var tabs = document.querySelectorAll('.task-tab');
  var sections = document.querySelectorAll('.task-section');
  tabs.forEach(function(tab) {
    tab.addEventListener('click', function() {
      var idx = parseInt(tab.getAttribute('data-task'), 10);
      tabs.forEach(function(t) { t.classList.remove('active'); });
      sections.forEach(function(s) { s.classList.remove('active'); });
      tab.classList.add('active');
      sections[idx].classList.add('active');
    });
  });` : "";
  return `<script>
(function() {
  /* Tab switching */
  ${tabScript}

  /* Column sorting */
  document.querySelectorAll('.results-table th').forEach(function(th) {
    var table = th.closest('table');
    var asc = true;
    th.addEventListener('click', function() {
      /* Column position within this header's own row. The index over all
         matched headers would be wrong for every table after the first. */
      var colIdx = th.cellIndex;
      var tbody = table.querySelector('tbody');
      var rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort(function(a, b) {
        var aCell = a.children[colIdx];
        var bCell = b.children[colIdx];
        var aVal = aCell.getAttribute('data-sort-val');
        var bVal = bCell.getAttribute('data-sort-val');
        if (aVal !== null && bVal !== null) {
          return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
        }
        var aText = aCell.textContent || '';
        var bText = bCell.textContent || '';
        return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
      });
      rows.forEach(function(row) { tbody.appendChild(row); });

      /* Update sort arrows */
      table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
      var arrow = th.querySelector('.sort-arrow');
      if (arrow) {
        arrow.textContent = asc ? ' \\u25B2' : ' \\u25BC';
      }
      asc = !asc;
    });
  });
})();
</script>`;
}
|
|
3285
|
+
|
|
2568
3286
|
// src/ci.ts
|
|
2569
3287
|
var import_node_fs = require("fs");
|
|
2570
3288
|
var import_node_path = require("path");
|
|
@@ -2586,10 +3304,11 @@ var T_CRITICAL_95 = {
|
|
|
2586
3304
|
25: 2.06,
|
|
2587
3305
|
30: 2.042
|
|
2588
3306
|
};
|
|
3307
|
+
// Keys of T_CRITICAL_95 as numbers in ascending order, computed once at module load.
var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95)
  .map((key) => Number(key))
  .sort((lower, higher) => lower - higher);
|
|
2589
3308
|
function tCritical(df) {
|
|
2590
3309
|
if (df <= 0) return 1.96;
|
|
2591
3310
|
if (T_CRITICAL_95[df] !== void 0) return T_CRITICAL_95[df];
|
|
2592
|
-
const keys =
|
|
3311
|
+
const keys = T_CRITICAL_KEYS;
|
|
2593
3312
|
if (df > keys[keys.length - 1]) return 1.96;
|
|
2594
3313
|
for (let i = 0; i < keys.length - 1; i++) {
|
|
2595
3314
|
if (df > keys[i] && df < keys[i + 1]) {
|
|
@@ -2699,7 +3418,7 @@ function compareResults(baselineStats, currentStats, thresholds, budget, current
|
|
|
2699
3418
|
if (regressions.length > 0) {
|
|
2700
3419
|
for (const r of regressions) {
|
|
2701
3420
|
failureReasons.push(
|
|
2702
|
-
`${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${
|
|
3421
|
+
`${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta(r.delta)}`
|
|
2703
3422
|
);
|
|
2704
3423
|
}
|
|
2705
3424
|
}
|
|
@@ -2734,10 +3453,6 @@ function detectImprovement(baseline, current, threshold, lowerIsBetter) {
|
|
|
2734
3453
|
}
|
|
2735
3454
|
return current.ci95Lower - baseline.ci95Upper > threshold;
|
|
2736
3455
|
}
|
|
2737
|
-
function formatDelta2(delta) {
|
|
2738
|
-
const sign = delta >= 0 ? "+" : "";
|
|
2739
|
-
return `${sign}${delta.toFixed(4)}`;
|
|
2740
|
-
}
|
|
2741
3456
|
function loadBaseline(path) {
|
|
2742
3457
|
try {
|
|
2743
3458
|
const raw = (0, import_node_fs.readFileSync)(path, "utf-8");
|
|
@@ -2794,18 +3509,20 @@ function detectGitHubContext() {
|
|
|
2794
3509
|
return { token, owner, repo, prNumber };
|
|
2795
3510
|
}
|
|
2796
3511
|
// Base URL prefixed to every GitHub REST API path built in this module.
var API_BASE = "https://api.github.com";
|
|
3512
|
+
/**
 * Assembles the HTTP headers for a GitHub REST API request.
 *
 * @param {string} token - Token placed in the `Authorization: Bearer` header.
 * @param {Object<string, string>} [extra] - Additional headers; applied last,
 *   so they override the defaults on a key collision.
 * @returns {Object<string, string>} A fresh headers object.
 */
function ghHeaders(token, extra) {
  const headers = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };
  // Object.assign ignores an undefined/null source, matching object spread.
  return Object.assign(headers, extra);
}
|
|
2797
3520
|
async function findExistingComment(ctx, marker) {
|
|
2798
3521
|
let page = 1;
|
|
2799
3522
|
const perPage = 50;
|
|
2800
3523
|
while (true) {
|
|
2801
3524
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
|
|
2802
|
-
const res = await fetch(url, {
|
|
2803
|
-
headers: {
|
|
2804
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2805
|
-
Accept: "application/vnd.github+json",
|
|
2806
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2807
|
-
}
|
|
2808
|
-
});
|
|
3525
|
+
const res = await fetch(url, { headers: ghHeaders(ctx.token) });
|
|
2809
3526
|
if (!res.ok) return null;
|
|
2810
3527
|
const comments = await res.json();
|
|
2811
3528
|
if (comments.length === 0) break;
|
|
@@ -2825,12 +3542,7 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
2825
3542
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/comments/${existingId}`;
|
|
2826
3543
|
const res = await fetch(url, {
|
|
2827
3544
|
method: "PATCH",
|
|
2828
|
-
headers: {
|
|
2829
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2830
|
-
Accept: "application/vnd.github+json",
|
|
2831
|
-
"Content-Type": "application/json",
|
|
2832
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2833
|
-
},
|
|
3545
|
+
headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
|
|
2834
3546
|
body: JSON.stringify({ body })
|
|
2835
3547
|
});
|
|
2836
3548
|
if (!res.ok) {
|
|
@@ -2841,12 +3553,7 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
2841
3553
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;
|
|
2842
3554
|
const res = await fetch(url, {
|
|
2843
3555
|
method: "POST",
|
|
2844
|
-
headers: {
|
|
2845
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2846
|
-
Accept: "application/vnd.github+json",
|
|
2847
|
-
"Content-Type": "application/json",
|
|
2848
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2849
|
-
},
|
|
3556
|
+
headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
|
|
2850
3557
|
body: JSON.stringify({ body })
|
|
2851
3558
|
});
|
|
2852
3559
|
if (!res.ok) {
|
|
@@ -2865,6 +3572,7 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
2865
3572
|
defineArena,
|
|
2866
3573
|
detectGitHubContext,
|
|
2867
3574
|
gemini,
|
|
3575
|
+
htmlReporter,
|
|
2868
3576
|
jsonReporter,
|
|
2869
3577
|
loadBaseline,
|
|
2870
3578
|
markdownReporter,
|