agent-duelist 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -59
- package/dist/cli.js +1793 -394
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1774 -396
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +73 -8
- package/dist/index.d.ts +73 -8
- package/dist/index.js +1765 -395
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/templates/arena.config.ts +5 -5
package/dist/index.cjs
CHANGED
|
@@ -32,13 +32,21 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
anthropic: () => anthropic,
|
|
34
34
|
azureOpenai: () => azureOpenai,
|
|
35
|
+
compareResults: () => compareResults,
|
|
36
|
+
computeStats: () => computeStats,
|
|
35
37
|
consoleReporter: () => consoleReporter,
|
|
36
38
|
defineArena: () => defineArena,
|
|
39
|
+
detectGitHubContext: () => detectGitHubContext,
|
|
37
40
|
gemini: () => gemini,
|
|
41
|
+
htmlReporter: () => htmlReporter,
|
|
38
42
|
jsonReporter: () => jsonReporter,
|
|
43
|
+
loadBaseline: () => loadBaseline,
|
|
44
|
+
markdownReporter: () => markdownReporter,
|
|
39
45
|
openai: () => openai,
|
|
40
46
|
openaiCompatible: () => openaiCompatible,
|
|
41
|
-
registerPricing: () => registerPricing
|
|
47
|
+
registerPricing: () => registerPricing,
|
|
48
|
+
saveBaseline: () => saveBaseline,
|
|
49
|
+
upsertPrComment: () => upsertPrComment
|
|
42
50
|
});
|
|
43
51
|
module.exports = __toCommonJS(index_exports);
|
|
44
52
|
|
|
@@ -1454,11 +1462,13 @@ var fuzzySimilarityScorer = ({ task, result }) => {
|
|
|
1454
1462
|
}
|
|
1455
1463
|
const a = stringify(task.expected);
|
|
1456
1464
|
const b = stringify(result.output);
|
|
1457
|
-
const
|
|
1465
|
+
const setA = tokenize(a);
|
|
1466
|
+
const setB = tokenize(b);
|
|
1467
|
+
const similarity = jaccardSimilarity(setA, setB);
|
|
1458
1468
|
return {
|
|
1459
1469
|
name: "fuzzy-similarity",
|
|
1460
1470
|
value: Math.round(similarity * 100) / 100,
|
|
1461
|
-
details: { method: "jaccard", expectedTokens:
|
|
1471
|
+
details: { method: "jaccard", expectedTokens: setA.size, actualTokens: setB.size }
|
|
1462
1472
|
};
|
|
1463
1473
|
};
|
|
1464
1474
|
function stringify(value) {
|
|
@@ -1479,7 +1489,163 @@ function jaccardSimilarity(a, b) {
|
|
|
1479
1489
|
}
|
|
1480
1490
|
|
|
1481
1491
|
// src/scorers/llm-judge.ts
|
|
1492
|
+
var import_openai2 = __toESM(require("openai"), 1);
|
|
1493
|
+
|
|
1494
|
+
// src/providers/openai.ts
|
|
1482
1495
|
var import_openai = __toESM(require("openai"), 1);
|
|
1496
|
+
var import_zod_to_json_schema = require("zod-to-json-schema");
|
|
1497
|
+
|
|
1498
|
+
// src/providers/shared.ts
|
|
1499
|
+
var SCHEMA_SYSTEM_MESSAGE = "Respond with valid JSON matching the requested schema.";
|
|
1500
|
+
/**
 * Converts raw model text into the task output.
 *
 * @param {string} rawContent - Text returned by the model.
 * @param {boolean} hasSchema - Whether the task requested structured (JSON) output.
 * @returns {*} The parsed JSON value when a schema was requested and the text
 *   parses; otherwise the raw text unchanged.
 */
function parseSchemaOutput(rawContent, hasSchema) {
  if (hasSchema) {
    try {
      return JSON.parse(rawContent);
    } catch {
      // Not valid JSON: fall through and hand back the raw text.
    }
  }
  return rawContent;
}
|
|
1508
|
+
|
|
1509
|
+
// src/providers/openai.ts
|
|
1510
|
+
var REQUEST_TIMEOUT_MS = 6e4;
|
|
1511
|
+
/**
 * Creates a provider that talks to the OpenAI chat completions API.
 *
 * @param {string} model - Model name; used for requests and in the `openai/<model>` id.
 * @param {object} [options] - Optional overrides: `apiKey` (falls back to
 *   `OPENAI_API_KEY`), `baseURL`, and `timeoutMs` (falls back to `REQUEST_TIMEOUT_MS`).
 * @returns {object} The provider produced by `makeProvider`.
 */
function openai(model, options) {
  const clientConfig = {
    apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
    baseURL: options?.baseURL,
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const sdkClient = new import_openai.default(clientConfig);
  const providerId = `openai/${model}`;
  return makeProvider(providerId, "OpenAI", model, sdkClient, model);
}
|
|
1519
|
+
/**
 * Creates a provider for any endpoint that speaks the OpenAI chat API.
 *
 * The API key is taken from `options.apiKey`, then from the environment
 * variable named by `options.apiKeyEnv`, and finally defaults to the
 * placeholder "no-key". When `options.free` is set, zero-cost pricing is
 * registered for the provider id.
 *
 * @param {object} options - `id`, `name`, `model`, `baseURL`, plus optional
 *   `apiKey`, `apiKeyEnv`, `timeoutMs`, `free`, `stripThinking`.
 * @returns {object} The provider produced by `makeProvider`.
 */
function openaiCompatible(options) {
  const { id, name, model, baseURL, stripThinking } = options;
  const keyFromEnv = options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0;
  const apiKey = options.apiKey ?? keyFromEnv ?? "no-key";
  const sdkClient = new import_openai.default({
    apiKey,
    baseURL,
    timeout: options.timeoutMs ?? REQUEST_TIMEOUT_MS
  });
  if (options.free) {
    const zeroPricing = { inputPerToken: 0, outputPerToken: 0 };
    registerPricing(id, zeroPricing);
  }
  return makeProvider(id, name, model, sdkClient, model, stripThinking);
}
|
|
1531
|
+
/**
 * Creates a provider backed by an Azure OpenAI deployment.
 *
 * @param {string} model - Model name; used in the `azure/<model>` id and as the
 *   default deployment name.
 * @param {object} [options] - Optional overrides: `deployment`, `apiKey`,
 *   `endpoint`, `apiVersion`, `timeoutMs`. Key, endpoint and API version fall
 *   back to the `AZURE_OPENAI_*` environment variables.
 * @returns {object} The provider produced by `makeProvider`; requests are sent
 *   using the deployment name.
 */
function azureOpenai(model, options) {
  const env = process.env;
  const deployment = options?.deployment ?? model;
  const clientConfig = {
    apiKey: options?.apiKey ?? env.AZURE_OPENAI_API_KEY,
    endpoint: options?.endpoint ?? env.AZURE_OPENAI_ENDPOINT,
    apiVersion: options?.apiVersion ?? env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
    deployment,
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const sdkClient = new import_openai.AzureOpenAI(clientConfig);
  return makeProvider(`azure/${model}`, "Azure OpenAI", model, sdkClient, deployment);
}
|
|
1542
|
+
/**
 * Builds a provider object around an OpenAI-style chat client.
 *
 * @param {string} id - Provider id reported on results.
 * @param {string} name - Human-readable provider name.
 * @param {string} model - Model name exposed on the provider.
 * @param {object} client - Client exposing `chat.completions.create(params, opts)`.
 * @param {string} requestModel - Value sent as `model` in each request.
 * @param {boolean} [stripThinking] - When truthy, `<think>…</think>` blocks are
 *   removed from the final text before it is parsed.
 * @returns {{id: string, name: string, model: string, run: Function}} Provider
 *   whose `run(input)` resolves to `{ output, usage, latencyMs, raw, toolCalls }`.
 */
function makeProvider(id, name, model, client, requestModel, stripThinking) {
  return {
    id,
    name,
    model,
    /**
     * Runs one prompt: sends the request, executes any requested tool calls
     * through their local handlers, and sends one follow-up request carrying
     * the tool results.
     *
     * @param {object} input - `prompt`, optional `schema`, `tools`, `signal`.
     * @returns {Promise<object>} The task result.
     */
    async run(input) {
      const start = Date.now();
      const params = {
        model: requestModel,
        messages: [{ role: "user", content: input.prompt }]
      };
      if (input.schema) {
        // JSON mode plus a system instruction asking for schema-shaped JSON.
        params.response_format = { type: "json_object" };
        params.messages = [
          { role: "system", content: SCHEMA_SYSTEM_MESSAGE },
          ...params.messages
        ];
      }
      if (input.tools?.length) {
        params.tools = input.tools.map(toolDefToOpenAI);
        params.tool_choice = "auto";
      }
      const response = await client.chat.completions.create(params, { signal: input.signal });
      let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
      let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
      const choice = response.choices[0];
      const toolCallsRaw = choice?.message?.tool_calls;
      const collectedToolCalls = [];
      let finalResponse = response;
      if (toolCallsRaw?.length && input.tools?.length) {
        const toolMessages = [
          ...params.messages,
          choice.message
        ];
        for (const tc of toolCallsRaw) {
          const toolDef = input.tools.find((t) => t.name === tc.function.name);
          let args;
          try {
            args = JSON.parse(tc.function.arguments);
          } catch {
            // Keep the raw argument string when the model emitted invalid JSON.
            args = tc.function.arguments;
          }
          let result;
          if (toolDef?.handler) {
            result = await toolDef.handler(args);
          }
          collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
          toolMessages.push({
            role: "tool",
            tool_call_id: tc.id,
            content: JSON.stringify(result ?? {})
          });
        }
        const followUpParams = {
          model: requestModel,
          messages: toolMessages
        };
        // Keep JSON mode on the follow-up so schema tasks that used tools
        // still get a JSON final answer.
        if (params.response_format) {
          followUpParams.response_format = params.response_format;
        }
        const followUp = await client.chat.completions.create(followUpParams, { signal: input.signal });
        totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
        totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
        finalResponse = followUp;
      }
      const latencyMs = Date.now() - start;
      const finalChoice = finalResponse.choices[0];
      let rawContent = finalChoice?.message?.content ?? "";
      if (stripThinking) {
        // Global flag: remove every reasoning block, not just the first one.
        rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/g, "");
      }
      const output = parseSchemaOutput(rawContent, !!input.schema);
      return {
        output,
        usage: {
          // A zero total is reported as undefined (no usage data).
          promptTokens: totalPromptTokens || void 0,
          completionTokens: totalCompletionTokens || void 0
        },
        latencyMs,
        raw: finalResponse,
        toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
      };
    }
  };
}
|
|
1623
|
+
/**
 * Creates a provider for Google models through Google's OpenAI-compatible
 * endpoint.
 *
 * @param {string} model - Model name; used for requests and in the `google/<model>` id.
 * @param {object} [options] - Optional `apiKey` (falls back to `GOOGLE_API_KEY`)
 *   and `timeoutMs`.
 * @returns {object} The provider produced by `makeProvider`.
 * @throws {Error} When no API key is available from options or environment.
 */
function gemini(model, options) {
  const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
  if (apiKey) {
    const sdkClient = new import_openai.default({
      apiKey,
      baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
      timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
    });
    return makeProvider(`google/${model}`, "Google AI", model, sdkClient, model);
  }
  // Fail at construction time so a missing key surfaces before any request.
  throw new Error(
    `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
  );
}
|
|
1637
|
+
/**
 * Converts a tool definition into the OpenAI `tools` entry format.
 *
 * @param {object} tool - Tool with `name`, `description` and a zod `parameters` schema.
 * @returns {{type: string, function: object}} Function-tool descriptor whose
 *   `parameters` is the JSON Schema generated with the "openAi" target.
 */
function toolDefToOpenAI(tool) {
  const { name, description, parameters } = tool;
  const { zodToJsonSchema } = import_zod_to_json_schema;
  const jsonSchema = zodToJsonSchema(parameters, { target: "openAi" });
  return {
    type: "function",
    function: { name, description, parameters: jsonSchema }
  };
}
|
|
1647
|
+
|
|
1648
|
+
// src/scorers/llm-judge.ts
|
|
1483
1649
|
var JUDGE_PROMPT = `You are a strict scoring judge. Evaluate the actual output against the expected output on three criteria. Score each from 0.0 to 1.0 using the full range (not just 0, 0.5, 1).
|
|
1484
1650
|
|
|
1485
1651
|
Criteria:
|
|
@@ -1495,40 +1661,42 @@ conciseness: <number>
|
|
|
1495
1661
|
Task: {task}
|
|
1496
1662
|
Expected: {expected}
|
|
1497
1663
|
Actual: {actual}`;
|
|
1498
|
-
/**
 * Picks the client and model used by the LLM judge scorer.
 *
 * Resolution order:
 *   1. a model whose name starts with "gemini" when `GOOGLE_API_KEY` is set;
 *   2. Azure OpenAI when `OPENAI_API_KEY` is absent but `AZURE_OPENAI_API_KEY` is set;
 *   3. OpenAI when `OPENAI_API_KEY` is set.
 *
 * @param {string} [configModel] - Judge model from configuration; falls back to
 *   `DUELIST_JUDGE_MODEL`, then "gpt-5-mini".
 * @param {number} [timeoutMs=REQUEST_TIMEOUT_MS] - Request timeout passed to the client.
 * @returns {{client: object, model: string}|undefined} The judge client and
 *   model, or undefined when no usable API key is configured.
 */
function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
  const env = process.env;
  const model = configModel ?? env.DUELIST_JUDGE_MODEL ?? "gpt-5-mini";

  const useGemini = model.startsWith("gemini") && env.GOOGLE_API_KEY;
  if (useGemini) {
    const client = new import_openai2.default({
      apiKey: env.GOOGLE_API_KEY,
      baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
      timeout: timeoutMs
    });
    return { client, model };
  }

  const useAzure = !env.OPENAI_API_KEY && env.AZURE_OPENAI_API_KEY;
  if (useAzure) {
    const client = new import_openai2.AzureOpenAI({
      apiKey: env.AZURE_OPENAI_API_KEY,
      endpoint: env.AZURE_OPENAI_ENDPOINT,
      apiVersion: env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
      deployment: model,
      timeout: timeoutMs
    });
    return { client, model };
  }

  const apiKey = env.OPENAI_API_KEY;
  if (!apiKey) return void 0;
  return { client: new import_openai2.default({ apiKey, timeout: timeoutMs }), model };
}
|
|
1524
|
-
function createLlmJudgeScorer(judgeModel) {
|
|
1692
|
+
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1525
1693
|
let cached = void 0;
|
|
1526
1694
|
return async ({ task, result }) => {
|
|
1527
1695
|
if (task.expected === void 0) {
|
|
1528
1696
|
return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
|
|
1529
1697
|
}
|
|
1530
1698
|
if (cached === void 0) {
|
|
1531
|
-
cached = resolveJudgeClient(judgeModel) ?? null;
|
|
1699
|
+
cached = resolveJudgeClient(judgeModel, timeoutMs) ?? null;
|
|
1532
1700
|
}
|
|
1533
1701
|
if (!cached) {
|
|
1534
1702
|
return {
|
|
@@ -1543,8 +1711,7 @@ function createLlmJudgeScorer(judgeModel) {
|
|
|
1543
1711
|
const response = await client.chat.completions.create({
|
|
1544
1712
|
model,
|
|
1545
1713
|
messages: [{ role: "user", content: prompt }],
|
|
1546
|
-
|
|
1547
|
-
max_tokens: 2048
|
|
1714
|
+
max_completion_tokens: 2048
|
|
1548
1715
|
});
|
|
1549
1716
|
const content = response.choices[0]?.message?.content?.trim() ?? "";
|
|
1550
1717
|
const parsed = {};
|
|
@@ -1601,10 +1768,10 @@ var staticScorers = {
|
|
|
1601
1768
|
"fuzzy-similarity": fuzzySimilarityScorer,
|
|
1602
1769
|
"tool-usage": toolUsageScorer
|
|
1603
1770
|
};
|
|
1604
|
-
function resolveScorers(names, judgeModel) {
|
|
1771
|
+
function resolveScorers(names, judgeModel, timeoutMs) {
|
|
1605
1772
|
return names.map((name) => {
|
|
1606
1773
|
if (name === "llm-judge-correctness") {
|
|
1607
|
-
return createLlmJudgeScorer(judgeModel);
|
|
1774
|
+
return createLlmJudgeScorer(judgeModel, timeoutMs);
|
|
1608
1775
|
}
|
|
1609
1776
|
const scorer = staticScorers[name];
|
|
1610
1777
|
if (!scorer) {
|
|
@@ -1615,219 +1782,156 @@ function resolveScorers(names, judgeModel) {
|
|
|
1615
1782
|
}
|
|
1616
1783
|
|
|
1617
1784
|
// src/runner.ts
|
|
1785
|
+
var DEFAULT_TIMEOUT_MS = 6e4;
|
|
1786
|
+
/**
 * Runs `run` with an AbortSignal and rejects if it has not settled after `ms`.
 *
 * On timeout the signal is aborted and the returned promise rejects with
 * "Request timed out after <ms>ms". The timer is cleared as soon as `run`
 * settles, and also when `run` throws synchronously, so no stray timer is
 * left behind. A non-promise return value from `run` is treated as an
 * already-resolved result.
 *
 * @param {(signal: AbortSignal) => *} run - Work to execute; receives the abort signal.
 * @param {number} ms - Timeout in milliseconds.
 * @returns {Promise<*>} Settles with the outcome of `run`, or rejects on timeout.
 */
function withTimeout(run, ms) {
  return new Promise((resolve, reject) => {
    const controller = new AbortController();
    const timer = setTimeout(() => {
      // Abort first so the underlying request is cancelled, then report.
      controller.abort();
      reject(new Error(`Request timed out after ${ms}ms`));
    }, ms);
    let pending;
    try {
      // Promise.resolve accepts promises, thenables and plain values alike.
      pending = Promise.resolve(run(controller.signal));
    } catch (err) {
      // Synchronous failure: release the timer instead of leaving it armed.
      clearTimeout(timer);
      reject(err);
      return;
    }
    pending.then(
      (value) => {
        clearTimeout(timer);
        resolve(value);
      },
      (err) => {
        clearTimeout(timer);
        reject(err);
      }
    );
  });
}
|
|
1618
1805
|
/**
 * Runs every task against every provider for the requested number of runs.
 *
 * Tasks and runs are processed one after another; within a run all providers
 * execute concurrently. A provider or scorer failure is recorded as a result
 * with an `error` message and empty scores rather than aborting the benchmark.
 *
 * @param {object} options - `providers`, `tasks`, `scorers`, `runs`, optional
 *   `onResult` callback and optional `timeout` in ms (defaults to `DEFAULT_TIMEOUT_MS`).
 * @returns {Promise<object[]>} All results, ordered by task, then run, then provider.
 */
async function runBenchmarks(options) {
  const { providers, tasks, scorers, runs, onResult } = options;
  const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;

  // Executes one provider on one task and always yields a result record.
  const executeOnce = async (task, provider, run) => {
    const record = (rest) => ({
      providerId: provider.id,
      taskName: task.name,
      run,
      ...rest
    });
    let outcome;
    try {
      const invoke = (signal) => provider.run({
        prompt: task.prompt,
        schema: task.schema,
        tools: task.tools,
        signal
      });
      const taskResult = await withTimeout(invoke, timeout);
      const pendingScores = scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id));
      const scores = await Promise.all(pendingScores);
      const { output, latencyMs, usage, toolCalls } = taskResult;
      outcome = record({ scores, raw: { output, latencyMs, usage, toolCalls } });
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      outcome = record({ scores: [], error: message, raw: { output: "", latencyMs: 0 } });
    }
    onResult?.(outcome);
    return outcome;
  };

  const results = [];
  for (const task of tasks) {
    let run = 1;
    while (run <= runs) {
      const currentRun = run;
      const batch = await Promise.all(
        providers.map((provider) => executeOnce(task, provider, currentRun))
      );
      results.push(...batch);
      run += 1;
    }
  }
  return results;
}
|
|
1664
1856
|
|
|
1665
|
-
// src/
|
|
1666
|
-
var
|
|
1667
|
-
|
|
1668
|
-
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
}
|
|
1676
|
-
function dim(s) {
|
|
1677
|
-
return `${dimCode}${s}${reset}`;
|
|
1857
|
+
// src/utils/format.ts
|
|
1858
|
+
// Number.prototype.toFixed accepts at most 100 fraction digits.
var MAX_FRACTION_DIGITS = 100;
/**
 * Formats an estimated USD cost for display.
 *
 * - missing values (undefined, null, NaN) render as an em dash;
 * - exactly zero renders as "$0.00";
 * - amounts of at least one cent in magnitude render with two decimals;
 * - smaller amounts keep enough decimals to show about two significant
 *   digits (at least four decimals), with trailing zeros removed.
 *
 * @param {number|undefined|null} usd - Cost in US dollars.
 * @returns {string} Display string, prefixed with "~$" for non-zero amounts.
 */
function formatCost(usd) {
  if (usd == null || Number.isNaN(usd)) return "\u2014";
  if (usd === 0) return "$0.00";
  // Branch on magnitude so negative amounts do not fall into the sub-cent path.
  const magnitude = Math.abs(usd);
  if (magnitude >= 0.01) return `~$${usd.toFixed(2)}`;
  const digits = Math.min(
    MAX_FRACTION_DIGITS,
    Math.max(4, -Math.floor(Math.log10(magnitude)) + 1)
  );
  // Also drop the decimal point when every fractional digit was a zero.
  return `~$${usd.toFixed(digits).replace(/\.?0+$/, "")}`;
}
|
|
1679
|
-
function
|
|
1680
|
-
const
|
|
1681
|
-
|
|
1682
|
-
if (value >= 0.8) return `${green}${str}${reset}`;
|
|
1683
|
-
if (value >= 0.5) return `${yellow}${str}${reset}`;
|
|
1684
|
-
return `${red}${str}${reset}`;
|
|
1869
|
+
/**
 * Formats a numeric difference with an explicit sign.
 *
 * @param {number} delta - Difference to format.
 * @param {number} [precision=4] - Number of decimal places.
 * @returns {string} Fixed-point text; non-negative values get a leading "+".
 */
function formatDelta(delta, precision = 4) {
  const prefix = delta >= 0 ? "+" : "";
  const magnitudeText = delta.toFixed(precision);
  return prefix + magnitudeText;
}
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
const
|
|
1692
|
-
const
|
|
1693
|
-
const
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
const
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
else if (name === "cost") {
|
|
1708
|
-
cols.push({ label: "Cost", width: 12, align: "right" });
|
|
1709
|
-
cols.push({ label: "Tokens", width: 9, align: "right" });
|
|
1710
|
-
} else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
|
|
1711
|
-
else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
|
|
1712
|
-
else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
|
|
1713
|
-
else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
|
|
1714
|
-
else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
|
|
1715
|
-
else cols.push({ label: name, width: 10, align: "right" });
|
|
1716
|
-
}
|
|
1717
|
-
if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
|
|
1718
|
-
const totalWidth = cols.reduce((sum, c) => sum + c.width + 2, 0);
|
|
1719
|
-
console.log(` ${dim(cols.map((c) => pad(c.label, c.width + 2, c.align)).join(""))}`);
|
|
1720
|
-
console.log(` ${dim("\u2500".repeat(totalWidth))}`);
|
|
1721
|
-
for (const provider of providers) {
|
|
1722
|
-
const taskResults = results.filter(
|
|
1723
|
-
(r) => r.taskName === task && r.providerId === provider
|
|
1724
|
-
);
|
|
1725
|
-
const errorResults2 = taskResults.filter((r) => r.error);
|
|
1726
|
-
const successResults = taskResults.filter((r) => !r.error);
|
|
1727
|
-
if (successResults.length === 0 && errorResults2.length > 0) {
|
|
1728
|
-
const cells2 = [pad(provider, 24, "left")];
|
|
1729
|
-
for (const name of scorerNames) {
|
|
1730
|
-
if (name === "cost") {
|
|
1731
|
-
cells2.push(pad("\u2014", 14, "right"));
|
|
1732
|
-
cells2.push(pad("\u2014", 11, "right"));
|
|
1733
|
-
} else cells2.push(pad("\u2014", cols.find((c) => c.label !== "Provider").width + 2, "right"));
|
|
1734
|
-
}
|
|
1735
|
-
if (hasErrors) cells2.push(` ${red}FAIL${reset}`);
|
|
1736
|
-
console.log(` ${cells2.join("")}`);
|
|
1737
|
-
continue;
|
|
1738
|
-
}
|
|
1739
|
-
const avgScores = averageScores(successResults);
|
|
1740
|
-
const avgDetails = averageDetails(successResults);
|
|
1741
|
-
const latencyMs = average(successResults.map((r) => r.raw.latencyMs));
|
|
1742
|
-
const cells = [pad(provider, 24, "left")];
|
|
1743
|
-
for (const name of scorerNames) {
|
|
1744
|
-
if (name === "latency") {
|
|
1745
|
-
cells.push(pad(latencyMs !== void 0 ? `${Math.round(latencyMs)}ms` : "\u2014", 12, "right"));
|
|
1746
|
-
} else if (name === "cost") {
|
|
1747
|
-
cells.push(pad(formatCost(avgDetails.costUsd), 14, "right"));
|
|
1748
|
-
cells.push(pad(avgDetails.totalTokens !== void 0 ? `${avgDetails.totalTokens}` : "\u2014", 11, "right"));
|
|
1749
|
-
} else {
|
|
1750
|
-
const val = avgScores[name];
|
|
1751
|
-
if (val === void 0) cells.push(pad("\u2014", 10, "right"));
|
|
1752
|
-
else cells.push(pad(colorScore(val), 10 + colorLen(colorScore(val)), "right"));
|
|
1753
|
-
}
|
|
1754
|
-
}
|
|
1755
|
-
if (hasErrors) {
|
|
1756
|
-
const failCount = errorResults2.length;
|
|
1757
|
-
cells.push(failCount > 0 ? ` ${yellow}${failCount} err${reset}` : ` ${green}OK${reset}`);
|
|
1758
|
-
}
|
|
1759
|
-
console.log(` ${cells.join("")}`);
|
|
1873
|
+
|
|
1874
|
+
// src/reporter/shared.ts
|
|
1875
|
+
/**
 * Indexes a flat list of benchmark results for the reporters.
 *
 * @param {object[]} results - Results carrying `taskName`, `providerId`,
 *   `scores`, `run` and an optional `error`.
 * @returns {object} `tasks`, `providers` and `scorerNames` in first-seen order;
 *   `grouped` keyed by "<task>::<provider>"; `byProvider` keyed by provider id;
 *   `hasErrors`; and `maxRun`, the highest run number seen.
 */
function groupResults(results) {
  const tasks = new Set();
  const providers = new Set();
  const scorerNames = new Set();
  const grouped = new Map();
  const byProvider = new Map();
  let hasErrors = false;
  let maxRun = 0;

  // Appends `item` to the list stored under `key`, creating it on first use.
  const appendTo = (buckets, key, item) => {
    const bucket = buckets.get(key);
    if (bucket) {
      bucket.push(item);
    } else {
      buckets.set(key, [item]);
    }
  };

  for (const entry of results) {
    const { taskName, providerId } = entry;
    tasks.add(taskName);
    providers.add(providerId);
    for (const { name } of entry.scores) {
      scorerNames.add(name);
    }
    hasErrors = hasErrors || Boolean(entry.error);
    maxRun = entry.run > maxRun ? entry.run : maxRun;
    appendTo(grouped, `${taskName}::${providerId}`, entry);
    appendTo(byProvider, providerId, entry);
  }

  return {
    tasks: Array.from(tasks),
    providers: Array.from(providers),
    scorerNames: Array.from(scorerNames),
    grouped,
    byProvider,
    hasErrors,
    maxRun
  };
}
|
|
1786
|
-
function
|
|
1787
|
-
const
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
const byLatency = providers.map((id) => {
|
|
1800
|
-
const runs = successResults.filter((r) => r.providerId === id);
|
|
1801
|
-
const avg = average(runs.map((r) => r.raw.latencyMs));
|
|
1802
|
-
return { id, avg: avg ?? Infinity };
|
|
1803
|
-
}).sort((a, b) => a.avg - b.avg)[0];
|
|
1804
|
-
if (byLatency && byLatency.avg !== Infinity) {
|
|
1805
|
-
const label = single ? "Avg latency" : `Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}`;
|
|
1806
|
-
console.log(` ${cyan}\u25C6${reset} ${label} (avg ${Math.round(byLatency.avg)}ms)`);
|
|
1807
|
-
}
|
|
1808
|
-
const byCost = providers.map((id) => {
|
|
1809
|
-
const runs = successResults.filter((r) => r.providerId === id);
|
|
1810
|
-
const costs = runs.map((r) => {
|
|
1811
|
-
const s = r.scores.find((s2) => s2.name === "cost");
|
|
1812
|
-
return s && s.value >= 0 ? s.value : void 0;
|
|
1813
|
-
}).filter((c) => c !== void 0);
|
|
1814
|
-
const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
|
|
1815
|
-
return { id, avg };
|
|
1816
|
-
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
1817
|
-
if (byCost?.avg !== void 0) {
|
|
1818
|
-
const label = single ? "Avg cost" : `Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}`;
|
|
1819
|
-
console.log(` ${cyan}\u25C6${reset} ${label} (avg ${formatCost(byCost.avg)})`);
|
|
1913
|
+
/**
 * Summarizes one provider's runs of one task.
 *
 * @param {string} providerId - Provider to summarize.
 * @param {Map<string, object[]>} grouped - Results keyed by "<task>::<provider>".
 * @param {string} task - Task name.
 * @returns {object} `providerId`, `avgScores`, `avgDetails`, `latencyMs`,
 *   `allErrors` and `errorCount`. With no successful run the averages are
 *   empty/undefined and `allErrors` is true only if at least one run failed.
 */
function aggregateProviderTask(providerId, grouped, task) {
  const runs = grouped.get(`${task}::${providerId}`) ?? [];
  const failed = [];
  const succeeded = [];
  for (const entry of runs) {
    (entry.error ? failed : succeeded).push(entry);
  }
  const errorCount = failed.length;

  if (succeeded.length > 0) {
    return {
      providerId,
      avgScores: averageScores(succeeded),
      avgDetails: averageDetails(succeeded),
      latencyMs: average(succeeded.map((entry) => entry.raw.latencyMs)),
      allErrors: false,
      errorCount
    };
  }

  // Nothing succeeded: report empty aggregates.
  return {
    providerId,
    avgScores: {},
    avgDetails: { costUsd: void 0, totalTokens: void 0 },
    latencyMs: void 0,
    allErrors: errorCount > 0,
    errorCount
  };
}
|
|
1832
1936
|
function averageScores(results) {
|
|
1833
1937
|
const sums = {};
|
|
@@ -1871,38 +1975,89 @@ function average(nums) {
|
|
|
1871
1975
|
if (nums.length === 0) return void 0;
|
|
1872
1976
|
return nums.reduce((a, b) => a + b, 0) / nums.length;
|
|
1873
1977
|
}
|
|
1874
|
-
function
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
if (
|
|
1878
|
-
|
|
1879
|
-
|
|
1978
|
+
/**
 * Computes per-column values plus best/worst markers across providers.
 *
 * Columns are emitted in this order: "latency" (if requested), then "cost" and
 * "tokens" (if "cost" is requested), then every other scorer in the order
 * given. Lower is better for latency, cost and tokens; higher is better for
 * all other scorers. Providers whose runs all failed contribute `undefined`
 * values and are ignored when picking best/worst.
 *
 * @param {object[]} providerData - Aggregates with `providerId`, `allErrors`,
 *   `latencyMs`, `avgDetails` and `avgScores`.
 * @param {string[]} scorerNames - Scorers that were run.
 * @returns {Map<string, {values: Map, best: (number|undefined), worst: (number|undefined)}>}
 */
function computeColumnStats(providerData, scorerNames) {
  const usable = providerData.filter((entry) => !entry.allErrors);

  // Builds one column from an accessor and a direction.
  const buildColumn = (pick, lowerIsBetter) => {
    const values = new Map();
    for (const entry of providerData) {
      values.set(entry.providerId, entry.allErrors ? void 0 : pick(entry));
    }
    const present = usable.map((entry) => pick(entry)).filter((v) => v !== void 0);
    if (present.length === 0) {
      return { values, best: void 0, worst: void 0 };
    }
    const lowest = Math.min(...present);
    const highest = Math.max(...present);
    return lowerIsBetter ? { values, best: lowest, worst: highest } : { values, best: highest, worst: lowest };
  };

  const stats = new Map();
  if (scorerNames.includes("latency")) {
    stats.set("latency", buildColumn((entry) => entry.latencyMs, true));
  }
  if (scorerNames.includes("cost")) {
    stats.set("cost", buildColumn((entry) => entry.avgDetails.costUsd, true));
    stats.set("tokens", buildColumn((entry) => entry.avgDetails.totalTokens, true));
  }
  scorerNames
    .filter((name) => name !== "latency" && name !== "cost")
    .forEach((name) => {
      stats.set(name, buildColumn((entry) => entry.avgScores[name], false));
    });
  return stats;
}
|
|
1881
|
-
function
|
|
1882
|
-
|
|
1883
|
-
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
1892
|
-
|
|
1893
|
-
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
2028
|
+
/**
 * Awards gold/silver/bronze to providers by number of columns won outright.
 *
 * A column counts as a win only when exactly one provider holds its best
 * value. Providers with equal win counts share a rank; ranks skip after ties.
 * Providers with zero wins get "none". With fewer than two providers, or no
 * outright wins at all, everyone gets "none".
 *
 * @param {Map<string, {values: Map, best: *}>} columnStats - Per-column statistics.
 * @param {string[]} providerIds - Providers taking part.
 * @returns {Map<string, string>} Provider id to "gold" | "silver" | "bronze" | "none".
 */
function computeMedals(columnStats, providerIds) {
  const medals = new Map();
  const awardNone = () => {
    providerIds.forEach((id) => medals.set(id, "none"));
    return medals;
  };
  if (providerIds.length < 2) return awardNone();

  const winCounts = new Map(providerIds.map((id) => [id, 0]));
  let totalWins = 0;
  for (const [, column] of columnStats) {
    const { best, values } = column;
    if (best === void 0) continue;
    const leaders = [...values.entries()].filter(([, score]) => score !== void 0 && score === best);
    // Shared first place earns nobody a win.
    if (leaders.length !== 1) continue;
    const [[winnerId]] = leaders;
    winCounts.set(winnerId, (winCounts.get(winnerId) ?? 0) + 1);
    totalWins += 1;
  }
  if (totalWins === 0) return awardNone();

  // Most wins first; ties broken by id so the ordering is deterministic.
  const standings = [...winCounts.entries()].sort(
    ([idA, winsA], [idB, winsB]) => winsB - winsA || idA.localeCompare(idB)
  );
  const podium = ["gold", "silver", "bronze"];
  let rank = 0;
  standings.forEach(([id, wins], position) => {
    const previous = standings[position - 1];
    if (previous !== void 0 && wins < previous[1]) {
      rank = position;
    }
    const earned = wins > 0 && rank < podium.length;
    medals.set(id, earned ? podium[rank] : "none");
  });
  return medals;
}
|
|
1907
2062
|
function providerLabel(providerId) {
|
|
1908
2063
|
const prefix = providerId.split("/")[0];
|
|
@@ -1955,6 +2110,369 @@ function providerLabel(providerId) {
|
|
|
1955
2110
|
return `(${prefix})`;
|
|
1956
2111
|
}
|
|
1957
2112
|
}
|
|
2113
|
+
/**
 * Suggests how to fix an authentication failure for a provider.
 *
 * @param {string} providerId - Provider id; the part before the first "/"
 *   selects the hint.
 * @param {string} error - Error message to inspect (case-insensitive).
 * @returns {string|undefined} A setup hint when the message looks like an
 *   authentication problem, otherwise undefined.
 */
function apiKeyHint(providerId, error) {
  const message = error.toLowerCase();
  const authMarkers = ["api key", "401", "unauthorized", "authentication", "incorrect api key", "apikey"];
  if (!authMarkers.some((marker) => message.includes(marker))) {
    return void 0;
  }
  const hints = new Map([
    ["openai", "Set: export OPENAI_API_KEY=sk-..."],
    ["azure", "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=..."],
    ["anthropic", "Set: export ANTHROPIC_API_KEY=sk-ant-..."],
    ["google", "Set: export GOOGLE_API_KEY=..."]
  ]);
  const [prefix] = providerId.split("/");
  return hints.get(prefix) ?? `Check the API key for ${providerId}`;
}
|
|
2131
|
+
/**
 * Find the provider with the highest average value for one scorer.
 *
 * Only scores whose `name` equals `scorerName` and whose `value` is >= 0 are
 * averaged; providers without any such score are ignored. On a tie the
 * provider that appears first in `providers` wins.
 *
 * @param {Map<string, Array<{scores: Array<{name: string, value: number}>}>>} successByProvider
 *   Runs per provider id.
 * @param {string[]} providers - Provider ids to consider, in priority order.
 * @param {string} scorerName - Scorer to rank by.
 * @returns {{id: string, avg: number} | undefined} The leader, or undefined
 *   when no provider has a usable score.
 */
function rankProviders(successByProvider, providers, scorerName) {
  let leader;
  for (const id of providers) {
    let total = 0;
    let count = 0;
    for (const run of successByProvider.get(id) ?? []) {
      for (const score of run.scores) {
        if (score.name === scorerName && score.value >= 0) {
          total += score.value;
          count += 1;
        }
      }
    }
    if (count === 0) continue;
    const avg = total / count;
    // Strict ">" keeps the earliest provider when averages are equal.
    if (leader === void 0 || avg > leader.avg) leader = { id, avg };
  }
  return leader;
}
|
|
2140
|
+
/**
 * Map a built-in scorer name to its short column heading.
 *
 * @param {string} name - Scorer name.
 * @returns {string} The short label, or `name` itself for scorers without one.
 */
function scorerLabel(name) {
  const shortLabels = /* @__PURE__ */ new Map([
    ["correctness", "Match"],
    ["schema-correctness", "Schema"],
    ["fuzzy-similarity", "Fuzzy"],
    ["llm-judge-correctness", "Judge"],
    ["tool-usage", "Tool"]
  ]);
  return shortLabels.has(name) ? shortLabels.get(name) : name;
}
|
|
2156
|
+
/**
 * Translate a medal tier into the emoji shown next to a provider.
 *
 * @param {"gold" | "silver" | "bronze" | "none"} medal - Medal tier.
 * @returns {string | undefined} The emoji, "" for "none", and undefined for
 *   any value outside the four known tiers.
 */
function medalEmoji(medal) {
  const emojiByMedal = /* @__PURE__ */ new Map([
    ["gold", "\u{1F947}"],
    ["silver", "\u{1F948}"],
    ["bronze", "\u{1F949}"],
    ["none", ""]
  ]);
  return emojiByMedal.get(medal);
}
|
|
2168
|
+
|
|
2169
|
+
// src/reporter/console.ts
|
|
2170
|
+
// ANSI SGR escape sequences used by the console reporter. Every sequence has
// the form ESC [ <code> m; `reset` (code 0) clears whatever the others set.
var [reset, boldCode, dimCode, green, red, yellow, cyan, brightGreen, brightWhite] = [
  0, // reset
  1, // bold
  2, // dim
  32, // green
  31, // red
  33, // yellow
  36, // cyan
  92, // bright green
  97 // bright white
].map((code) => `\x1B[${code}m`);
|
|
2179
|
+
/**
 * Wrap a value in the bold escape sequence, followed by a reset.
 *
 * @param {unknown} s - Value to render; converted to a string.
 * @returns {string} `boldCode`, the text, then `reset`.
 */
function bold(s) {
  return boldCode.concat(s, reset);
}
|
|
2182
|
+
/**
 * Wrap a value in the dim escape sequence, followed by a reset.
 *
 * @param {unknown} s - Value to render; converted to a string.
 * @returns {string} `dimCode`, the text, then `reset`.
 */
function dim(s) {
  return dimCode.concat(s, reset);
}
|
|
2185
|
+
/**
 * Remove ANSI SGR sequences (ESC [ ... m) from a string.
 *
 * @param {string} s - Text that may contain colour/style escapes.
 * @returns {string} The text with every such sequence deleted.
 */
function stripAnsi(s) {
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  return s.replace(sgrSequence, "");
}
|
|
2188
|
+
/**
 * Estimate how many terminal columns a string occupies.
 *
 * ANSI styling is ignored. Code points at or above U+1F000 and code points in
 * U+2600..U+27BF count as two columns; everything else counts as one.
 *
 * @param {string} s - Text, possibly containing ANSI escapes.
 * @returns {number} Estimated column count.
 */
function displayWidth(s) {
  let columns = 0;
  // for...of walks code points, so surrogate pairs are counted once.
  for (const glyph of stripAnsi(s)) {
    const cp = glyph.codePointAt(0) ?? 0;
    const isWide = cp >= 0x1f000 || (cp >= 0x2600 && cp <= 0x27bf);
    columns += isWide ? 2 : 1;
  }
  return columns;
}
|
|
2199
|
+
/**
 * Pad a cell with spaces up to a display width. Text already at or beyond the
 * target width is returned unchanged (it is never truncated).
 *
 * @param {string} str - Cell text, possibly containing ANSI escapes.
 * @param {number} targetWidth - Desired width in terminal columns.
 * @param {string} align - "right" pads on the left; anything else pads on the right.
 * @returns {string} The padded text.
 */
function padCell(str, targetWidth, align) {
  const gap = " ".repeat(Math.max(0, targetWidth - displayWidth(str)));
  return align === "right" ? gap + str : str + gap;
}
|
|
2205
|
+
/**
 * Build the two halves of a fixed-width progress bar.
 *
 * The ratio is clamped to [0, 1]. A ratio that is not a number (NaN or
 * undefined) is drawn as an empty bar, so `fill` and `track` together always
 * span `width` characters and table columns stay aligned.
 *
 * @param {number} ratio - Fraction of the bar to fill.
 * @param {number} [width=8] - Total bar length in characters.
 * @returns {{fill: string, track: string}} Filled part and the remaining track.
 */
function sparkBar(ratio, width = 8) {
  const clamped = Math.max(0, Math.min(1, ratio));
  // Math.min/Math.max propagate NaN; without this guard both repeat() calls
  // would receive NaN and the bar would collapse to zero characters.
  const fillLen = Number.isNaN(clamped) ? 0 : Math.round(clamped * width);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(width - fillLen);
  return { fill, track };
}
|
|
2212
|
+
/**
 * Draw one horizontal rule of a box-drawing table.
 *
 * "bottom" and "merge" produce an unbroken rule across the whole table;
 * "top" breaks the rule at each column boundary with a downward tee; any
 * other position (e.g. "header") breaks it with a cross.
 *
 * @param {number[]} widths - Column content widths (each cell adds one space of padding per side).
 * @param {string} position - "top", "bottom", "merge", or anything else for an inner rule.
 * @returns {string} The dimmed rule.
 */
function drawTableLine(widths, position) {
  const rule = (length) => "\u2500".repeat(length);
  if (position === "bottom" || position === "merge") {
    // Padded column widths plus one character per inner separator.
    const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
    const [left, right] = position === "bottom" ? ["\u2514", "\u2518"] : ["\u251C", "\u2524"];
    return dim(`${left}${rule(totalInner)}${right}`);
  }
  const segments = widths.map((w) => rule(w + 2));
  return position === "top"
    ? dim(`\u250C${segments.join("\u252C")}\u2510`)
    : dim(`\u251C${segments.join("\u253C")}\u2524`);
}
|
|
2226
|
+
/**
 * Draw one table row: every cell padded to its column width, with one space
 * of padding on each side and dimmed vertical bars between and around cells.
 *
 * @param {string[]} cells - Cell contents, one per column.
 * @param {number[]} widths - Column widths, parallel to `cells`.
 * @param {string[]} aligns - Column alignments, parallel to `cells`.
 * @returns {string} The rendered row.
 */
function drawTableRow(cells, widths, aligns) {
  const padded = cells.map((cell, i) => ` ${padCell(cell, widths[i], aligns[i])} `);
  const bar = dim("\u2502");
  return `${bar}${padded.join(bar)}${bar}`;
}
|
|
2232
|
+
/**
 * Draw a row whose single cell spans every column of the table.
 *
 * @param {string} content - Text to show, possibly containing ANSI escapes.
 * @param {number[]} widths - Column widths of the table the row belongs to.
 * @returns {string} The rendered row, padded to the table's inner width.
 */
function drawSpanRow(content, widths) {
  // Inner width = padded columns plus one character per inner separator.
  let totalInner = widths.length - 1;
  for (const w of widths) totalInner += w + 2;
  // The extra 1 accounts for the space printed before the content.
  const filler = " ".repeat(Math.max(0, totalInner - displayWidth(content) - 1));
  const edge = dim("\u2502");
  return `${edge} ${content}${filler}${edge}`;
}
|
|
2238
|
+
/**
 * Colour a cell according to how its value ranks within its column.
 *
 * The best value is bright green and bold, the worst is red, anything in
 * between is yellow. The text is returned uncoloured when there are fewer
 * than two providers, when the column has no best/worst, or when best and
 * worst are equal. A missing value is rendered as a dimmed em dash.
 *
 * @param {string} text - Already formatted cell text.
 * @param {number | undefined} value - Raw value behind `text`.
 * @param {{best?: number, worst?: number}} colStats - Column extremes.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} The (possibly coloured) cell text.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === void 0) return dim("\u2014");
  if (providerCount < 2) return text;
  const { best, worst } = colStats;
  const unranked = best === void 0 || worst === void 0 || best === worst;
  if (unranked) return text;
  if (value === best) return `${brightGreen}${boldCode}${text}${reset}`;
  const tint = value === worst ? red : yellow;
  return `${tint}${text}${reset}`;
}
|
|
2247
|
+
/**
 * Describe the columns of a task table: the provider column, one or two
 * columns per scorer, and a trailing status column when any run failed.
 * The status column is the only non-provider column without a `statsKey`.
 */
function consoleReporterColumns(scorerNames, providerWidth, showSparklines, hasErrors) {
  const cols = [{ label: "Provider", width: providerWidth, align: "left" }];
  for (const name of scorerNames) {
    if (name === "latency") {
      cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
    } else if (name === "cost") {
      // The cost scorer contributes a token-count column as well.
      cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
      cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
    } else {
      cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
    }
  }
  if (hasErrors) {
    cols.push({ label: "Status", width: 8, align: "left" });
  }
  return cols;
}

/**
 * Render one non-provider cell for a provider's aggregated task data.
 * Columns are identified by `statsKey` (not by label), so a custom scorer
 * whose label happens to be "Status" is still rendered as a score.
 */
function consoleReporterCell(col, pd, columnStats, providerCount, showSparklines) {
  const key = col.statsKey;
  const missing = () => dim("\u2014");
  if (key === void 0) {
    if (pd.allErrors) return `${red}FAIL${reset}`;
    return pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`;
  }
  if (pd.allErrors) return missing();

  const colStats = columnStats.get(key);
  const ranked = (text, value) => colStats ? colorByRank(text, value, colStats, providerCount) : text;
  if (key === "latency") {
    const ms = pd.latencyMs;
    return ms === void 0 ? missing() : ranked(`${Math.round(ms)}ms`, ms);
  }
  if (key === "cost") {
    const cost = pd.avgDetails.costUsd;
    return cost === void 0 ? missing() : ranked(formatCost(cost), cost);
  }
  if (key === "tokens") {
    const tokens = pd.avgDetails.totalTokens;
    return tokens === void 0 ? missing() : ranked(`${tokens}`, tokens);
  }

  // Every other column is a 0..1 score shown as a percentage.
  const val = pd.avgScores[key];
  if (val === void 0) return missing();
  const pctStr = `${Math.round(val * 100)}%`.padStart(4);
  const thresholdColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
  // With several providers the percentage is coloured by rank; with a single
  // provider (or no column stats) it is coloured by absolute threshold.
  const coloredPct = providerCount >= 2 && colStats
    ? colorByRank(pctStr, val, colStats, providerCount)
    : `${thresholdColor}${pctStr}${reset}`;
  if (!showSparklines) return coloredPct;
  const { fill, track } = sparkBar(val);
  return `${coloredPct} ${thresholdColor}${fill}${reset}${dim(track)}`;
}

/**
 * Print the "Errors" section: each distinct (provider, message) pair once,
 * in first-seen order, with an occurrence count and an API-key hint when the
 * message looks like an authentication failure.
 */
function consoleReporterErrors(errorResults) {
  if (errorResults.length === 0) return;
  // Nested maps count occurrences in one pass and cannot confuse two pairs
  // the way a concatenated "provider::message" key can.
  const byProviderId = /* @__PURE__ */ new Map();
  const distinct = [];
  for (const r of errorResults) {
    let byMessage = byProviderId.get(r.providerId);
    if (byMessage === void 0) {
      byMessage = /* @__PURE__ */ new Map();
      byProviderId.set(r.providerId, byMessage);
    }
    let entry = byMessage.get(r.error);
    if (entry === void 0) {
      entry = { providerId: r.providerId, error: r.error, count: 0 };
      byMessage.set(r.error, entry);
      distinct.push(entry);
    }
    entry.count += 1;
  }
  console.log(` ${bold("Errors")}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  for (const { providerId, error, count } of distinct) {
    const suffix = count > 1 ? ` (\xD7${count})` : "";
    console.log(` ${red}\u2716${reset} ${providerId}: ${error}${suffix}`);
    const hint = apiKeyHint(providerId, error ?? "");
    if (hint) console.log(` ${dim(hint)}`);
  }
  console.log("");
}

/**
 * Print benchmark results to the console: one table per task, an overall
 * summary, the distinct errors, and a pricing note when the cost scorer ran.
 *
 * @param {Array<object>} results - Benchmark results (grouped via `groupResults`).
 * @param {{sparklines?: boolean}} [options] - `sparklines` (default true)
 *   adds a bar next to each percentage score.
 * @returns {void}
 */
function consoleReporter(results, options) {
  const showSparklines = options?.sparklines ?? true;
  if (results.length === 0) {
    console.log("\nNo results to display.\n");
    return;
  }
  const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runLabel = maxRun > 1 ? ` ${dim(`(${maxRun} runs each)`)}` : "";
  console.log("");
  console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");

  // The layout depends only on the providers and scorers, so it is computed
  // once and shared by every task table.
  const maxProviderLen = Math.max(...providers.map((id) => id.length));
  const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
  const cols = consoleReporterColumns(scorerNames, providerWidth, showSparklines, hasErrors);
  const widths = cols.map((c) => c.width);
  const aligns = cols.map((c) => c.align);
  const headerCells = cols.map((c) => bold(c.label));
  const dataCols = cols.slice(1);

  for (const task of tasks) {
    console.log(` ${bold(`Task: ${task}`)}`);
    console.log("");
    const providerData = providers.map(
      (providerId) => aggregateProviderTask(providerId, grouped, task)
    );
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    console.log(` ${drawTableLine(widths, "top")}`);
    console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
    console.log(` ${drawTableLine(widths, "header")}`);
    for (const pd of providerData) {
      const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
      const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
      const cells = [
        providerCell,
        ...dataCols.map((col) => consoleReporterCell(col, pd, columnStats, providers.length, showSparklines))
      ];
      console.log(` ${drawTableRow(cells, widths, aligns)}`);
    }
    if (multi && providerData.some((p) => !p.allErrors)) {
      const winnerId = [...medals.entries()].find(([, m]) => m === "gold")?.[0];
      if (winnerId) {
        console.log(` ${drawTableLine(widths, "merge")}`);
        const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
        console.log(` ${drawSpanRow(winnerText, widths)}`);
      }
    }
    console.log(` ${drawTableLine(widths, "bottom")}`);
    console.log("");
  }

  printSummary(results, providers, byProvider);
  consoleReporterErrors(results.filter((r) => r.error));
  if (hasCost) {
    console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
    console.log("");
  }
}
|
|
2401
|
+
/**
 * Print the cross-task summary: best correctness, latency and cost, plus an
 * overall winner when several providers are compared. Nothing is printed
 * when every result carries an error.
 *
 * @param {Array<object>} results - All benchmark results.
 * @param {string[]} providers - Provider ids, in display order.
 * @param {Map<string, Array<object>>} byProvider - Results grouped by provider id.
 * @returns {void}
 */
function printSummary(results, providers, byProvider) {
  const succeeded = results.filter((r) => !r.error);
  if (succeeded.length === 0) return;
  const successByProvider = new Map(
    providers.map((id) => [id, (byProvider.get(id) ?? []).filter((r) => !r.error)])
  );

  console.log(` ${bold("Summary")}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");

  const single = providers.length === 1;
  const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
  // One summary line: a plain average for a single provider, otherwise the
  // name of the provider that leads the category.
  const announce = (soloLabel, duelLabel, id, shown) => {
    const value = `${brightGreen}${boldCode}${shown}${reset}`;
    if (single) {
      console.log(` ${medal} ${soloLabel}: ${value}`);
    } else {
      console.log(` ${medal} ${duelLabel}: ${bold(id)} ${dim(providerLabel(id))} ${value}`);
    }
  };

  // Prefer the LLM-judge score whenever at least one run produced a usable one.
  const judged = succeeded.some(
    (r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
  );
  const correctnessKey = judged ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
  if (byCorrectness) {
    announce("Avg correctness", "Most correct", byCorrectness.id, `${Math.round(byCorrectness.avg * 100)}%`);
  }

  const latencyBoard = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    // Providers without a latency average sort last via Infinity.
    return { id, avg: average(runs.map((r) => r.raw.latencyMs)) ?? Infinity };
  });
  latencyBoard.sort((a, b) => a.avg - b.avg);
  const [byLatency] = latencyBoard;
  const hasLatencyWinner = byLatency !== void 0 && byLatency.avg !== Infinity;
  if (hasLatencyWinner) {
    announce("Avg latency", "Fastest", byLatency.id, `${Math.round(byLatency.avg)}ms`);
  }

  const costBoard = [];
  for (const id of providers) {
    const costs = [];
    for (const run of successByProvider.get(id) ?? []) {
      const score = run.scores.find((s) => s.name === "cost");
      if (score && score.value >= 0) costs.push(score.value);
    }
    if (costs.length > 0) {
      costBoard.push({ id, avg: costs.reduce((a, b) => a + b, 0) / costs.length });
    }
  }
  costBoard.sort((a, b) => a.avg - b.avg);
  const [byCost] = costBoard;
  if (byCost !== void 0) {
    announce("Avg cost", "Cheapest", byCost.id, formatCost(byCost.avg));
  }

  if (!single) {
    const categoryWinners = [];
    if (byCorrectness) categoryWinners.push(byCorrectness.id);
    if (hasLatencyWinner) categoryWinners.push(byLatency.id);
    if (byCost !== void 0) categoryWinners.push(byCost.id);
    const wins = new Map(providers.map((id) => [id, 0]));
    for (const id of categoryWinners) {
      wins.set(id, (wins.get(id) ?? 0) + 1);
    }
    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const leaders = [...wins].filter(([, count]) => count === maxWins);
      console.log("");
      if (leaders.length === 1) {
        const [[winnerId, winCount]] = leaders;
        console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
      } else {
        const names = leaders.map(([id]) => bold(id)).join(dim(", "));
        console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
      }
    }
  }
  console.log("");
}
|
|
1958
2476
|
|
|
1959
2477
|
// src/reporter/json.ts
|
|
1960
2478
|
function jsonReporter(results) {
|
|
@@ -1989,7 +2507,7 @@ function defineArena(config) {
|
|
|
1989
2507
|
throw new Error("At least one task is required");
|
|
1990
2508
|
}
|
|
1991
2509
|
const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
|
|
1992
|
-
const scorerFns = resolveScorers(scorerNames, config.judgeModel);
|
|
2510
|
+
const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
|
|
1993
2511
|
const runs = config.runs ?? 1;
|
|
1994
2512
|
return {
|
|
1995
2513
|
config,
|
|
@@ -1999,141 +2517,13 @@ function defineArena(config) {
|
|
|
1999
2517
|
tasks: config.tasks,
|
|
2000
2518
|
scorers: scorerFns,
|
|
2001
2519
|
runs,
|
|
2520
|
+
timeout: config.timeout,
|
|
2002
2521
|
onResult: options?.onResult
|
|
2003
2522
|
});
|
|
2004
2523
|
}
|
|
2005
2524
|
};
|
|
2006
2525
|
}
|
|
2007
2526
|
|
|
2008
|
-
// src/providers/openai.ts
|
|
2009
|
-
var import_openai2 = __toESM(require("openai"), 1);
|
|
2010
|
-
var import_zod_to_json_schema = require("zod-to-json-schema");
|
|
2011
|
-
/**
 * Create a provider backed by the OpenAI API.
 *
 * @param {string} model - Model name; also used as the request model.
 * @param {{apiKey?: string, baseURL?: string}} [options] - `apiKey` falls
 *   back to the OPENAI_API_KEY environment variable.
 * @returns {object} Provider with id `openai/<model>`.
 */
function openai(model, options) {
  const apiKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
  const baseURL = options?.baseURL;
  const client = new import_openai2.default({ apiKey, baseURL });
  return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
}
|
|
2018
|
-
/**
 * Create a provider for any endpoint that speaks the OpenAI chat API.
 *
 * The API key is taken from `options.apiKey`, then from the environment
 * variable named by `options.apiKeyEnv`, and finally defaults to "no-key".
 * When `options.free` is set, zero-cost pricing is registered for the id.
 *
 * @param {object} options - `id`, `name`, `model`, `baseURL`, and optionally
 *   `apiKey`, `apiKeyEnv`, `free`, `stripThinking`.
 * @returns {object} Provider with id `options.id`.
 */
function openaiCompatible(options) {
  const envKey = options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0;
  const apiKey = options.apiKey ?? envKey ?? "no-key";
  const client = new import_openai2.default({ apiKey, baseURL: options.baseURL });
  if (options.free) {
    const zeroCost = { inputPerToken: 0, outputPerToken: 0 };
    registerPricing(options.id, zeroCost);
  }
  const { id, name, model, stripThinking } = options;
  return makeProvider(id, name, model, client, model, stripThinking);
}
|
|
2029
|
-
/**
 * Create a provider backed by Azure OpenAI.
 *
 * Each setting falls back from `options` to its AZURE_OPENAI_* environment
 * variable; the API version finally defaults to "2024-12-01-preview" and the
 * deployment defaults to the model name.
 *
 * @param {string} model - Model name used in the provider id.
 * @param {{apiKey?: string, endpoint?: string, apiVersion?: string, deployment?: string}} [options]
 * @returns {object} Provider with id `azure/<model>` that sends requests to the deployment.
 */
function azureOpenai(model, options) {
  const env = process.env;
  const deployment = options?.deployment ?? model;
  const clientConfig = {
    apiKey: options?.apiKey ?? env.AZURE_OPENAI_API_KEY,
    endpoint: options?.endpoint ?? env.AZURE_OPENAI_ENDPOINT,
    apiVersion: options?.apiVersion ?? env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
    deployment
  };
  const client = new import_openai2.AzureOpenAI(clientConfig);
  return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
}
|
|
2039
|
-
/**
 * Build a provider object around an OpenAI-style chat client.
 *
 * `run(input)` sends the prompt, executes any tool calls the model requests
 * (one follow-up request with the tool results), and returns the final
 * content together with summed token usage and wall-clock latency.
 *
 * @param {string} id - Provider id.
 * @param {string} name - Human-readable provider name.
 * @param {string} model - Model name reported on the provider.
 * @param {object} client - Client exposing `chat.completions.create(params)`.
 * @param {string} requestModel - Model/deployment name sent in requests.
 * @param {boolean} [stripThinking] - Remove every `<think>…</think>` block
 *   from the final content before it is returned or parsed.
 * @returns {{id: string, name: string, model: string, run: Function}}
 */
function makeProvider(id, name, model, client, requestModel, stripThinking) {
  return {
    id,
    name,
    model,
    async run(input) {
      const start = Date.now();
      const params = {
        model: requestModel,
        messages: [{ role: "user", content: input.prompt }]
      };
      if (input.schema) {
        params.response_format = { type: "json_object" };
        params.messages = [
          { role: "system", content: "Respond with valid JSON matching the requested schema." },
          ...params.messages
        ];
      }
      if (input.tools?.length) {
        params.tools = input.tools.map(toolDefToOpenAI);
        params.tool_choice = "auto";
      }
      const response = await client.chat.completions.create(params);
      let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
      let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
      const choice = response.choices[0];
      const toolCallsRaw = choice?.message?.tool_calls;
      const collectedToolCalls = [];
      let finalResponse = response;
      if (toolCallsRaw?.length && input.tools?.length) {
        const toolMessages = [
          ...params.messages,
          choice.message
        ];
        for (const tc of toolCallsRaw) {
          const toolDef = input.tools.find((t) => t.name === tc.function.name);
          let args;
          try {
            args = JSON.parse(tc.function.arguments);
          } catch {
            // Best effort: hand malformed arguments to the handler as raw text.
            args = tc.function.arguments;
          }
          let result;
          if (toolDef?.handler) {
            result = await toolDef.handler(args);
          }
          collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
          toolMessages.push({
            role: "tool",
            tool_call_id: tc.id,
            content: JSON.stringify(result ?? {})
          });
        }
        const followUp = await client.chat.completions.create({
          model: requestModel,
          messages: toolMessages
        });
        totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
        totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
        finalResponse = followUp;
      }
      const latencyMs = Date.now() - start;
      const finalChoice = finalResponse.choices[0];
      let rawContent = finalChoice?.message?.content ?? "";
      if (stripThinking) {
        // Global flag: a response may contain several reasoning blocks, and
        // all of them must go before the content is scored or parsed.
        rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/g, "");
      }
      let output = rawContent;
      if (input.schema) {
        try {
          output = JSON.parse(rawContent);
        } catch {
          // Best effort: keep the raw text so scorers can still inspect it.
        }
      }
      return {
        output,
        usage: {
          // A total of 0 is reported as "unknown" (undefined).
          promptTokens: totalPromptTokens || void 0,
          completionTokens: totalCompletionTokens || void 0
        },
        latencyMs,
        raw: finalResponse,
        toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
      };
    }
  };
}
|
|
2126
|
-
/**
 * Convert a tool definition into the OpenAI function-tool format, turning
 * its zod parameter schema into JSON Schema (target "openAi").
 *
 * @param {{name: string, description: string, parameters: object}} tool
 * @returns {{type: "function", function: {name: string, description: string, parameters: object}}}
 */
function toolDefToOpenAI(tool) {
  const { name, description } = tool;
  const parameters = (0, import_zod_to_json_schema.zodToJsonSchema)(tool.parameters, { target: "openAi" });
  return { type: "function", function: { name, description, parameters } };
}
|
|
2136
|
-
|
|
2137
2527
|
// src/providers/anthropic.ts
|
|
2138
2528
|
var import_sdk = __toESM(require("@anthropic-ai/sdk"), 1);
|
|
2139
2529
|
function anthropic(model, options) {
|
|
@@ -2147,23 +2537,17 @@ function anthropic(model, options) {
|
|
|
2147
2537
|
model,
|
|
2148
2538
|
async run(input) {
|
|
2149
2539
|
const start = Date.now();
|
|
2150
|
-
const systemMessage = input.schema ?
|
|
2540
|
+
const systemMessage = input.schema ? SCHEMA_SYSTEM_MESSAGE : void 0;
|
|
2151
2541
|
const response = await client.messages.create({
|
|
2152
2542
|
model,
|
|
2153
2543
|
max_tokens: maxTokens,
|
|
2154
2544
|
system: systemMessage,
|
|
2155
2545
|
messages: [{ role: "user", content: input.prompt }]
|
|
2156
|
-
});
|
|
2546
|
+
}, { signal: input.signal });
|
|
2157
2547
|
const latencyMs = Date.now() - start;
|
|
2158
2548
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2159
2549
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
2160
|
-
|
|
2161
|
-
if (input.schema) {
|
|
2162
|
-
try {
|
|
2163
|
-
output = JSON.parse(rawContent);
|
|
2164
|
-
} catch {
|
|
2165
|
-
}
|
|
2166
|
-
}
|
|
2550
|
+
const output = parseSchemaOutput(rawContent, !!input.schema);
|
|
2167
2551
|
return {
|
|
2168
2552
|
output,
|
|
2169
2553
|
usage: {
|
|
@@ -2177,31 +2561,1025 @@ function anthropic(model, options) {
|
|
|
2177
2561
|
};
|
|
2178
2562
|
}
|
|
2179
2563
|
|
|
2180
|
-
// src/
|
|
2181
|
-
var
|
|
2182
|
-
function
|
|
2183
|
-
const
|
|
2184
|
-
|
|
2185
|
-
|
|
2186
|
-
|
|
2187
|
-
|
|
2564
|
+
// src/reporter/markdown.ts
|
|
2565
|
+
// HTML comment emitted as the first line of every markdown report (see markdownReporter).
var COMMENT_MARKER = "<!-- duelist-ci-report -->";
|
|
2566
|
+
/**
 * Render a CI report as Markdown.
 *
 * The output starts with the comment marker and a pass/fail heading, then
 * includes (each only when non-empty) the comparison table, the cost
 * summary, the flaky results and the failure reasons, and ends with a footer.
 *
 * @param {object} report - Report with `failed`, `comparisons`, `cost`,
 *   `flakyResults` and `failureReasons`.
 * @param {unknown} _current - Unused.
 * @returns {string} The Markdown document.
 */
function markdownReporter(report, _current) {
  const out = [COMMENT_MARKER, ""];
  const verdict = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  out.push(`## \u2B21 Agent Duelist CI \u2014 ${verdict}`, "");

  if (report.comparisons.length > 0) {
    out.push(markdownComparisonTable(report.comparisons), "");
  }

  const { cost } = report;
  // A configured budget is reported even when nothing was spent.
  if (cost.totalUsd > 0 || cost.budget !== void 0) {
    out.push(markdownCostSummary(cost), "");
  }

  if (report.flakyResults.length > 0) {
    const bullets = report.flakyResults.map(
      (f) => `- **${f.providerId}** \xD7 ${f.taskName} \u2192 ${f.scorerName} (CV = ${f.current.cv.toFixed(2)})`
    );
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      "",
      ...bullets,
      ""
    );
  }

  if (report.failureReasons.length > 0) {
    const bullets = report.failureReasons.map((reason) => `- ${reason}`);
    out.push("### Failure Reasons", "", ...bullets, "");
  }

  out.push("---", "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
  return out.join("\n");
}
|
|
2601
|
+
/**
 * Render baseline-vs-current comparisons as a Markdown table.
 *
 * Provider, task and scorer names are escaped so that a "|" or a line break
 * inside a user-defined name cannot split the row into extra cells or lines.
 *
 * @param {Array<object>} comparisons - Entries with `providerId`, `taskName`,
 *   `scorerName`, `baseline` (stats or null), `current` (stats) and `delta`
 *   (number or null).
 * @returns {string} Header, separator and one row per comparison.
 */
function markdownComparisonTable(comparisons) {
  // "\|" is the table-cell escape for a literal pipe; line breaks would end the row.
  const cell = (value) => String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
  const lines = [];
  lines.push("| Provider | Task | Scorer | Baseline | Current | Delta | Status |");
  lines.push("|----------|------|--------|----------|---------|-------|--------|");
  for (const c of comparisons) {
    const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
    const currentStr = formatStats(c.current);
    const deltaStr = c.delta !== null ? formatDelta(c.delta, 3) : "\u2014";
    const status = statusIndicator(c);
    lines.push(`| ${cell(c.providerId)} | ${cell(c.taskName)} | ${cell(c.scorerName)} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
  }
  return lines.join("\n");
}
|
|
2614
|
+
/**
 * Render the cost section of the Markdown report: the total, the budget
 * usage when a budget is set, and a per-provider table when more than one
 * provider incurred cost.
 *
 * @param {{totalUsd: number, budget?: number, overBudget?: boolean, perProvider: Map<string, number>}} cost
 * @returns {string} The Markdown section.
 */
function markdownCostSummary(cost) {
  const out = ["### \u{1F4B0} Cost Summary", "", `**Total:** $${cost.totalUsd.toFixed(4)}`];

  if (cost.budget !== void 0) {
    // A zero (or negative) budget has no meaningful usage percentage.
    const pct = cost.budget > 0 ? (cost.totalUsd / cost.budget * 100).toFixed(0) : "\u221E";
    const indicator = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    out.push(`**Budget:** $${cost.budget.toFixed(2)} (${pct}% used) ${indicator}`);
  }

  if (cost.perProvider.size > 1) {
    const rows = [...cost.perProvider].map(([provider, usd]) => `| ${provider} | $${usd.toFixed(4)} |`);
    out.push("", "| Provider | Cost |", "|----------|------|", ...rows);
  }

  return out.join("\n");
}
|
|
2634
|
+
/**
 * Format summary statistics for a table cell: the mean to three decimals,
 * followed by "± half the 95% interval width" when there is more than one sample.
 *
 * @param {{n: number, mean: number, ci95Lower?: number, ci95Upper?: number}} stats
 * @returns {string} e.g. "0.500 ± 0.100" or "0.500".
 */
function formatStats(stats) {
  const mean = stats.mean.toFixed(3);
  if (!(stats.n > 1)) return mean;
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${mean} \xB1 ${halfWidth.toFixed(3)}`;
}
|
|
2641
|
+
/**
 * Pick the status label for a comparison row. Precedence: regressed,
 * improved, new (no baseline), unchanged.
 *
 * @param {{regressed?: boolean, improved?: boolean, baseline: object | null}} c
 * @returns {string} Emoji plus label.
 */
function statusIndicator(c) {
  return c.regressed
    ? "\u{1F534} regressed"
    : c.improved
      ? "\u{1F7E2} improved"
      : c.baseline === null
        ? "\u{1F195} new"
        : "\u26AA unchanged";
}
|
|
2647
|
+
|
|
2648
|
+
// src/reporter/html.ts
|
|
2649
|
+
/**
 * Escape a string for safe inclusion in HTML text and quoted attribute values.
 *
 * Replaces &, <, >, " and ' with their entities in a single pass, so an
 * ampersand introduced by one replacement is never escaped a second time.
 *
 * @param {string} s - Untrusted text (provider ids, task names, error messages…).
 * @returns {string} The HTML-escaped text.
 */
function esc(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  return s.replace(/[&<>"']/g, (ch) => entities[ch]);
}
|
|
2652
|
+
function htmlReporter(results) {
|
|
2653
|
+
if (results.length === 0) {
|
|
2654
|
+
return emptyReport();
|
|
2655
|
+
}
|
|
2656
|
+
const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
|
|
2657
|
+
const hasCost = scorerNames.includes("cost");
|
|
2658
|
+
const multi = providers.length >= 2;
|
|
2659
|
+
const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";
|
|
2660
|
+
const taskSections = tasks.map((task) => {
|
|
2661
|
+
const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
|
|
2662
|
+
const columnStats = computeColumnStats(providerData, scorerNames);
|
|
2663
|
+
const medals = computeMedals(columnStats, providers);
|
|
2664
|
+
const winnerId = multi ? [...medals.entries()].find(([, m]) => m === "gold")?.[0] : void 0;
|
|
2665
|
+
return { task, providerData, columnStats, medals, winnerId };
|
|
2192
2666
|
});
|
|
2193
|
-
|
|
2667
|
+
const successResults = results.filter((r) => !r.error);
|
|
2668
|
+
const successByProvider = /* @__PURE__ */ new Map();
|
|
2669
|
+
for (const id of providers) {
|
|
2670
|
+
successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
|
|
2671
|
+
}
|
|
2672
|
+
const correctnessKey = successResults.some(
|
|
2673
|
+
(r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
|
|
2674
|
+
) ? "llm-judge-correctness" : "correctness";
|
|
2675
|
+
const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
|
|
2676
|
+
const byLatency = providers.map((id) => {
|
|
2677
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2678
|
+
const avg = average(runs.map((r) => r.raw.latencyMs));
|
|
2679
|
+
return { id, avg: avg ?? Infinity };
|
|
2680
|
+
}).sort((a, b) => a.avg - b.avg)[0];
|
|
2681
|
+
const byCost = providers.map((id) => {
|
|
2682
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2683
|
+
const costs = runs.map((r) => {
|
|
2684
|
+
const s = r.scores.find((s2) => s2.name === "cost");
|
|
2685
|
+
return s && s.value >= 0 ? s.value : void 0;
|
|
2686
|
+
}).filter((c) => c !== void 0);
|
|
2687
|
+
const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
|
|
2688
|
+
return { id, avg };
|
|
2689
|
+
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
2690
|
+
let overallWinner;
|
|
2691
|
+
if (multi) {
|
|
2692
|
+
const wins = /* @__PURE__ */ new Map();
|
|
2693
|
+
for (const id of providers) wins.set(id, 0);
|
|
2694
|
+
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
2695
|
+
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2696
|
+
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2697
|
+
const maxWins = Math.max(...wins.values());
|
|
2698
|
+
if (maxWins > 0) {
|
|
2699
|
+
const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2700
|
+
if (tops.length === 1) overallWinner = tops[0][0];
|
|
2701
|
+
}
|
|
2702
|
+
}
|
|
2703
|
+
const errorResults = results.filter((r) => r.error);
|
|
2704
|
+
const deduped = dedupeErrors(errorResults);
|
|
2705
|
+
return `<!DOCTYPE html>
|
|
2706
|
+
<html lang="en">
|
|
2707
|
+
<head>
|
|
2708
|
+
<meta charset="UTF-8">
|
|
2709
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
2710
|
+
<title>Agent Duelist Report</title>
|
|
2711
|
+
<meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${providers.length !== 1 ? "s" : ""}, ${tasks.length} task${tasks.length !== 1 ? "s" : ""}">
|
|
2712
|
+
<meta property="og:title" content="Agent Duelist Report">
|
|
2713
|
+
<meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
|
|
2714
|
+
<meta property="og:type" content="website">
|
|
2715
|
+
${renderStyle()}
|
|
2716
|
+
</head>
|
|
2717
|
+
<body>
|
|
2718
|
+
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
|
|
2719
|
+
<div class="report">
|
|
2720
|
+
|
|
2721
|
+
${renderHeader(runsLabel, providers.length, tasks.length)}
|
|
2722
|
+
|
|
2723
|
+
${tasks.length > 1 ? renderTabs(tasks) : ""}
|
|
2724
|
+
|
|
2725
|
+
<main>
|
|
2726
|
+
${taskSections.map((s, i) => renderTaskSection(
|
|
2727
|
+
s.task,
|
|
2728
|
+
s.providerData,
|
|
2729
|
+
s.columnStats,
|
|
2730
|
+
s.medals,
|
|
2731
|
+
s.winnerId,
|
|
2732
|
+
scorerNames,
|
|
2733
|
+
hasCost,
|
|
2734
|
+
multi,
|
|
2735
|
+
i
|
|
2736
|
+
)).join("\n")}
|
|
2737
|
+
</main>
|
|
2738
|
+
|
|
2739
|
+
${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}
|
|
2740
|
+
|
|
2741
|
+
${deduped.length > 0 ? renderErrors(deduped) : ""}
|
|
2742
|
+
|
|
2743
|
+
${renderFooter()}
|
|
2744
|
+
|
|
2745
|
+
</div>
|
|
2746
|
+
${renderScript(tasks.length)}
|
|
2747
|
+
</body>
|
|
2748
|
+
</html>`;
|
|
2749
|
+
}
|
|
2750
|
+
/**
 * Builds the standalone HTML document used when there is nothing to report.
 *
 * The page reuses the shared stylesheet, header and footer renderers and
 * shows a single "No results to display." message where the task tables
 * would normally be. The header is rendered with a "0 runs" label and zero
 * providers/tasks.
 *
 * @returns {string} A complete HTML document.
 */
function emptyReport() {
  const headLines = [
    "<head>",
    '<meta charset="UTF-8">',
    '<meta name="viewport" content="width=device-width, initial-scale=1">',
    "<title>Agent Duelist Report</title>",
    renderStyle(),
    "</head>"
  ];
  const bodyLines = [
    "<body>",
    '<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>',
    '<div class="report">',
    renderHeader("0 runs", 0, 0),
    '<main><p class="empty-msg">No results to display.</p></main>',
    renderFooter(),
    "</div>",
    "</body>"
  ];
  return ["<!DOCTYPE html>", '<html lang="en">', ...headLines, ...bodyLines, "</html>"].join("\n");
}
|
|
2769
|
+
/**
 * Collapses repeated failures into one entry per (provider, error message).
 *
 * Entries keep first-seen order. Each entry records the provider id, the
 * error text (or "Unknown error" when the result carries none), how many
 * times that pair occurred, and whatever `apiKeyHint` returns for it.
 *
 * @param {Iterable<{providerId: string, error?: string}>} errorResults
 *   Results that failed.
 * @returns {Array<{providerId: string, error: string, count: number, hint: *}>}
 */
function dedupeErrors(errorResults) {
  const entriesByKey = new Map();
  for (const result of errorResults) {
    const { providerId, error } = result;
    const dedupeKey = `${providerId}::${error}`;
    const known = entriesByKey.get(dedupeKey);
    if (known) {
      known.count++;
      continue;
    }
    entriesByKey.set(dedupeKey, {
      providerId,
      error: error ?? "Unknown error",
      count: 1,
      hint: apiKeyHint(providerId, error ?? "")
    });
  }
  return Array.from(entriesByKey.values());
}
|
|
2787
|
+
/**
 * Returns the report's inline stylesheet wrapped in a `<style>` element.
 *
 * The stylesheet is static: theme variables on `:root`, the animated
 * background mesh, header, task tabs/sections, results table, score bars,
 * rank colours, winner banner, summary cards, error list, footer, empty
 * state, and a narrow-screen media query.
 *
 * @returns {string} The `<style>…</style>` markup.
 */
function renderStyle() {
  const stylesheet = `<style>
:root {
--bg: #0f172a;
--bg-deep: #020617;
--panel: rgba(15, 23, 42, 0.85);
--accent: #f59e0b;
--accent-soft: rgba(245, 158, 11, 0.15);
--text: #e2e8f0;
--muted: #94a3b8;
--border: rgba(148, 163, 184, 0.15);
--green: #22c55e;
--red: #ef4444;
--yellow: #eab308;
--radius: 12px;
--mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
--sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
html, body {
font-family: var(--sans);
background: var(--bg);
color: var(--text);
min-height: 100vh;
}
body { padding: 24px; display: flex; justify-content: center; }

/* Animated gradient mesh */
.bg-mesh {
position: fixed; inset: 0; z-index: 0;
overflow: hidden; pointer-events: none;
}
.bg-mesh::before, .bg-mesh::after {
content: ""; position: absolute; border-radius: 50%;
filter: blur(120px); opacity: 0.4;
}
.bg-mesh::before {
width: 600px; height: 600px;
background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
top: -10%; left: -5%;
animation: meshDrift1 18s ease-in-out infinite alternate;
}
.bg-mesh::after {
width: 500px; height: 500px;
background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
bottom: -10%; right: -5%;
animation: meshDrift2 22s ease-in-out infinite alternate;
}
.bg-mesh-extra {
position: absolute; width: 400px; height: 400px;
border-radius: 50%; filter: blur(100px); opacity: 0.3;
background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
top: 50%; left: 60%;
animation: meshDrift3 15s ease-in-out infinite alternate;
}
@keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
@keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
@keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }

/* Report container */
.report {
position: relative; z-index: 1;
width: 100%; max-width: 960px;
}

/* Header */
.report-header {
display: flex; justify-content: space-between; align-items: center;
padding: 20px 0; margin-bottom: 8px;
}
.report-brand {
display: flex; align-items: center; gap: 10px;
text-decoration: none; color: var(--muted);
font-weight: 600; font-size: 14px;
letter-spacing: 0.04em; text-transform: uppercase;
}
.report-brand:hover { color: var(--text); }
.brand-icon {
width: 32px; height: 32px; border-radius: 8px;
background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
border: 1px solid rgba(245,158,11,0.3);
display: flex; align-items: center; justify-content: center;
font-size: 16px;
}
.report-meta {
font-size: 12px; color: var(--muted);
text-align: right; line-height: 1.6;
}

/* Task tabs */
.task-tabs {
display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
}
.task-tab {
padding: 6px 16px; border-radius: 999px;
border: 1px solid var(--border);
background: transparent; color: var(--muted);
font-size: 13px; font-weight: 500; cursor: pointer;
transition: all 150ms ease;
}
.task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
.task-tab.active {
background: var(--accent-soft);
border-color: rgba(245,158,11,0.4);
color: var(--accent);
}

/* Task sections */
.task-section { display: none; }
.task-section.active { display: block; }
.task-name {
font-size: 18px; font-weight: 600;
margin-bottom: 12px; letter-spacing: -0.01em;
}

/* Results table */
.results-table {
width: 100%; border-collapse: collapse;
font-size: 13px; margin-bottom: 16px;
border-radius: var(--radius); overflow: hidden;
border: 1px solid var(--border);
}
.results-table th, .results-table td {
padding: 10px 14px;
text-align: left;
border-bottom: 1px solid var(--border);
}
.results-table th {
background: rgba(0,0,0,0.3);
font-size: 11px; font-weight: 600;
text-transform: uppercase; letter-spacing: 0.05em;
color: var(--muted); cursor: pointer;
user-select: none; white-space: nowrap;
}
.results-table th:hover { color: var(--text); }
.results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
.results-table tbody tr {
background: var(--panel);
transition: background 120ms ease;
}
.results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
.results-table tbody tr:last-child td { border-bottom: none; }

/* Score cell with progress bar */
.score-cell { position: relative; min-width: 90px; }
.score-bar {
position: absolute; left: 0; bottom: 0;
height: 3px; border-radius: 2px;
transition: width 300ms ease;
}
.score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }

/* Color ranking */
.rank-best { color: var(--green); font-weight: 600; }
.rank-worst { color: var(--red); }
.rank-mid { color: var(--yellow); }
.rank-neutral { color: var(--text); }
.rank-error { color: var(--muted); }

/* Winner banner */
.task-winner {
display: flex; align-items: center; gap: 10px;
padding: 12px 18px; margin-bottom: 20px;
border-radius: var(--radius);
background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
border: 1px solid rgba(34,197,94,0.2);
font-size: 14px; font-weight: 500;
}
.task-winner .trophy { font-size: 20px; }
.task-winner .winner-name { color: var(--green); font-weight: 600; }
.task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }

/* Summary cards */
.summary-section { margin-top: 32px; }
.summary-title {
font-size: 16px; font-weight: 600;
margin-bottom: 12px; color: var(--text);
}
.summary-cards {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 12px;
}
.summary-card {
padding: 16px; border-radius: var(--radius);
border: 1px solid var(--border);
background: var(--panel);
}
.summary-card .card-label {
font-size: 11px; font-weight: 600;
text-transform: uppercase; letter-spacing: 0.05em;
color: var(--muted); margin-bottom: 6px;
}
.summary-card .card-value {
font-size: 20px; font-weight: 700;
color: var(--green); font-family: var(--mono);
}
.summary-card .card-provider {
font-size: 12px; color: var(--muted); margin-top: 4px;
}

/* Errors */
.errors-section { margin-top: 24px; }
.errors-title {
font-size: 16px; font-weight: 600;
margin-bottom: 8px; color: var(--red);
cursor: pointer;
}
.errors-list {
border-radius: var(--radius);
border: 1px solid rgba(239,68,68,0.2);
background: rgba(239,68,68,0.04);
overflow: hidden;
}
.error-item {
padding: 10px 16px;
border-bottom: 1px solid rgba(239,68,68,0.1);
font-size: 13px;
}
.error-item:last-child { border-bottom: none; }
.error-provider { font-weight: 600; color: var(--text); }
.error-msg { color: var(--muted); margin-left: 8px; }
.error-count { color: var(--muted); font-size: 11px; }
.error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }

/* Footer */
.report-footer {
margin-top: 40px; padding: 20px 0;
border-top: 1px solid var(--border);
display: flex; justify-content: space-between; align-items: center;
flex-wrap: wrap; gap: 12px;
}
.footer-brand {
font-size: 13px; color: var(--muted);
}
.footer-brand a {
color: var(--accent); text-decoration: none; font-weight: 500;
}
.footer-brand a:hover { text-decoration: underline; }
.footer-cta {
display: inline-flex; align-items: center; gap: 6px;
padding: 6px 14px; border-radius: 8px;
background: var(--accent-soft);
border: 1px solid rgba(245,158,11,0.3);
color: var(--accent); font-size: 12px; font-weight: 500;
text-decoration: none;
transition: transform 120ms ease, box-shadow 120ms ease;
}
.footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }

/* Empty state */
.empty-msg {
text-align: center; color: var(--muted);
padding: 60px 20px; font-size: 16px;
}

/* Responsive */
@media (max-width: 640px) {
body { padding: 12px; }
.report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
.report-meta { text-align: left; }
.summary-cards { grid-template-columns: 1fr; }
.results-table { font-size: 12px; }
.results-table th, .results-table td { padding: 8px 10px; }
.report-footer { flex-direction: column; align-items: flex-start; }
}
</style>`;
  return stylesheet;
}
|
|
3055
|
+
/**
 * Renders the report header: the brand link plus a meta block with the
 * provider count, task count, runs label and the generation time.
 *
 * The timestamp is the current time taken from `Date#toISOString`, trimmed
 * to second precision and suffixed with " UTC".
 *
 * @param {string} runsLabel - Human-readable runs description; HTML-escaped.
 * @param {number} providerCount - Number of providers in the report.
 * @param {number} taskCount - Number of tasks in the report.
 * @returns {string} The `<header>` markup.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  const pluralSuffix = (count) => (count !== 1 ? "s" : "");
  const isoNow = new Date().toISOString();
  const now = `${isoNow.replace("T", " ").slice(0, 19)} UTC`;
  const repoUrl = "https://github.com/DataGobes/agent-duelist";
  return [
    '<header class="report-header">',
    `<a class="report-brand" href="${repoUrl}" target="_blank" rel="noopener">`,
    '<div class="brand-icon">⬡</div>',
    "<span>Agent Duelist</span>",
    "</a>",
    '<div class="report-meta">',
    `${providerCount} provider${pluralSuffix(providerCount)} ·`,
    `${taskCount} task${pluralSuffix(taskCount)} ·`,
    `${esc(runsLabel)}<br>`,
    `${esc(now)}`,
    "</div>",
    "</header>"
  ].join("\n");
}
|
|
3070
|
+
/**
 * Renders the tab strip used to switch between task sections.
 *
 * One button is emitted per task, carrying its position in `data-task`;
 * the first button is marked active.
 *
 * @param {string[]} tasks - Task names, in display order; HTML-escaped.
 * @returns {string} The `<nav>` markup.
 */
function renderTabs(tasks) {
  const toButton = (taskName, position) => {
    const activeClass = position === 0 ? " active" : "";
    return `<button class="task-tab${activeClass}" data-task="${position}">${esc(taskName)}</button>`;
  };
  const buttons = tasks.map(toButton).join("\n ");
  return ['<nav class="task-tabs">', buttons, "</nav>"].join("\n");
}
|
|
3078
|
+
/**
 * Renders one task's section: heading, results table and, when a winner is
 * given, the winner banner.
 *
 * Columns are "Provider" followed by one or more columns per scorer name:
 * "latency" → Latency, "cost" → Cost and Tokens, anything else → a score
 * column labelled via `scorerLabel`. Providers whose runs all failed get a
 * dash in every data column; other cells come from `renderDataCell`.
 *
 * @param {string} task - Task name; HTML-escaped.
 * @param {Array<object>} providerData - Per-provider aggregates (reads
 *   `providerId` and `allErrors`; passed through to `renderDataCell`).
 * @param {Map<string, object>} columnStats - Per-column stats, passed through.
 * @param {Map<string, string>} medals - Medal per provider id.
 * @param {string|undefined} winnerId - Winning provider id, if any.
 * @param {Iterable<string>} scorerNames - Scorer names in column order.
 * @param {boolean} _hasCost - Unused.
 * @param {boolean} multi - Passed through to `renderDataCell`.
 * @param {number} index - Section position; section 0 is rendered active.
 * @returns {string} The `<section>` markup.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  const plainColumn = (label, key) => ({ label, key, isScore: false });
  const dataColumns = Array.from(scorerNames).flatMap((name) => {
    switch (name) {
      case "latency":
        return [plainColumn("Latency", "latency")];
      case "cost":
        return [plainColumn("Cost", "cost"), plainColumn("Tokens", "tokens")];
      default:
        return [{ label: scorerLabel(name), key: name, isScore: true }];
    }
  });
  const columns = [plainColumn("Provider", "provider"), ...dataColumns];

  const headerCells = columns
    .map(({ key, label }) => `<th data-col="${esc(key)}">${esc(label)}<span class="sort-arrow"></span></th>`)
    .join("");

  const failedCell = '<td class="rank-error">—</td>';
  const bodyRows = providerData.map((pd) => {
    const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
    const medalPrefix = medal ? `${medal} ` : "";
    const nameCell = `<td>${medalPrefix}${esc(pd.providerId)}</td>`;
    const valueCells = pd.allErrors
      ? dataColumns.map(() => failedCell)
      : dataColumns.map((col) => renderDataCell(col.key, col.isScore, pd, columnStats, multi));
    return `<tr>${[nameCell, ...valueCells].join("")}</tr>`;
  }).join("\n");

  let winnerBanner = "";
  if (winnerId) {
    winnerBanner = [
      '<div class="task-winner">',
      '<span class="trophy">🏆</span>',
      `<span>Winner: <span class="winner-name">${esc(winnerId)}</span>`,
      `<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>`,
      "</div>"
    ].join("\n");
  }

  const activeClass = index === 0 ? " active" : "";
  return [
    `<section class="task-section${activeClass}" data-task-idx="${index}">`,
    `<h2 class="task-name">${esc(task)}</h2>`,
    '<table class="results-table">',
    `<thead><tr>${headerCells}</tr></thead>`,
    `<tbody>${bodyRows}</tbody>`,
    "</table>",
    winnerBanner,
    "</section>"
  ].join("\n");
}
|
|
3125
|
+
/**
 * Renders a single `<td>` for one provider/column pair.
 *
 * - "latency", "cost" and "tokens" read `pd.latencyMs`,
 *   `pd.avgDetails.costUsd` and `pd.avgDetails.totalTokens` respectively and
 *   expose the raw number in `data-sort-val`.
 * - Any other key is treated as a score read from `pd.avgScores[key]` and is
 *   shown as a rounded percentage with a coloured bar.
 * - A missing value renders as a dash with no `data-sort-val`.
 *
 * Ranking class: when `multi` is set and stats exist for the column, the
 * class comes from `rankClass_`. Otherwise metric columns are neutral and
 * score columns use fixed bands (>= 0.8 best, >= 0.5 mid, else worst). The
 * bar colour always uses the fixed bands.
 *
 * @param {string} key - Column key.
 * @param {boolean} _isScore - Unused.
 * @param {object} pd - Aggregated data for one provider.
 * @param {Map<string, object>} columnStats - Stats keyed by column.
 * @param {boolean} multi - Whether several providers are being compared.
 * @returns {string} The `<td>` markup.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const colStats = columnStats.get(key);
  const emptyCell = '<td class="rank-error">—</td>';
  const relativeRank = (value) => (multi && colStats ? rankClass_(value, colStats) : "rank-neutral");

  switch (key) {
    case "latency": {
      const ms = pd.latencyMs;
      if (ms === undefined) return emptyCell;
      return `<td class="${relativeRank(ms)}" data-sort-val="${ms}">${Math.round(ms)}ms</td>`;
    }
    case "cost": {
      const cost = pd.avgDetails.costUsd;
      if (cost === undefined) return emptyCell;
      return `<td class="${relativeRank(cost)}" data-sort-val="${cost}">${esc(formatCost(cost))}</td>`;
    }
    case "tokens": {
      const tokens = pd.avgDetails.totalTokens;
      if (tokens === undefined) return emptyCell;
      return `<td class="${relativeRank(tokens)}" data-sort-val="${tokens}">${tokens}</td>`;
    }
    default:
      break;
  }

  const val = pd.avgScores[key];
  if (val === undefined) return emptyCell;

  // Fixed quality bands shared by the fallback rank class and the bar colour.
  const byBand = (high, middle, low) => (val >= 0.8 ? high : val >= 0.5 ? middle : low);
  const pct = Math.round(val * 100);
  const rankCls = multi && colStats
    ? rankClass_(val, colStats)
    : byBand("rank-best", "rank-mid", "rank-worst");
  const barColor = byBand("var(--green)", "var(--yellow)", "var(--red)");

  return [
    `<td class="score-cell ${rankCls}" data-sort-val="${val}">`,
    `<span class="score-val">${pct}%</span>`,
    `<div class="score-bar" style="width:${pct}%;background:${barColor}"></div>`,
    "</td>"
  ].join("\n");
}
|
|
3160
|
+
/**
 * Maps a value to a rank CSS class relative to its column's extremes.
 *
 * Returns "rank-neutral" when the column has no best/worst or when they are
 * equal (nothing to distinguish); otherwise "rank-best" / "rank-worst" for
 * an exact match with the respective extreme and "rank-mid" for the rest.
 *
 * @param {number} value - The cell's value.
 * @param {{best?: number, worst?: number}} colStats - Column extremes.
 * @returns {string} One of the `rank-*` class names.
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  const indistinguishable = best === undefined || worst === undefined || best === worst;
  if (indistinguishable) return "rank-neutral";
  switch (value) {
    case best:
      return "rank-best";
    case worst:
      return "rank-worst";
    default:
      return "rank-mid";
  }
}
|
|
3167
|
+
/**
 * Renders the "Summary" section as a grid of cards.
 *
 * Up to four cards are produced, in this order: correctness, latency (only
 * when its average is not Infinity), cost (only when an average exists) and
 * overall winner. With `multi` set, the metric cards are titled as
 * superlatives ("Most Correct", "Fastest", "Cheapest") and name the
 * provider; otherwise they are titled as averages and omit it. The overall
 * winner card always names the provider.
 *
 * @param {{id: string, avg: number}|undefined} byCorrectness
 * @param {{id: string, avg: number}|undefined} byLatency
 * @param {{id: string, avg?: number}|undefined} byCost
 * @param {string|undefined} overallWinner - Winning provider id, if any.
 * @param {boolean} multi - Whether several providers are being compared.
 * @returns {string} The `<section>` markup, or "" when there are no cards.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  const providerLine = (id) =>
    `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  const buildCard = (label, value, providerHtml) => [
    '<div class="summary-card">',
    `<div class="card-label">${label}</div>`,
    `<div class="card-value">${value}</div>`,
    providerHtml,
    "</div>"
  ].join("\n");
  const title = (comparative, average) => (multi ? comparative : average);

  const cards = [];
  if (byCorrectness) {
    const pct = `${Math.round(byCorrectness.avg * 100)}%`;
    cards.push(buildCard(title("Most Correct", "Avg Correctness"), pct, multi ? providerLine(byCorrectness.id) : ""));
  }
  if (byLatency && byLatency.avg !== Infinity) {
    const ms = `${Math.round(byLatency.avg)}ms`;
    cards.push(buildCard(title("Fastest", "Avg Latency"), ms, multi ? providerLine(byLatency.id) : ""));
  }
  if (byCost?.avg !== undefined) {
    const cost = esc(formatCost(byCost.avg));
    cards.push(buildCard(title("Cheapest", "Avg Cost"), cost, multi ? providerLine(byCost.id) : ""));
  }
  if (overallWinner) {
    cards.push(buildCard("Overall Winner", "🏆", providerLine(overallWinner)));
  }

  if (cards.length === 0) return "";
  return [
    '<section class="summary-section">',
    '<h2 class="summary-title">Summary</h2>',
    '<div class="summary-cards">',
    cards.join("\n "),
    "</div>",
    "</section>"
  ].join("\n");
}
|
|
3211
|
+
/**
 * Renders the collapsible "Errors" section.
 *
 * Each entry shows the provider id, the error message, a "(×N)" suffix when
 * the same error occurred more than once, and an optional hint line.
 * Clicking the heading toggles the visibility of the list.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   Deduplicated error entries; provider, message and hint are HTML-escaped.
 * @returns {string} The `<section>` markup.
 */
function renderErrors(errors) {
  const items = errors.map((e) => {
    const suffix = e.count > 1 ? ` <span class="error-count">(×${e.count})</span>` : "";
    const hint = e.hint ? `<div class="error-hint">${esc(e.hint)}</div>` : "";
    return `<div class="error-item">
<span class="error-provider">${esc(e.providerId)}:</span>
<span class="error-msg">${esc(e.error)}</span>${suffix}
${hint}
</div>`;
  }).join("\n");
  // The toggle must hide the list when it is visible; previously both
  // branches resolved to 'block', so the heading could never collapse it.
  return `<section class="errors-section">
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
<div class="errors-list">
${items}
</div>
</section>`;
}
|
|
3228
|
+
/**
 * Renders the static report footer: a "Powered by" credit and a
 * "Star on GitHub" call-to-action, both linking to the project repository
 * in a new tab.
 *
 * @returns {string} The `<footer>` markup.
 */
function renderFooter() {
  const repoUrl = "https://github.com/DataGobes/agent-duelist";
  const linkAttrs = `href="${repoUrl}" target="_blank" rel="noopener"`;
  return [
    '<footer class="report-footer">',
    '<div class="footer-brand">',
    `Powered by <a ${linkAttrs}>Agent Duelist</a>`,
    "</div>",
    `<a class="footer-cta" ${linkAttrs}>`,
    "⭐ Star on GitHub",
    "</a>",
    "</footer>"
  ].join("\n");
}
|
|
3238
|
+
/**
 * Returns the inline `<script>` that makes the report interactive.
 *
 * The emitted script wires up:
 * - tab switching between task sections (only when `taskCount > 1`), and
 * - click-to-sort on every results-table header, toggling between
 *   ascending and descending and showing an arrow on the active column.
 *
 * Sorting compares `data-sort-val` numerically when both cells have one and
 * falls back to text comparison when neither does; a cell without a value
 * (e.g. a failed provider) always sorts after cells that have one.
 *
 * @param {number} taskCount - Number of task sections in the report.
 * @returns {string} The `<script>…</script>` markup.
 */
function renderScript(taskCount) {
  const tabSwitching = taskCount > 1 ? `
  var tabs = document.querySelectorAll('.task-tab');
  var sections = document.querySelectorAll('.task-section');
  tabs.forEach(function(tab) {
    tab.addEventListener('click', function() {
      var idx = parseInt(tab.getAttribute('data-task'), 10);
      if (!sections[idx]) return;
      tabs.forEach(function(t) { t.classList.remove('active'); });
      sections.forEach(function(s) { s.classList.remove('active'); });
      tab.classList.add('active');
      sections[idx].classList.add('active');
    });
  });` : "";
  return `<script>
(function() {
  /* Tab switching */
  ${tabSwitching}

  /* Column sorting */
  document.querySelectorAll('.results-table th').forEach(function(th) {
    var table = th.closest('table');
    var asc = true;
    th.addEventListener('click', function() {
      /* Position of this header inside its own row. The index handed out by
         the document-wide forEach keeps counting across tables, so it is only
         correct for the first table. */
      var colIdx = th.cellIndex;
      var tbody = table.querySelector('tbody');
      var rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort(function(a, b) {
        var aCell = a.children[colIdx];
        var bCell = b.children[colIdx];
        if (!aCell || !bCell) return 0;
        var aVal = aCell.getAttribute('data-sort-val');
        var bVal = bCell.getAttribute('data-sort-val');
        if (aVal !== null && bVal !== null) {
          return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
        }
        /* Exactly one side has no numeric value: keep it at the bottom */
        if (aVal !== null) return -1;
        if (bVal !== null) return 1;
        var aText = aCell.textContent || '';
        var bText = bCell.textContent || '';
        return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
      });
      rows.forEach(function(row) { tbody.appendChild(row); });

      /* Update sort arrows */
      table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
      th.querySelector('.sort-arrow').textContent = asc ? ' \\u25B2' : ' \\u25BC';
      asc = !asc;
    });
  });
})();
</script>`;
}
|
|
3285
|
+
|
|
3286
|
+
// src/ci.ts
|
|
3287
|
+
var import_node_fs = require("fs");
|
|
3288
|
+
var import_node_path = require("path");
|
|
3289
|
+
// Scorers for which a smaller value is the better outcome.
var LOWER_IS_BETTER = new Set(["cost"]);

// Coefficient-of-variation cutoff above which a multi-run result is
// reported as flaky.
var FLAKY_CV_THRESHOLD = 0.3;

// Two-sided 95% critical values of Student's t distribution, keyed by
// degrees of freedom. Only selected rows are tabulated; tCritical()
// interpolates between them.
var T_CRITICAL_95 = {
  1: 12.706, 2: 4.303, 3: 3.182, 4: 2.776, 5: 2.571,
  6: 2.447, 7: 2.365, 8: 2.306, 9: 2.262, 10: 2.228,
  15: 2.131, 20: 2.086, 25: 2.06, 30: 2.042
};

// Tabulated degrees of freedom as numbers, in ascending order.
var T_CRITICAL_KEYS = Array.from(Object.keys(T_CRITICAL_95), Number).sort(
  (lhs, rhs) => lhs - rhs
);
|
|
3308
|
+
/**
 * Looks up the 95% t critical value for the given degrees of freedom.
 *
 * Tabulated values are returned as-is; values between two tabulated rows
 * are linearly interpolated. For non-positive `df`, `df` above the largest
 * tabulated row, or anything that falls in no tabulated interval, the
 * normal-approximation value 1.96 is returned.
 *
 * @param {number} df - Degrees of freedom.
 * @returns {number} The critical value.
 */
function tCritical(df) {
  const NORMAL_APPROXIMATION = 1.96;
  if (df <= 0) return NORMAL_APPROXIMATION;

  const tabulated = T_CRITICAL_95[df];
  if (tabulated !== undefined) return tabulated;

  const knots = T_CRITICAL_KEYS;
  if (df > knots[knots.length - 1]) return NORMAL_APPROXIMATION;

  // Find the first tabulated row strictly above df whose predecessor is
  // strictly below it.
  const upperPos = knots.findIndex(
    (knot, pos) => pos > 0 && df > knots[pos - 1] && df < knot
  );
  if (upperPos === -1) return NORMAL_APPROXIMATION;

  const low = knots[upperPos - 1];
  const high = knots[upperPos];
  const ratio = (df - low) / (high - low);
  return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
}
|
|
3322
|
+
/**
 * Summarises a list of score samples.
 *
 * Computes the mean, the sample standard deviation (n − 1 denominator), the
 * coefficient of variation (0 when the mean is 0) and a 95% confidence
 * interval for the mean using `tCritical(n - 1)`. An empty list yields all
 * zeros; a single sample yields zero spread and an interval collapsed onto
 * the mean.
 *
 * @param {number[]} samples - Score values.
 * @returns {{mean: number, stddev: number, cv: number, n: number,
 *   ci95Lower: number, ci95Upper: number}}
 */
function computeScorerStats(samples) {
  const n = samples.length;
  if (n === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  let total = 0;
  for (const sample of samples) total += sample;
  const mean = total / n;
  if (n === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  let squaredDeviations = 0;
  for (const sample of samples) squaredDeviations += (sample - mean) ** 2;
  const stddev = Math.sqrt(squaredDeviations / (n - 1));
  const cv = mean === 0 ? 0 : stddev / Math.abs(mean);

  const standardError = stddev / Math.sqrt(n);
  const halfWidth = tCritical(n - 1) * standardError;
  return {
    mean,
    stddev,
    cv,
    n,
    ci95Lower: mean - halfWidth,
    ci95Upper: mean + halfWidth
  };
}
|
|
3345
|
+
/**
 * Builds the composite map key identifying one provider/task/scorer
 * combination: the three parts joined with "::".
 *
 * @param {string} providerId
 * @param {string} taskName
 * @param {string} scorerName
 * @returns {string} `providerId::taskName::scorerName`
 */
function groupKey(providerId, taskName, scorerName) {
  const separator = "::";
  return `${providerId}${separator}${taskName}${separator}${scorerName}`;
}
|
|
3348
|
+
/**
 * Aggregates benchmark results into per-group statistics.
 *
 * Results with an error are ignored, as are individual scores with a
 * negative value. Remaining score values are bucketed by
 * provider/task/scorer (see `groupKey`) and each bucket is summarised with
 * `computeScorerStats`.
 *
 * @param {Iterable<object>} results - Benchmark results (reads `error`,
 *   `providerId`, `taskName` and `scores[].name/value`).
 * @returns {Map<string, object>} Stats keyed by group key, in first-seen order.
 */
function computeStats(results) {
  const samplesByKey = new Map();
  for (const result of results) {
    if (result.error) continue;
    for (const { name, value } of result.scores) {
      if (value < 0) continue;
      const key = groupKey(result.providerId, result.taskName, name);
      const bucket = samplesByKey.get(key);
      if (bucket) {
        bucket.push(value);
      } else {
        samplesByKey.set(key, [value]);
      }
    }
  }
  return new Map(
    Array.from(samplesByKey, ([key, samples]) => [key, computeScorerStats(samples)])
  );
}
|
|
3365
|
+
/**
 * Totals the estimated spend of a run and checks it against a budget.
 *
 * Only successful results with a non-negative "cost" score contribute, and
 * only when that score's `details.estimatedUsd` is greater than zero.
 *
 * @param {Iterable<object>} results - Benchmark results.
 * @param {number} [budget] - Maximum allowed total in USD.
 * @returns {{totalUsd: number, perProvider: Map<string, number>,
 *   budget: (number|undefined), overBudget: boolean}}
 *   `overBudget` is true only when a budget is given and the total exceeds it.
 */
function computeCostSummary(results, budget) {
  const perProvider = new Map();
  let totalUsd = 0;

  for (const result of results) {
    if (result.error) continue;

    const costScore = result.scores.find((score) => score.name === "cost");
    if (!costScore || costScore.value < 0) continue;

    const usd = costScore.details?.estimatedUsd ?? 0;
    if (usd <= 0) continue;

    totalUsd += usd;
    const spentSoFar = perProvider.get(result.providerId) ?? 0;
    perProvider.set(result.providerId, spentSoFar + usd);
  }

  const overBudget = budget !== undefined && totalUsd > budget;
  return { totalUsd, perProvider, budget, overBudget };
}
|
|
3385
|
+
/**
 * Compares the current run's statistics against a baseline and decides
 * whether the run should fail.
 *
 * For every group in `currentStats` a comparison entry is produced. When a
 * baseline exists for the same key, `delta` is the difference of means and,
 * if a threshold is configured for the scorer, regression/improvement are
 * evaluated via `detectRegression` / `detectImprovement`. A group is flagged
 * flaky when it has more than one sample and its coefficient of variation
 * exceeds `FLAKY_CV_THRESHOLD`.
 *
 * The run fails when any group regressed or the total cost exceeds `budget`.
 *
 * @param {Map<string, object>|null|undefined} baselineStats - Baseline stats
 *   keyed like `currentStats`, or nothing when no baseline is available.
 * @param {Map<string, object>} currentStats - Current stats keyed by
 *   "providerId::taskName::scorerName".
 * @param {Map<string, number>} thresholds - Allowed change per scorer name.
 * @param {number} [budget] - Maximum allowed total cost in USD.
 * @param {Array<object>} [currentResults] - Raw results used for the cost
 *   summary; treated as empty when omitted.
 * @returns {{comparisons: object[], cost: object, failed: boolean,
 *   flakyResults: object[], failureReasons: string[]}}
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  const SEPARATOR = "::";
  // Task names are free text and may themselves contain "::". Take the
  // provider from before the first separator and the scorer from after the
  // last one so the task name keeps everything in between. Keys with fewer
  // than two distinct separators fall back to a plain split.
  const parseKey = (key) => {
    const first = key.indexOf(SEPARATOR);
    const last = key.lastIndexOf(SEPARATOR);
    if (first === -1 || last < first + SEPARATOR.length) return key.split(SEPARATOR);
    return [
      key.slice(0, first),
      key.slice(first + SEPARATOR.length, last),
      key.slice(last + SEPARATOR.length)
    ];
  };

  const comparisons = [];
  for (const [key, current] of currentStats) {
    const [providerId, taskName, scorerName] = parseKey(key);
    const baseline = baselineStats?.get(key) ?? null;
    let delta = null;
    let regressed = false;
    let improved = false;
    if (baseline) {
      delta = current.mean - baseline.mean;
      const threshold = thresholds.get(scorerName);
      if (threshold !== undefined) {
        const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
        regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
        improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
      }
    }
    // A single sample has no spread, so it can never be called flaky.
    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
    comparisons.push({
      providerId,
      taskName,
      scorerName,
      baseline,
      current,
      delta,
      regressed,
      improved,
      flaky
    });
  }

  const cost = computeCostSummary(currentResults ?? [], budget);

  const failureReasons = comparisons
    .filter((c) => c.regressed)
    .map((r) => `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta(r.delta)}`);
  if (cost.overBudget) {
    // overBudget is only ever true when a budget was supplied.
    failureReasons.push(
      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
    );
  }

  const flakyResults = comparisons.filter((c) => c.flaky);
  const failed = failureReasons.length > 0;
  return { comparisons, cost, failed, flakyResults, failureReasons };
}
|
|
3434
|
+
/**
 * Decides whether `current` is worse than `baseline` by more than `threshold`.
 *
 * With exactly one sample on both sides the means are compared directly.
 * Otherwise the interval bounds are used: a regression is reported only when
 * the two intervals are separated, in the "worse" direction, by more than
 * `threshold`. This is the mirror image of detectImprovement, so a pair of
 * stats can never be both regressed and improved, and heavily overlapping
 * intervals are not flagged just because the mean moved slightly.
 *
 * @param {{ n: number, mean: number, ci95Lower: number, ci95Upper: number }} baseline
 * @param {{ n: number, mean: number, ci95Lower: number, ci95Upper: number }} current
 * @param {number} threshold - Minimum change that counts.
 * @param {boolean} lowerIsBetter - True when smaller values are better for this scorer.
 * @returns {boolean} True when a regression is detected.
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  if (baseline.n === 1 && current.n === 1) {
    const delta = current.mean - baseline.mean;
    return lowerIsBetter ? delta > threshold : delta < -threshold;
  }
  if (lowerIsBetter) {
    // Worse means higher: current's lower bound must clear baseline's upper bound.
    return current.ci95Lower - baseline.ci95Upper > threshold;
  }
  // Worse means lower: current's upper bound must fall below baseline's lower
  // bound. Comparing baseline's upper bound with current's lower bound instead
  // would measure the widest possible gap and flag overlapping intervals.
  return baseline.ci95Lower - current.ci95Upper > threshold;
}
|
|
3445
|
+
/**
 * Decides whether `current` is better than `baseline` by more than `threshold`.
 *
 * With exactly one sample on both sides the means are compared directly;
 * otherwise the gap between the interval bounds, measured in the "better"
 * direction, has to exceed `threshold`.
 *
 * @param {{ n: number, mean: number, ci95Lower: number, ci95Upper: number }} baseline
 * @param {{ n: number, mean: number, ci95Lower: number, ci95Upper: number }} current
 * @param {number} threshold - Minimum change that counts.
 * @param {boolean} lowerIsBetter - True when smaller values are better for this scorer.
 * @returns {boolean} True when an improvement is detected.
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  const singleSample = baseline.n === 1 && current.n === 1;
  if (singleSample) {
    const change = current.mean - baseline.mean;
    return lowerIsBetter ? change < -threshold : change > threshold;
  }
  // Distance between the two intervals, positive when current sits on the better side.
  const gap = lowerIsBetter
    ? baseline.ci95Lower - current.ci95Upper
    : current.ci95Lower - baseline.ci95Upper;
  return gap > threshold;
}
|
|
3456
|
+
/**
 * Reads a baseline file from disk.
 *
 * The file may contain either an object with a `results` array (and optionally
 * a `timestamp`) or a bare array of results. Any failure (unreadable file,
 * invalid JSON, unexpected shape) yields `null` rather than an exception.
 *
 * @param {string} path - Location of the JSON baseline file.
 * @returns {{ timestamp: *, results: Array }|null} The baseline, or null when unavailable.
 */
function loadBaseline(path) {
  try {
    const parsed = JSON.parse((0, import_node_fs.readFileSync)(path, "utf-8"));
    // Fall back to the document itself so a bare array is accepted too.
    const results = parsed.results ?? parsed;
    return Array.isArray(results)
      ? { timestamp: parsed.timestamp ?? "unknown", results }
      : null;
  } catch {
    // A missing or malformed baseline is reported as "no baseline".
    return null;
  }
}
|
|
3470
|
+
/**
 * Writes results to a baseline file, creating parent directories as needed.
 *
 * The file holds a JSON object with the current time as an ISO `timestamp`
 * followed by `results`, pretty-printed with two-space indentation.
 *
 * @param {string} path - Destination file.
 * @param {*} results - Value stored under the `results` key.
 * @returns {void}
 */
function saveBaseline(path, results) {
  const parentDir = (0, import_node_path.dirname)(path);
  (0, import_node_fs.mkdirSync)(parentDir, { recursive: true });
  const payload = JSON.stringify(
    { timestamp: (/* @__PURE__ */ new Date()).toISOString(), results },
    null,
    2
  );
  (0, import_node_fs.writeFileSync)(path, payload);
}
|
|
3478
|
+
|
|
3479
|
+
// src/github.ts
|
|
3480
|
+
var import_node_fs2 = require("fs");
|
|
3481
|
+
/**
 * Derives the pull-request context from environment variables.
 *
 * Reads GITHUB_TOKEN and GITHUB_REPOSITORY ("owner/repo"); both are required.
 * The PR number is taken, in order, from the event file at GITHUB_EVENT_PATH
 * (`pull_request.number`, or `issue.number` when the issue carries a
 * `pull_request` field) and then from DUELIST_PR_NUMBER.
 *
 * @returns {{ token: string, owner: string, repo: string, prNumber: * }|null}
 *   The context, or null when the token, repository or PR number is unavailable.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath
  } = process.env;
  if (!token || !repository) return null;
  const [owner, repo] = repository.split("/");
  if (!owner || !repo) return null;
  let prNumber;
  if (eventPath) {
    try {
      const event = JSON.parse((0, import_node_fs2.readFileSync)(eventPath, "utf-8"));
      const pullRequest = event.pull_request;
      if (pullRequest && typeof pullRequest === "object") {
        prNumber = pullRequest.number;
      }
      if (!prNumber) {
        const issue = event.issue;
        // Only issues that carry a `pull_request` field are treated as PRs.
        if (issue && typeof issue === "object" && issue.pull_request) {
          prNumber = issue.number;
        }
      }
    } catch {
      // Unreadable or malformed event file: fall through to the env override.
    }
  }
  const override = process.env.DUELIST_PR_NUMBER;
  if (!prNumber && override) {
    prNumber = parseInt(override, 10);
  }
  // Also covers a NaN produced by a non-numeric override.
  return prNumber ? { token, owner, repo, prNumber } : null;
}
|
|
3511
|
+
// Root URL used to build every GitHub REST request in this module.
var API_BASE = "https://api.github.com";

/**
 * Builds the request headers for a GitHub API call.
 *
 * @param {string} token - Sent as a Bearer token in the Authorization header.
 * @param {object} [extra] - Additional headers; entries here override the defaults.
 * @returns {object} A new headers object.
 */
function ghHeaders(token, extra) {
  const defaults = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };
  return { ...defaults, ...extra };
}
|
|
3520
|
+
/**
 * Looks through the comments of the PR for one whose body contains `marker`.
 *
 * Comments are fetched 50 per page, page by page, until a match is found, a
 * page comes back shorter than the page size, or a request fails.
 *
 * @param {{ token: string, owner: string, repo: string, prNumber: * }} ctx - Request context.
 * @param {string} marker - Text to search for in each comment body.
 * @returns {Promise<*>} The `id` of the first matching comment, or null when
 *   none is found or a request returns a non-OK status.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  for (let page = 1; ; page++) {
    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
    const res = await fetch(url, { headers: ghHeaders(ctx.token) });
    if (!res.ok) return null;
    const comments = await res.json();
    const match = comments.find((comment) => comment.body?.includes(marker));
    if (match) return match.id;
    // A short (or empty) page means there is nothing further to fetch.
    if (comments.length < perPage) return null;
  }
}
|
|
3539
|
+
/**
 * Creates the PR comment, or updates it in place when a comment containing
 * `marker` already exists.
 *
 * A non-OK response is reported with console.warn; nothing is thrown for it.
 *
 * @param {{ token: string, owner: string, repo: string, prNumber: * }} ctx - Request context.
 * @param {string} body - Comment text to publish.
 * @param {string} marker - Text used to recognise a previously published comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const existingId = await findExistingComment(ctx, marker);
  const repoUrl = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}`;
  // Pick the request shape once; everything after this is shared by both paths.
  const request = existingId
    ? {
        method: "PATCH",
        url: `${repoUrl}/issues/comments/${existingId}`,
        describeFailure: (status, text) => `Failed to update PR comment: ${status} ${text}`
      }
    : {
        method: "POST",
        url: `${repoUrl}/issues/${ctx.prNumber}/comments`,
        describeFailure: (status, text) => `Failed to create PR comment: ${status} ${text}`
      };
  const res = await fetch(request.url, {
    method: request.method,
    headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
    body: JSON.stringify({ body })
  });
  if (res.ok) return;
  const text = await res.text();
  console.warn(request.describeFailure(res.status, text));
}
|
|
2195
3565
|
// Annotate the CommonJS export names for ESM import in node:
|
|
2196
3566
|
0 && (module.exports = {
|
|
2197
3567
|
anthropic,
|
|
2198
3568
|
azureOpenai,
|
|
3569
|
+
compareResults,
|
|
3570
|
+
computeStats,
|
|
2199
3571
|
consoleReporter,
|
|
2200
3572
|
defineArena,
|
|
3573
|
+
detectGitHubContext,
|
|
2201
3574
|
gemini,
|
|
3575
|
+
htmlReporter,
|
|
2202
3576
|
jsonReporter,
|
|
3577
|
+
loadBaseline,
|
|
3578
|
+
markdownReporter,
|
|
2203
3579
|
openai,
|
|
2204
3580
|
openaiCompatible,
|
|
2205
|
-
registerPricing
|
|
3581
|
+
registerPricing,
|
|
3582
|
+
saveBaseline,
|
|
3583
|
+
upsertPrComment
|
|
2206
3584
|
});
|
|
2207
3585
|
//# sourceMappingURL=index.cjs.map
|