agent-duelist 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -59
- package/dist/cli.js +1793 -394
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1774 -396
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +73 -8
- package/dist/index.d.ts +73 -8
- package/dist/index.js +1765 -395
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/templates/arena.config.ts +5 -5
package/dist/index.js
CHANGED
|
@@ -1410,11 +1410,13 @@ var fuzzySimilarityScorer = ({ task, result }) => {
|
|
|
1410
1410
|
}
|
|
1411
1411
|
const a = stringify(task.expected);
|
|
1412
1412
|
const b = stringify(result.output);
|
|
1413
|
-
const
|
|
1413
|
+
const setA = tokenize(a);
|
|
1414
|
+
const setB = tokenize(b);
|
|
1415
|
+
const similarity = jaccardSimilarity(setA, setB);
|
|
1414
1416
|
return {
|
|
1415
1417
|
name: "fuzzy-similarity",
|
|
1416
1418
|
value: Math.round(similarity * 100) / 100,
|
|
1417
|
-
details: { method: "jaccard", expectedTokens:
|
|
1419
|
+
details: { method: "jaccard", expectedTokens: setA.size, actualTokens: setB.size }
|
|
1418
1420
|
};
|
|
1419
1421
|
};
|
|
1420
1422
|
function stringify(value) {
|
|
@@ -1435,7 +1437,163 @@ function jaccardSimilarity(a, b) {
|
|
|
1435
1437
|
}
|
|
1436
1438
|
|
|
1437
1439
|
// src/scorers/llm-judge.ts
|
|
1440
|
+
import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
|
|
1441
|
+
|
|
1442
|
+
// src/providers/openai.ts
|
|
1438
1443
|
import OpenAI, { AzureOpenAI } from "openai";
|
|
1444
|
+
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
1445
|
+
|
|
1446
|
+
// src/providers/shared.ts
|
|
1447
|
+
var SCHEMA_SYSTEM_MESSAGE = "Respond with valid JSON matching the requested schema.";
|
|
1448
|
+
/**
 * Turn a model's raw text reply into structured output when a schema was requested.
 *
 * @param {string} rawContent - Text returned by the model.
 * @param {boolean} hasSchema - Whether the caller asked for schema-shaped (JSON) output.
 * @returns {*} The parsed JSON value when `hasSchema` is set and the text parses;
 *   otherwise `rawContent` unchanged.
 */
function parseSchemaOutput(rawContent, hasSchema) {
  if (hasSchema) {
    try {
      return JSON.parse(rawContent);
    } catch {
      // Best effort: unparseable text falls through and is returned as-is.
    }
  }
  return rawContent;
}
|
|
1456
|
+
|
|
1457
|
+
// src/providers/openai.ts
|
|
1458
|
+
var REQUEST_TIMEOUT_MS = 6e4;
|
|
1459
|
+
/**
 * Create a provider for an OpenAI model.
 *
 * @param {string} model - Model name; also used as the request model.
 * @param {object} [options]
 * @param {string} [options.apiKey] - Falls back to the OPENAI_API_KEY environment variable.
 * @param {string} [options.baseURL] - Passed through to the client unchanged.
 * @param {number} [options.timeoutMs] - Falls back to REQUEST_TIMEOUT_MS.
 * @returns {object} The provider built by `makeProvider`, with id `openai/<model>`.
 */
function openai(model, options) {
  const apiKey = options?.apiKey ?? process.env.OPENAI_API_KEY;
  const timeout = options?.timeoutMs ?? REQUEST_TIMEOUT_MS;
  const client = new OpenAI({ apiKey, baseURL: options?.baseURL, timeout });
  return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
}
|
|
1467
|
+
/**
 * Create a provider for any endpoint that speaks the OpenAI chat-completions protocol.
 *
 * The API key is taken from `options.apiKey`, then from the environment variable
 * named by `options.apiKeyEnv`, and finally defaults to the literal "no-key".
 * When `options.free` is truthy, zero-cost pricing is registered for `options.id`.
 *
 * @param {object} options - id, name, model, baseURL, plus optional apiKey,
 *   apiKeyEnv, timeoutMs, free and stripThinking.
 * @returns {object} The provider built by `makeProvider`.
 */
function openaiCompatible(options) {
  const { id, name, model, baseURL, stripThinking } = options;
  const client = new OpenAI({
    apiKey: options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : undefined) ?? "no-key",
    baseURL,
    timeout: options.timeoutMs ?? REQUEST_TIMEOUT_MS
  });
  if (options.free) {
    const zeroCost = { inputPerToken: 0, outputPerToken: 0 };
    registerPricing(id, zeroCost);
  }
  return makeProvider(id, name, model, client, model, stripThinking);
}
|
|
1479
|
+
/**
 * Create a provider for an Azure OpenAI deployment.
 *
 * Each client setting comes from `options` first and then from the matching
 * AZURE_OPENAI_* environment variable; the API version additionally defaults to
 * "2024-12-01-preview". The deployment name defaults to `model` and is what is
 * sent as the request model.
 *
 * @param {string} model - Model name used in the provider id (`azure/<model>`).
 * @param {object} [options] - Optional apiKey, endpoint, apiVersion, deployment, timeoutMs.
 * @returns {object} The provider built by `makeProvider`.
 */
function azureOpenai(model, options) {
  const env = process.env;
  const deployment = options?.deployment ?? model;
  const clientConfig = {
    apiKey: options?.apiKey ?? env.AZURE_OPENAI_API_KEY,
    endpoint: options?.endpoint ?? env.AZURE_OPENAI_ENDPOINT,
    apiVersion: options?.apiVersion ?? env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
    deployment,
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const client = new AzureOpenAI(clientConfig);
  return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
}
|
|
1490
|
+
/**
 * Build a provider object on top of a chat-completions client.
 *
 * The returned object's `run(input)` sends `input.prompt` as a single user message.
 * When `input.schema` is set it requests JSON-object output and prepends the schema
 * system message. When `input.tools` is non-empty it advertises them, executes any
 * tool calls the model makes (via each tool's `handler`, when present), and sends
 * one follow-up request containing the tool results.
 *
 * @param {string} id - Provider identifier exposed on the returned object.
 * @param {string} name - Display name exposed on the returned object.
 * @param {string} model - Model name exposed on the returned object.
 * @param {object} client - Object exposing `chat.completions.create(params, options)`.
 * @param {string} requestModel - Value sent as `model` in every request.
 * @param {boolean} [stripThinking] - When truthy, `<think>…</think>` sections are
 *   removed from the final text before it is parsed.
 * @returns {{id: string, name: string, model: string, run: Function}}
 */
function makeProvider(id, name, model, client, requestModel, stripThinking) {
  return {
    id,
    name,
    model,
    /**
     * Execute one prompt.
     *
     * @param {object} input - `prompt`, plus optional `schema`, `tools` and `signal`.
     * @returns {Promise<object>} `output`, `usage`, `latencyMs`, `raw` (the last
     *   response received) and `toolCalls` (undefined when no tool was called).
     */
    async run(input) {
      const start = Date.now();
      const params = {
        model: requestModel,
        messages: [{ role: "user", content: input.prompt }]
      };
      if (input.schema) {
        params.response_format = { type: "json_object" };
        params.messages = [
          { role: "system", content: SCHEMA_SYSTEM_MESSAGE },
          ...params.messages
        ];
      }
      if (input.tools?.length) {
        params.tools = input.tools.map(toolDefToOpenAI);
        params.tool_choice = "auto";
      }
      const response = await client.chat.completions.create(params, { signal: input.signal });
      let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
      let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
      const choice = response.choices[0];
      const toolCallsRaw = choice?.message?.tool_calls;
      const collectedToolCalls = [];
      let finalResponse = response;
      if (toolCallsRaw?.length && input.tools?.length) {
        // Conversation so far, followed by the assistant message that requested the tools.
        const toolMessages = [...params.messages, choice.message];
        for (const tc of toolCallsRaw) {
          const toolDef = input.tools.find((t) => t.name === tc.function.name);
          let args;
          try {
            args = JSON.parse(tc.function.arguments);
          } catch {
            // Malformed argument JSON: hand the raw string to the handler instead.
            args = tc.function.arguments;
          }
          let result;
          if (toolDef?.handler) {
            result = await toolDef.handler(args);
          }
          collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
          toolMessages.push({
            role: "tool",
            tool_call_id: tc.id,
            content: JSON.stringify(result ?? {})
          });
        }
        const followUpParams = { model: requestModel, messages: toolMessages };
        // Keep JSON mode on the final answer; without this a schema task that
        // used tools would get a free-form reply.
        if (params.response_format) {
          followUpParams.response_format = params.response_format;
        }
        const followUp = await client.chat.completions.create(followUpParams, { signal: input.signal });
        totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
        totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
        finalResponse = followUp;
      }
      const latencyMs = Date.now() - start;
      const finalChoice = finalResponse.choices[0];
      let rawContent = finalChoice?.message?.content ?? "";
      if (stripThinking) {
        // Global flag: remove every <think> section, not only the first one.
        rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/g, "");
      }
      const output = parseSchemaOutput(rawContent, !!input.schema);
      return {
        output,
        usage: {
          // A total of 0 is reported as undefined rather than as a count.
          promptTokens: totalPromptTokens || void 0,
          completionTokens: totalCompletionTokens || void 0
        },
        latencyMs,
        raw: finalResponse,
        toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
      };
    }
  };
}
|
|
1571
|
+
/**
 * Create a provider for a Google model, reached through Google's
 * OpenAI-compatible endpoint.
 *
 * @param {string} model - Model name; also used as the request model.
 * @param {object} [options]
 * @param {string} [options.apiKey] - Falls back to the GOOGLE_API_KEY environment variable.
 * @param {number} [options.timeoutMs] - Falls back to REQUEST_TIMEOUT_MS.
 * @returns {object} The provider built by `makeProvider`, with id `google/<model>`.
 * @throws {Error} When no API key is available from either source.
 */
function gemini(model, options) {
  const providerId = `google/${model}`;
  const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
  if (apiKey) {
    const client = new OpenAI({
      apiKey,
      baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
      timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
    });
    return makeProvider(providerId, "Google AI", model, client, model);
  }
  throw new Error(
    `Missing API key for ${providerId}. Set GOOGLE_API_KEY or pass apiKey option.`
  );
}
|
|
1585
|
+
/**
 * Convert a tool definition into the `{ type: "function", function: {...} }`
 * shape placed in a chat-completions `tools` array.
 *
 * @param {object} tool - Has `name`, `description` and `parameters`; the latter is
 *   passed to `zodToJsonSchema` with the "openAi" target.
 * @returns {object} The function-tool descriptor.
 */
function toolDefToOpenAI(tool) {
  const { name, description } = tool;
  const parameters = zodToJsonSchema(tool.parameters, { target: "openAi" });
  return {
    type: "function",
    function: { name, description, parameters }
  };
}
|
|
1595
|
+
|
|
1596
|
+
// src/scorers/llm-judge.ts
|
|
1439
1597
|
var JUDGE_PROMPT = `You are a strict scoring judge. Evaluate the actual output against the expected output on three criteria. Score each from 0.0 to 1.0 using the full range (not just 0, 0.5, 1).
|
|
1440
1598
|
|
|
1441
1599
|
Criteria:
|
|
@@ -1451,40 +1609,42 @@ conciseness: <number>
|
|
|
1451
1609
|
Task: {task}
|
|
1452
1610
|
Expected: {expected}
|
|
1453
1611
|
Actual: {actual}`;
|
|
1454
|
-
function resolveJudgeClient(configModel) {
|
|
1455
|
-
const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-
|
|
1612
|
+
function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1613
|
+
const model = configModel ?? process.env.DUELIST_JUDGE_MODEL ?? "gpt-5-mini";
|
|
1456
1614
|
if (model.startsWith("gemini") && process.env.GOOGLE_API_KEY) {
|
|
1457
1615
|
return {
|
|
1458
|
-
client: new
|
|
1616
|
+
client: new OpenAI2({
|
|
1459
1617
|
apiKey: process.env.GOOGLE_API_KEY,
|
|
1460
|
-
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/"
|
|
1618
|
+
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
|
|
1619
|
+
timeout: timeoutMs
|
|
1461
1620
|
}),
|
|
1462
1621
|
model
|
|
1463
1622
|
};
|
|
1464
1623
|
}
|
|
1465
1624
|
if (!process.env.OPENAI_API_KEY && process.env.AZURE_OPENAI_API_KEY) {
|
|
1466
1625
|
return {
|
|
1467
|
-
client: new
|
|
1626
|
+
client: new AzureOpenAI2({
|
|
1468
1627
|
apiKey: process.env.AZURE_OPENAI_API_KEY,
|
|
1469
1628
|
endpoint: process.env.AZURE_OPENAI_ENDPOINT,
|
|
1470
1629
|
apiVersion: process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
|
|
1471
|
-
deployment: model
|
|
1630
|
+
deployment: model,
|
|
1631
|
+
timeout: timeoutMs
|
|
1472
1632
|
}),
|
|
1473
1633
|
model
|
|
1474
1634
|
};
|
|
1475
1635
|
}
|
|
1476
1636
|
const apiKey = process.env.OPENAI_API_KEY;
|
|
1477
1637
|
if (!apiKey) return void 0;
|
|
1478
|
-
return { client: new
|
|
1638
|
+
return { client: new OpenAI2({ apiKey, timeout: timeoutMs }), model };
|
|
1479
1639
|
}
|
|
1480
|
-
function createLlmJudgeScorer(judgeModel) {
|
|
1640
|
+
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1481
1641
|
let cached = void 0;
|
|
1482
1642
|
return async ({ task, result }) => {
|
|
1483
1643
|
if (task.expected === void 0) {
|
|
1484
1644
|
return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
|
|
1485
1645
|
}
|
|
1486
1646
|
if (cached === void 0) {
|
|
1487
|
-
cached = resolveJudgeClient(judgeModel) ?? null;
|
|
1647
|
+
cached = resolveJudgeClient(judgeModel, timeoutMs) ?? null;
|
|
1488
1648
|
}
|
|
1489
1649
|
if (!cached) {
|
|
1490
1650
|
return {
|
|
@@ -1499,8 +1659,7 @@ function createLlmJudgeScorer(judgeModel) {
|
|
|
1499
1659
|
const response = await client.chat.completions.create({
|
|
1500
1660
|
model,
|
|
1501
1661
|
messages: [{ role: "user", content: prompt }],
|
|
1502
|
-
|
|
1503
|
-
max_tokens: 2048
|
|
1662
|
+
max_completion_tokens: 2048
|
|
1504
1663
|
});
|
|
1505
1664
|
const content = response.choices[0]?.message?.content?.trim() ?? "";
|
|
1506
1665
|
const parsed = {};
|
|
@@ -1557,10 +1716,10 @@ var staticScorers = {
|
|
|
1557
1716
|
"fuzzy-similarity": fuzzySimilarityScorer,
|
|
1558
1717
|
"tool-usage": toolUsageScorer
|
|
1559
1718
|
};
|
|
1560
|
-
function resolveScorers(names, judgeModel) {
|
|
1719
|
+
function resolveScorers(names, judgeModel, timeoutMs) {
|
|
1561
1720
|
return names.map((name) => {
|
|
1562
1721
|
if (name === "llm-judge-correctness") {
|
|
1563
|
-
return createLlmJudgeScorer(judgeModel);
|
|
1722
|
+
return createLlmJudgeScorer(judgeModel, timeoutMs);
|
|
1564
1723
|
}
|
|
1565
1724
|
const scorer = staticScorers[name];
|
|
1566
1725
|
if (!scorer) {
|
|
@@ -1571,219 +1730,156 @@ function resolveScorers(names, judgeModel) {
|
|
|
1571
1730
|
}
|
|
1572
1731
|
|
|
1573
1732
|
// src/runner.ts
|
|
1733
|
+
var DEFAULT_TIMEOUT_MS = 6e4;
|
|
1734
|
+
/**
 * Run an abortable operation with a deadline.
 *
 * `run` receives an AbortSignal. If the deadline passes first, the signal is
 * aborted and the returned promise rejects with "Request timed out after <ms>ms".
 * Otherwise the returned promise settles with the outcome of `run`.
 *
 * @param {(signal: AbortSignal) => (Promise<*>|*)} run - Operation to execute.
 * @param {number} ms - Deadline in milliseconds.
 * @returns {Promise<*>} Settles with `run`'s result or rejects on error/timeout.
 */
function withTimeout(run, ms) {
  // setTimeout treats delays above this value as 1ms; clamp so that very large
  // deadlines (including Infinity) do not fire almost immediately.
  const MAX_TIMER_DELAY_MS = 2147483647;
  return new Promise((resolve, reject) => {
    const controller = new AbortController();
    const timer = setTimeout(() => {
      controller.abort();
      reject(new Error(`Request timed out after ${ms}ms`));
    }, Math.min(ms, MAX_TIMER_DELAY_MS));
    let pending;
    try {
      pending = run(controller.signal);
    } catch (err) {
      // A synchronous throw must not leave the timer armed.
      clearTimeout(timer);
      reject(err);
      return;
    }
    // Promise.resolve lets `run` return either a promise or a plain value.
    Promise.resolve(pending).then(
      (value) => {
        clearTimeout(timer);
        resolve(value);
      },
      (err) => {
        clearTimeout(timer);
        reject(err);
      }
    );
  });
}
|
|
1574
1753
|
async function runBenchmarks(options) {
|
|
1575
1754
|
const { providers, tasks, scorers, runs, onResult } = options;
|
|
1755
|
+
const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
|
|
1576
1756
|
const results = [];
|
|
1577
1757
|
for (const task of tasks) {
|
|
1578
|
-
for (
|
|
1579
|
-
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
|
|
1593
|
-
|
|
1594
|
-
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1758
|
+
for (let run = 1; run <= runs; run++) {
|
|
1759
|
+
const runResults = await Promise.all(
|
|
1760
|
+
providers.map(async (provider) => {
|
|
1761
|
+
let result;
|
|
1762
|
+
try {
|
|
1763
|
+
const taskResult = await withTimeout((signal) => provider.run({
|
|
1764
|
+
prompt: task.prompt,
|
|
1765
|
+
schema: task.schema,
|
|
1766
|
+
tools: task.tools,
|
|
1767
|
+
signal
|
|
1768
|
+
}), timeout);
|
|
1769
|
+
const scores = await Promise.all(
|
|
1770
|
+
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
1771
|
+
);
|
|
1772
|
+
result = {
|
|
1773
|
+
providerId: provider.id,
|
|
1774
|
+
taskName: task.name,
|
|
1775
|
+
run,
|
|
1776
|
+
scores,
|
|
1777
|
+
raw: {
|
|
1778
|
+
output: taskResult.output,
|
|
1779
|
+
latencyMs: taskResult.latencyMs,
|
|
1780
|
+
usage: taskResult.usage,
|
|
1781
|
+
toolCalls: taskResult.toolCalls
|
|
1782
|
+
}
|
|
1783
|
+
};
|
|
1784
|
+
} catch (err) {
|
|
1785
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1786
|
+
result = {
|
|
1787
|
+
providerId: provider.id,
|
|
1788
|
+
taskName: task.name,
|
|
1789
|
+
run,
|
|
1790
|
+
scores: [],
|
|
1791
|
+
error: message,
|
|
1792
|
+
raw: { output: "", latencyMs: 0 }
|
|
1793
|
+
};
|
|
1794
|
+
}
|
|
1795
|
+
onResult?.(result);
|
|
1796
|
+
return result;
|
|
1797
|
+
})
|
|
1798
|
+
);
|
|
1799
|
+
results.push(...runResults);
|
|
1616
1800
|
}
|
|
1617
1801
|
}
|
|
1618
1802
|
return results;
|
|
1619
1803
|
}
|
|
1620
1804
|
|
|
1621
|
-
// src/
|
|
1622
|
-
var
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
}
|
|
1632
|
-
function dim(s) {
|
|
1633
|
-
return `${dimCode}${s}${reset}`;
|
|
1805
|
+
// src/utils/format.ts
|
|
1806
|
+
var MAX_FRACTION_DIGITS = 100;
|
|
1807
|
+
/**
 * Format a USD amount for display.
 *
 * - `undefined` renders as an em dash, exactly `0` as "$0.00".
 * - Amounts of at least 0.01 are shown with two decimals, prefixed "~$".
 * - Smaller amounts get enough decimals to show their leading digits (at least 4,
 *   at most MAX_FRACTION_DIGITS), with trailing zeros removed.
 *
 * @param {number|undefined} usd - Amount in US dollars.
 * @returns {string} The formatted amount.
 */
function formatCost(usd) {
  if (usd === undefined) {
    return "\u2014";
  }
  if (usd === 0) {
    return "$0.00";
  }
  if (usd >= 0.01) {
    return `~$${usd.toFixed(2)}`;
  }
  // Position of the first significant digit after the decimal point, plus one extra digit.
  const needed = -Math.floor(Math.log10(Math.abs(usd))) + 1;
  const digits = Math.min(MAX_FRACTION_DIGITS, Math.max(4, needed));
  const trimmed = usd.toFixed(digits).replace(/0+$/, "");
  return `~$${trimmed}`;
}
|
|
1635
|
-
function
|
|
1636
|
-
const
|
|
1637
|
-
|
|
1638
|
-
if (value >= 0.8) return `${green}${str}${reset}`;
|
|
1639
|
-
if (value >= 0.5) return `${yellow}${str}${reset}`;
|
|
1640
|
-
return `${red}${str}${reset}`;
|
|
1817
|
+
/**
 * Format a numeric difference with an explicit sign.
 *
 * @param {number} delta - The difference to format.
 * @param {number} [precision=4] - Number of decimals passed to `toFixed`.
 * @returns {string} Fixed-point text, prefixed with "+" when `delta >= 0`.
 */
function formatDelta(delta, precision = 4) {
  const fixed = delta.toFixed(precision);
  // Negative values already carry their own "-" from toFixed.
  return delta >= 0 ? `+${fixed}` : `${fixed}`;
}
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
const
|
|
1648
|
-
const
|
|
1649
|
-
const
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
const
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1663
|
-
else if (name === "cost") {
|
|
1664
|
-
cols.push({ label: "Cost", width: 12, align: "right" });
|
|
1665
|
-
cols.push({ label: "Tokens", width: 9, align: "right" });
|
|
1666
|
-
} else if (name === "correctness") cols.push({ label: "Match", width: 8, align: "right" });
|
|
1667
|
-
else if (name === "schema-correctness") cols.push({ label: "Schema", width: 8, align: "right" });
|
|
1668
|
-
else if (name === "fuzzy-similarity") cols.push({ label: "Fuzzy", width: 8, align: "right" });
|
|
1669
|
-
else if (name === "llm-judge-correctness") cols.push({ label: "Judge", width: 8, align: "right" });
|
|
1670
|
-
else if (name === "tool-usage") cols.push({ label: "Tool", width: 8, align: "right" });
|
|
1671
|
-
else cols.push({ label: name, width: 10, align: "right" });
|
|
1672
|
-
}
|
|
1673
|
-
if (hasErrors) cols.push({ label: "Status", width: 8, align: "left" });
|
|
1674
|
-
const totalWidth = cols.reduce((sum, c) => sum + c.width + 2, 0);
|
|
1675
|
-
console.log(` ${dim(cols.map((c) => pad(c.label, c.width + 2, c.align)).join(""))}`);
|
|
1676
|
-
console.log(` ${dim("\u2500".repeat(totalWidth))}`);
|
|
1677
|
-
for (const provider of providers) {
|
|
1678
|
-
const taskResults = results.filter(
|
|
1679
|
-
(r) => r.taskName === task && r.providerId === provider
|
|
1680
|
-
);
|
|
1681
|
-
const errorResults2 = taskResults.filter((r) => r.error);
|
|
1682
|
-
const successResults = taskResults.filter((r) => !r.error);
|
|
1683
|
-
if (successResults.length === 0 && errorResults2.length > 0) {
|
|
1684
|
-
const cells2 = [pad(provider, 24, "left")];
|
|
1685
|
-
for (const name of scorerNames) {
|
|
1686
|
-
if (name === "cost") {
|
|
1687
|
-
cells2.push(pad("\u2014", 14, "right"));
|
|
1688
|
-
cells2.push(pad("\u2014", 11, "right"));
|
|
1689
|
-
} else cells2.push(pad("\u2014", cols.find((c) => c.label !== "Provider").width + 2, "right"));
|
|
1690
|
-
}
|
|
1691
|
-
if (hasErrors) cells2.push(` ${red}FAIL${reset}`);
|
|
1692
|
-
console.log(` ${cells2.join("")}`);
|
|
1693
|
-
continue;
|
|
1694
|
-
}
|
|
1695
|
-
const avgScores = averageScores(successResults);
|
|
1696
|
-
const avgDetails = averageDetails(successResults);
|
|
1697
|
-
const latencyMs = average(successResults.map((r) => r.raw.latencyMs));
|
|
1698
|
-
const cells = [pad(provider, 24, "left")];
|
|
1699
|
-
for (const name of scorerNames) {
|
|
1700
|
-
if (name === "latency") {
|
|
1701
|
-
cells.push(pad(latencyMs !== void 0 ? `${Math.round(latencyMs)}ms` : "\u2014", 12, "right"));
|
|
1702
|
-
} else if (name === "cost") {
|
|
1703
|
-
cells.push(pad(formatCost(avgDetails.costUsd), 14, "right"));
|
|
1704
|
-
cells.push(pad(avgDetails.totalTokens !== void 0 ? `${avgDetails.totalTokens}` : "\u2014", 11, "right"));
|
|
1705
|
-
} else {
|
|
1706
|
-
const val = avgScores[name];
|
|
1707
|
-
if (val === void 0) cells.push(pad("\u2014", 10, "right"));
|
|
1708
|
-
else cells.push(pad(colorScore(val), 10 + colorLen(colorScore(val)), "right"));
|
|
1709
|
-
}
|
|
1710
|
-
}
|
|
1711
|
-
if (hasErrors) {
|
|
1712
|
-
const failCount = errorResults2.length;
|
|
1713
|
-
cells.push(failCount > 0 ? ` ${yellow}${failCount} err${reset}` : ` ${green}OK${reset}`);
|
|
1714
|
-
}
|
|
1715
|
-
console.log(` ${cells.join("")}`);
|
|
1821
|
+
|
|
1822
|
+
// src/reporter/shared.ts
|
|
1823
|
+
function groupResults(results) {
|
|
1824
|
+
const taskSet = /* @__PURE__ */ new Set();
|
|
1825
|
+
const providerSet = /* @__PURE__ */ new Set();
|
|
1826
|
+
const scorerSet = /* @__PURE__ */ new Set();
|
|
1827
|
+
const grouped = /* @__PURE__ */ new Map();
|
|
1828
|
+
const byProvider = /* @__PURE__ */ new Map();
|
|
1829
|
+
let hasErrors = false;
|
|
1830
|
+
let maxRun = 0;
|
|
1831
|
+
for (const r of results) {
|
|
1832
|
+
taskSet.add(r.taskName);
|
|
1833
|
+
providerSet.add(r.providerId);
|
|
1834
|
+
for (const s of r.scores) scorerSet.add(s.name);
|
|
1835
|
+
if (r.error) hasErrors = true;
|
|
1836
|
+
if (r.run > maxRun) maxRun = r.run;
|
|
1837
|
+
const key = `${r.taskName}::${r.providerId}`;
|
|
1838
|
+
let group = grouped.get(key);
|
|
1839
|
+
if (!group) {
|
|
1840
|
+
group = [];
|
|
1841
|
+
grouped.set(key, group);
|
|
1716
1842
|
}
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
console.log(` ${bold("Errors")}`);
|
|
1723
|
-
console.log(` ${dim("\u2500".repeat(70))}`);
|
|
1724
|
-
const seen = /* @__PURE__ */ new Set();
|
|
1725
|
-
for (const r of errorResults) {
|
|
1726
|
-
const key = `${r.providerId}::${r.error}`;
|
|
1727
|
-
if (seen.has(key)) continue;
|
|
1728
|
-
seen.add(key);
|
|
1729
|
-
const count = errorResults.filter((e) => e.providerId === r.providerId && e.error === r.error).length;
|
|
1730
|
-
const suffix = count > 1 ? ` (\xD7${count})` : "";
|
|
1731
|
-
console.log(` ${red}\u2717${reset} ${r.providerId}: ${r.error}${suffix}`);
|
|
1732
|
-
const hint = apiKeyHint(r.providerId, r.error ?? "");
|
|
1733
|
-
if (hint) console.log(` ${dim(hint)}`);
|
|
1843
|
+
group.push(r);
|
|
1844
|
+
let provGroup = byProvider.get(r.providerId);
|
|
1845
|
+
if (!provGroup) {
|
|
1846
|
+
provGroup = [];
|
|
1847
|
+
byProvider.set(r.providerId, provGroup);
|
|
1734
1848
|
}
|
|
1735
|
-
|
|
1736
|
-
}
|
|
1737
|
-
if (hasCost) {
|
|
1738
|
-
console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
|
|
1739
|
-
console.log("");
|
|
1849
|
+
provGroup.push(r);
|
|
1740
1850
|
}
|
|
1851
|
+
return {
|
|
1852
|
+
tasks: [...taskSet],
|
|
1853
|
+
providers: [...providerSet],
|
|
1854
|
+
scorerNames: [...scorerSet],
|
|
1855
|
+
grouped,
|
|
1856
|
+
byProvider,
|
|
1857
|
+
hasErrors,
|
|
1858
|
+
maxRun
|
|
1859
|
+
};
|
|
1741
1860
|
}
|
|
1742
|
-
function
|
|
1743
|
-
const
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
const byLatency = providers.map((id) => {
|
|
1756
|
-
const runs = successResults.filter((r) => r.providerId === id);
|
|
1757
|
-
const avg = average(runs.map((r) => r.raw.latencyMs));
|
|
1758
|
-
return { id, avg: avg ?? Infinity };
|
|
1759
|
-
}).sort((a, b) => a.avg - b.avg)[0];
|
|
1760
|
-
if (byLatency && byLatency.avg !== Infinity) {
|
|
1761
|
-
const label = single ? "Avg latency" : `Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))}`;
|
|
1762
|
-
console.log(` ${cyan}\u25C6${reset} ${label} (avg ${Math.round(byLatency.avg)}ms)`);
|
|
1763
|
-
}
|
|
1764
|
-
const byCost = providers.map((id) => {
|
|
1765
|
-
const runs = successResults.filter((r) => r.providerId === id);
|
|
1766
|
-
const costs = runs.map((r) => {
|
|
1767
|
-
const s = r.scores.find((s2) => s2.name === "cost");
|
|
1768
|
-
return s && s.value >= 0 ? s.value : void 0;
|
|
1769
|
-
}).filter((c) => c !== void 0);
|
|
1770
|
-
const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
|
|
1771
|
-
return { id, avg };
|
|
1772
|
-
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
1773
|
-
if (byCost?.avg !== void 0) {
|
|
1774
|
-
const label = single ? "Avg cost" : `Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))}`;
|
|
1775
|
-
console.log(` ${cyan}\u25C6${reset} ${label} (avg ${formatCost(byCost.avg)})`);
|
|
1861
|
+
function aggregateProviderTask(providerId, grouped, task) {
|
|
1862
|
+
const taskResults = grouped.get(`${task}::${providerId}`) ?? [];
|
|
1863
|
+
const errorResults = taskResults.filter((r) => r.error);
|
|
1864
|
+
const successResults = taskResults.filter((r) => !r.error);
|
|
1865
|
+
if (successResults.length === 0) {
|
|
1866
|
+
return {
|
|
1867
|
+
providerId,
|
|
1868
|
+
avgScores: {},
|
|
1869
|
+
avgDetails: { costUsd: void 0, totalTokens: void 0 },
|
|
1870
|
+
latencyMs: void 0,
|
|
1871
|
+
allErrors: errorResults.length > 0,
|
|
1872
|
+
errorCount: errorResults.length
|
|
1873
|
+
};
|
|
1776
1874
|
}
|
|
1777
|
-
|
|
1778
|
-
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
}).filter((p) => p.avg !== void 0).sort((a, b) => b.avg - a.avg);
|
|
1786
|
-
return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
|
|
1875
|
+
return {
|
|
1876
|
+
providerId,
|
|
1877
|
+
avgScores: averageScores(successResults),
|
|
1878
|
+
avgDetails: averageDetails(successResults),
|
|
1879
|
+
latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
|
|
1880
|
+
allErrors: false,
|
|
1881
|
+
errorCount: errorResults.length
|
|
1882
|
+
};
|
|
1787
1883
|
}
|
|
1788
1884
|
function averageScores(results) {
|
|
1789
1885
|
const sums = {};
|
|
@@ -1827,38 +1923,89 @@ function average(nums) {
|
|
|
1827
1923
|
if (nums.length === 0) return void 0;
|
|
1828
1924
|
return nums.reduce((a, b) => a + b, 0) / nums.length;
|
|
1829
1925
|
}
|
|
1830
|
-
function
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
if (
|
|
1834
|
-
|
|
1835
|
-
|
|
1926
|
+
/**
 * Compute per-column values and best/worst figures across providers.
 *
 * For each column the result holds `values` (providerId -> value, `undefined` for
 * providers whose runs all errored) plus `best` and `worst`, taken over providers
 * that did not all error and have a defined value. For "latency", "cost" and the
 * derived "tokens" column the best value is the minimum; for every other scorer
 * the best value is the maximum.
 *
 * @param {Array<object>} providerData - Entries with `providerId`, `allErrors`,
 *   `latencyMs`, `avgDetails` ({ costUsd, totalTokens }) and `avgScores`.
 * @param {string[]} scorerNames - Scorer names to produce columns for.
 * @returns {Map<string, {values: Map, best: (number|undefined), worst: (number|undefined)}>}
 */
function computeColumnStats(providerData, scorerNames) {
  const stats = new Map();
  const usable = providerData.filter((entry) => !entry.allErrors);

  // Build one column; `lowerIsBetter` decides which extreme counts as "best".
  const summarize = (pick, lowerIsBetter) => {
    const values = new Map();
    for (const entry of providerData) {
      values.set(entry.providerId, entry.allErrors ? undefined : pick(entry));
    }
    const defined = usable.map((entry) => pick(entry)).filter((v) => v !== undefined);
    if (defined.length === 0) {
      return { values, best: undefined, worst: undefined };
    }
    const lowest = Math.min(...defined);
    const highest = Math.max(...defined);
    return lowerIsBetter
      ? { values, best: lowest, worst: highest }
      : { values, best: highest, worst: lowest };
  };

  if (scorerNames.includes("latency")) {
    stats.set("latency", summarize((entry) => entry.latencyMs, true));
  }
  if (scorerNames.includes("cost")) {
    stats.set("cost", summarize((entry) => entry.avgDetails.costUsd, true));
    stats.set("tokens", summarize((entry) => entry.avgDetails.totalTokens, true));
  }
  for (const name of scorerNames) {
    const isBuiltIn = name === "latency" || name === "cost";
    if (!isBuiltIn) {
      stats.set(name, summarize((entry) => entry.avgScores[name], false));
    }
  }
  return stats;
}
|
|
1837
|
-
function
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
|
|
1849
|
-
|
|
1850
|
-
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1976
|
+
/**
 * Award gold/silver/bronze medals to providers based on column wins.
 *
 * A provider earns one win for each column in which it alone holds the column's
 * best value. Providers are then ordered by win count (ties broken by id for
 * ordering only); providers with equal win counts share the same rank, and a
 * medal is given only to providers with at least one win whose rank is in the
 * top three. With fewer than two providers, or when nobody wins any column,
 * every provider gets "none".
 *
 * @param {Map<string, {values: Map, best: (number|undefined)}>} columnStats
 * @param {string[]} providerIds
 * @returns {Map<string, "gold"|"silver"|"bronze"|"none">}
 */
function computeMedals(columnStats, providerIds) {
  const allNone = () => {
    const blank = new Map();
    for (const id of providerIds) blank.set(id, "none");
    return blank;
  };
  if (providerIds.length < 2) {
    return allNone();
  }

  const wins = new Map();
  for (const id of providerIds) wins.set(id, 0);
  for (const [, column] of columnStats) {
    const { best } = column;
    if (best === undefined) continue;
    const leaders = [...column.values.entries()].filter(([, value]) => value === best);
    // Shared first place earns nobody a win.
    if (leaders.length !== 1) continue;
    const [[winnerId]] = leaders;
    wins.set(winnerId, (wins.get(winnerId) ?? 0) + 1);
  }

  const nobodyWon = [...wins.values()].every((count) => count === 0);
  if (nobodyWon) {
    return allNone();
  }

  const ordered = [...wins.entries()].sort(
    (left, right) => right[1] - left[1] || left[0].localeCompare(right[0])
  );
  const podium = ["gold", "silver", "bronze"];
  const medals = new Map();
  let rank = 0;
  ordered.forEach(([id, count], position) => {
    // Rank only advances when the win count strictly drops.
    if (position > 0 && count < ordered[position - 1][1]) {
      rank = position;
    }
    const medal = count > 0 && rank < podium.length ? podium[rank] : "none";
    medals.set(id, medal);
  });
  return medals;
}
|
|
1863
2010
|
function providerLabel(providerId) {
|
|
1864
2011
|
const prefix = providerId.split("/")[0];
|
|
@@ -1911,6 +2058,369 @@ function providerLabel(providerId) {
|
|
|
1911
2058
|
return `(${prefix})`;
|
|
1912
2059
|
}
|
|
1913
2060
|
}
|
|
2061
|
+
/**
 * Suggest how to fix an API-key problem, if the error text looks like one.
 *
 * The error is treated as an authentication failure when its lower-cased text
 * contains any of a fixed list of markers. The hint is chosen from the part of
 * `providerId` before the first "/".
 *
 * @param {string} providerId - Provider id such as "openai/<model>".
 * @param {string} error - Error message text.
 * @returns {string|undefined} A hint, or `undefined` for non-auth errors.
 */
function apiKeyHint(providerId, error) {
  const authMarkers = [
    "api key",
    "401",
    "unauthorized",
    "authentication",
    "incorrect api key",
    "apikey"
  ];
  const haystack = error.toLowerCase();
  const looksLikeAuthFailure = authMarkers.some((marker) => haystack.includes(marker));
  if (!looksLikeAuthFailure) {
    return undefined;
  }
  const hintsByVendor = new Map([
    ["openai", "Set: export OPENAI_API_KEY=sk-..."],
    ["azure", "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=..."],
    ["anthropic", "Set: export ANTHROPIC_API_KEY=sk-ant-..."],
    ["google", "Set: export GOOGLE_API_KEY=..."]
  ]);
  const [vendor] = providerId.split("/");
  return hintsByVendor.get(vendor) ?? `Check the API key for ${providerId}`;
}
|
|
2079
|
+
/**
 * Find the provider with the highest average value for one scorer.
 *
 * Only scores with the given name and a value of at least 0 are averaged;
 * providers with no such scores are left out.
 *
 * @param {Map<string, Array<object>>} successByProvider - Runs keyed by provider id;
 *   each run has a `scores` array of `{ name, value }`.
 * @param {string[]} providers - Provider ids to consider.
 * @param {string} scorerName - Scorer to rank by.
 * @returns {{id: string, avg: number}|undefined} The top provider, or `undefined`
 *   when no provider has a usable score.
 */
function rankProviders(successByProvider, providers, scorerName) {
  const candidates = [];
  for (const id of providers) {
    const values = [];
    for (const run of successByProvider.get(id) ?? []) {
      for (const score of run.scores) {
        if (score.name === scorerName && score.value >= 0) {
          values.push(score.value);
        }
      }
    }
    if (values.length === 0) continue;
    const total = values.reduce((sum, value) => sum + value, 0);
    candidates.push({ id, avg: total / values.length });
  }
  candidates.sort((left, right) => right.avg - left.avg);
  const [top] = candidates;
  return top ? { id: top.id, avg: top.avg } : undefined;
}
|
|
2088
|
+
/**
 * Maps a scorer name to the short label used as a column header.
 *
 * @param {string} name - Scorer name.
 * @returns {string} The short label, or `name` itself when there is no short label.
 */
function scorerLabel(name) {
  const shortLabels = new Map([
    ["correctness", "Match"],
    ["schema-correctness", "Schema"],
    ["fuzzy-similarity", "Fuzzy"],
    ["llm-judge-correctness", "Judge"],
    ["tool-usage", "Tool"],
  ]);
  return shortLabels.get(name) ?? name;
}
|
|
2104
|
+
/**
 * Converts a medal name into its emoji.
 *
 * @param {string} medal - One of "gold", "silver", "bronze" or "none".
 * @returns {string|undefined} The emoji, "" for "none", or undefined for any other value.
 */
function medalEmoji(medal) {
  const emojiByMedal = new Map([
    ["gold", "\u{1F947}"],
    ["silver", "\u{1F948}"],
    ["bronze", "\u{1F949}"],
    ["none", ""],
  ]);
  return emojiByMedal.get(medal);
}
|
|
2116
|
+
|
|
2117
|
+
// src/reporter/console.ts
|
|
2118
|
+
// ANSI SGR escape sequences used to style terminal output.
// `reset` clears all attributes; the others switch on one attribute or colour.
var reset = "\x1B[0m";
var boldCode = "\x1B[1m";
var dimCode = "\x1B[2m";
var green = "\x1B[32m";
var red = "\x1B[31m";
var yellow = "\x1B[33m";
var cyan = "\x1B[36m";
var brightGreen = "\x1B[92m";
var brightWhite = "\x1B[97m";

/**
 * Wraps text in the bold sequence followed by a reset.
 *
 * @param {string} s - Text to style.
 * @returns {string} The styled text.
 */
function bold(s) {
  return boldCode.concat(s, reset);
}

/**
 * Wraps text in the dim sequence followed by a reset.
 *
 * @param {string} s - Text to style.
 * @returns {string} The styled text.
 */
function dim(s) {
  return dimCode.concat(s, reset);
}
|
|
2133
|
+
/**
 * Removes ANSI SGR sequences (ESC "[" digits/semicolons "m") from a string.
 *
 * @param {string} s - Text that may contain styling sequences.
 * @returns {string} The text with those sequences removed.
 */
function stripAnsi(s) {
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  const plain = s.replace(sgrSequence, "");
  return plain;
}
|
|
2136
|
+
/**
 * Estimates how many terminal columns a string occupies.
 *
 * ANSI styling is removed first (via `stripAnsi`), then each code point is
 * counted as 0, 1 or 2 columns:
 *   - 0 for code points that take no column of their own (combining
 *     diacritical marks, zero-width space/joiners, variation selectors);
 *   - 2 for code points at or above U+1F000, for U+2600..U+27BF, and for
 *     East Asian wide / fullwidth ranges;
 *   - 1 for everything else.
 *
 * @param {string} s - Text, possibly containing ANSI styling sequences.
 * @returns {number} Estimated width in columns.
 */
function displayWidth(s) {
  const stripped = stripAnsi(s);
  // Inclusive [first, last] ranges of code points that occupy no column.
  const zeroWidthRanges = [
    [0x0300, 0x036f], // combining diacritical marks
    [0x200b, 0x200d], // zero-width space, non-joiner, joiner
    [0xfe00, 0xfe0f], // variation selectors (e.g. the emoji presentation selector)
  ];
  // Inclusive [first, last] ranges of code points that occupy two columns.
  const wideRanges = [
    [0x1100, 0x115f], // Hangul Jamo
    [0x2600, 0x27bf], // miscellaneous symbols and dingbats
    [0x2e80, 0x303e], // CJK radicals, symbols and punctuation
    [0x3041, 0x33ff], // Hiragana, Katakana, CJK compatibility
    [0x3400, 0x4dbf], // CJK unified ideographs extension A
    [0x4e00, 0x9fff], // CJK unified ideographs
    [0xa000, 0xa4cf], // Yi syllables and radicals
    [0xac00, 0xd7a3], // Hangul syllables
    [0xf900, 0xfaff], // CJK compatibility ideographs
    [0xfe30, 0xfe4f], // CJK compatibility forms
    [0xff00, 0xff60], // fullwidth forms
    [0xffe0, 0xffe6], // fullwidth signs
  ];
  const inRanges = (code, ranges) => ranges.some(([first, last]) => code >= first && code <= last);
  let width = 0;
  // Iterating with for...of walks code points, so surrogate pairs count once.
  for (const ch of stripped) {
    const code = ch.codePointAt(0) ?? 0;
    if (inRanges(code, zeroWidthRanges)) continue;
    if (code >= 0x1f000 || inRanges(code, wideRanges)) width += 2;
    else width += 1;
  }
  return width;
}
|
|
2147
|
+
/**
 * Pads a cell with spaces until it is `targetWidth` columns wide.
 *
 * Width is measured with `displayWidth`. Text that is already at least
 * `targetWidth` wide is returned unchanged (it is never truncated).
 *
 * @param {string} str - Cell text.
 * @param {number} targetWidth - Desired width in columns.
 * @param {string} align - "right" pads on the left; any other value pads on the right.
 * @returns {string} The padded text.
 */
function padCell(str, targetWidth, align) {
  const missing = targetWidth - displayWidth(str);
  const filler = missing > 0 ? " ".repeat(missing) : "";
  return align === "right" ? filler + str : str + filler;
}
|
|
2153
|
+
/**
 * Builds the two halves of a fixed-width bar for a ratio.
 *
 * The ratio is clamped to [0, 1]. A ratio that is not a number is treated as
 * 0 so the bar keeps its full width (an empty fill and a full-length track)
 * instead of collapsing to two empty strings.
 *
 * @param {number} ratio - Fraction of the bar to fill.
 * @param {number} [width=8] - Total bar length in characters.
 * @returns {{fill: string, track: string}} Filled part and remaining track.
 */
function sparkBar(ratio, width = 8) {
  const numeric = Number(ratio);
  // NaN would propagate through Math.min/Math.max and make both repeat() calls return "".
  const clamped = Number.isNaN(numeric) ? 0 : Math.max(0, Math.min(1, numeric));
  const fillLen = Math.round(clamped * width);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(width - fillLen);
  return { fill, track };
}
|
|
2160
|
+
/**
 * Draws one horizontal rule of a box-drawing table, dimmed.
 *
 * @param {number[]} widths - Content width of each column (each cell adds one space either side).
 * @param {string} position - "top" draws the upper border with column joints;
 *   "bottom" and "merge" draw an unbroken rule across the full inner width
 *   (closing border and full-width separator respectively); any other value
 *   draws a separator with column crossings.
 * @returns {string} The rule.
 */
function drawTableLine(widths, position) {
  const horizontal = "\u2500";
  if (position === "bottom" || position === "merge") {
    // Inner width: every padded column plus one divider between neighbours.
    let innerWidth = widths.length - 1;
    for (const w of widths) {
      innerWidth += w + 2;
    }
    const [left, right] = position === "bottom" ? ["\u2514", "\u2518"] : ["\u251C", "\u2524"];
    return dim(`${left}${horizontal.repeat(innerWidth)}${right}`);
  }
  const runs = widths.map((w) => horizontal.repeat(w + 2));
  return position === "top"
    ? dim(`\u250C${runs.join("\u252C")}\u2510`)
    : dim(`\u251C${runs.join("\u253C")}\u2524`);
}
|
|
2174
|
+
/**
 * Draws one table row: cells padded to their column width, each with a space
 * either side, separated and enclosed by dimmed vertical bars.
 *
 * @param {string[]} cells - Cell texts.
 * @param {number[]} widths - Column widths, by cell index.
 * @param {string[]} aligns - Column alignments, by cell index (passed to `padCell`).
 * @returns {string} The row.
 */
function drawTableRow(cells, widths, aligns) {
  const body = cells.map((cell, i) => ` ${padCell(cell, widths[i], aligns[i])} `);
  const divider = dim("\u2502");
  return `${divider}${body.join(divider)}${divider}`;
}
|
|
2180
|
+
/**
 * Draws a row whose content spans every column of the table.
 *
 * The content is preceded by one space and padded on the right to the inner
 * width of the table; content wider than that is not truncated.
 *
 * @param {string} content - Text to show (may contain ANSI styling).
 * @param {number[]} widths - Content width of each table column.
 * @returns {string} The row, enclosed by dimmed vertical bars.
 */
function drawSpanRow(content, widths) {
  // Inner width: every padded column plus one divider between neighbours.
  let innerWidth = widths.length - 1;
  for (const w of widths) {
    innerWidth += w + 2;
  }
  // One column is taken by the leading space.
  const slack = innerWidth - displayWidth(content) - 1;
  const filler = " ".repeat(Math.max(0, slack));
  const edge = dim("\u2502");
  return `${edge} ${content}${filler}${edge}`;
}
|
|
2186
|
+
/**
 * Colours a cell according to how its value compares with the column's best
 * and worst values.
 *
 * @param {string} text - Already formatted cell text.
 * @param {number|undefined} value - Raw value behind `text`.
 * @param {{best?: number, worst?: number}} colStats - Best and worst values of the column.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} A dimmed em dash when `value` is undefined; `text`
 *   unchanged when there is nothing to compare (fewer than two providers,
 *   missing stats, or best equal to worst); otherwise `text` in bright
 *   green + bold for the best value, red for the worst, yellow in between.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === void 0) {
    return dim("\u2014");
  }
  if (providerCount < 2) {
    return text;
  }
  const { best, worst } = colStats;
  const nothingToCompare = best === void 0 || worst === void 0 || best === worst;
  if (nothingToCompare) {
    return text;
  }
  let prefix = yellow;
  if (value === best) {
    prefix = `${brightGreen}${boldCode}`;
  } else if (value === worst) {
    prefix = red;
  }
  return `${prefix}${text}${reset}`;
}
|
|
2195
|
+
/**
 * Prints a benchmark report to the terminal.
 *
 * Output, in order: a banner, one table per task (one row per provider, one
 * column per scorer, plus a "Status" column when any run failed), a summary
 * (via `printSummary`), the distinct errors with an occurrence count and an
 * optional API-key hint, and a pricing note when the "cost" scorer is active.
 *
 * @param {Array<object>} results - Benchmark results; entries with a truthy
 *   `error` are treated as failed runs.
 * @param {{sparklines?: boolean}} [options] - `sparklines` (default true)
 *   adds a bar next to each percentage score.
 * @returns {void}
 */
function consoleReporter(results, options) {
  const showSparklines = options?.sparklines ?? true;
  if (results.length === 0) {
    console.log("\nNo results to display.\n");
    return;
  }
  const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runsPerCell = maxRun;
  const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
  console.log("");
  console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");
  for (const task of tasks) {
    console.log(` ${bold(`Task: ${task}`)}`);
    console.log("");
    const providerData = providers.map(
      (providerId) => aggregateProviderTask(providerId, grouped, task)
    );
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    const maxProviderLen = Math.max(...providers.map((id) => id.length));
    const providerWidth = Math.min(35, Math.max(22, maxProviderLen + 5));
    const cols = [
      { label: "Provider", width: providerWidth, align: "left" }
    ];
    for (const name of scorerNames) {
      if (name === "latency") {
        cols.push({ label: "Latency", width: 10, align: "right", statsKey: "latency" });
      } else if (name === "cost") {
        // The cost scorer contributes two columns: the cost itself and the token count.
        cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
        cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
      } else {
        cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
      }
    }
    if (hasErrors) {
      cols.push({ label: "Status", width: 8, align: "left" });
    }
    const widths = cols.map((c) => c.width);
    const aligns = cols.map((c) => c.align);
    console.log(` ${drawTableLine(widths, "top")}`);
    const headerCells = cols.map((c) => bold(c.label));
    console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
    console.log(` ${drawTableLine(widths, "header")}`);
    for (const pd of providerData) {
      const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
      const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
      const cells = [providerCell];
      for (const col of cols.slice(1)) {
        if (col.label === "Status") {
          if (pd.allErrors) {
            cells.push(`${red}FAIL${reset}`);
          } else {
            cells.push(
              pd.errorCount > 0 ? `${yellow}${pd.errorCount} err${reset}` : `${green}OK${reset}`
            );
          }
          continue;
        }
        if (pd.allErrors) {
          // Every run failed for this provider: there is nothing to show in score columns.
          cells.push(dim("\u2014"));
          continue;
        }
        const statsKey = col.statsKey;
        const colStats = columnStats.get(statsKey);
        if (statsKey === "latency") {
          const ms = pd.latencyMs;
          if (ms === void 0) {
            cells.push(dim("\u2014"));
          } else {
            const text = `${Math.round(ms)}ms`;
            cells.push(colStats ? colorByRank(text, ms, colStats, providers.length) : text);
          }
        } else if (statsKey === "cost") {
          const cost = pd.avgDetails.costUsd;
          if (cost === void 0) {
            cells.push(dim("\u2014"));
          } else {
            const text = formatCost(cost);
            cells.push(colStats ? colorByRank(text, cost, colStats, providers.length) : text);
          }
        } else if (statsKey === "tokens") {
          const tokens = pd.avgDetails.totalTokens;
          if (tokens === void 0) {
            cells.push(dim("\u2014"));
          } else {
            const text = `${tokens}`;
            cells.push(colStats ? colorByRank(text, tokens, colStats, providers.length) : text);
          }
        } else {
          const val = pd.avgScores[statsKey];
          if (val === void 0) {
            cells.push(dim("\u2014"));
          } else {
            const pctStr = `${Math.round(val * 100)}%`.padStart(4);
            let coloredPct;
            if (multi && colStats) {
              // Several providers: colour relative to the other providers.
              coloredPct = colorByRank(pctStr, val, colStats, providers.length);
            } else {
              // Otherwise colour by absolute thresholds.
              if (val >= 0.8) coloredPct = `${green}${pctStr}${reset}`;
              else if (val >= 0.5) coloredPct = `${yellow}${pctStr}${reset}`;
              else coloredPct = `${red}${pctStr}${reset}`;
            }
            if (showSparklines) {
              const { fill, track } = sparkBar(val);
              const barColor = val >= 0.8 ? green : val >= 0.5 ? yellow : red;
              cells.push(`${coloredPct} ${barColor}${fill}${reset}${dim(track)}`);
            } else {
              cells.push(coloredPct);
            }
          }
        }
      }
      console.log(` ${drawTableRow(cells, widths, aligns)}`);
    }
    if (multi && providerData.some((p) => !p.allErrors)) {
      const winnerId = [...medals.entries()].find(([, m]) => m === "gold")?.[0];
      if (winnerId) {
        console.log(` ${drawTableLine(widths, "merge")}`);
        const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
        console.log(` ${drawSpanRow(winnerText, widths)}`);
      }
    }
    console.log(` ${drawTableLine(widths, "bottom")}`);
    console.log("");
  }
  printSummary(results, providers, byProvider);
  const errorResults = results.filter((r) => r.error);
  if (errorResults.length > 0) {
    console.log(` ${bold("Errors")}`);
    console.log(` ${dim("\u2501".repeat(72))}`);
    // Count each distinct provider/error pair in one pass. A Map keeps
    // insertion order, so errors are still listed by first occurrence.
    const occurrences = /* @__PURE__ */ new Map();
    for (const r of errorResults) {
      const key = `${r.providerId}::${r.error}`;
      const entry = occurrences.get(key);
      if (entry) {
        entry.count += 1;
      } else {
        occurrences.set(key, { first: r, count: 1 });
      }
    }
    for (const { first, count } of occurrences.values()) {
      const suffix = count > 1 ? ` (\xD7${count})` : "";
      console.log(` ${red}\u2716${reset} ${first.providerId}: ${first.error}${suffix}`);
      const hint = apiKeyHint(first.providerId, first.error ?? "");
      if (hint) console.log(` ${dim(hint)}`);
    }
    console.log("");
  }
  if (hasCost) {
    console.log(dim(` Costs estimated from OpenRouter pricing catalog. Run npx tsx scripts/update-pricing.ts to refresh.`));
    console.log("");
  }
}
|
|
2349
|
+
/**
 * Prints the cross-task summary: best correctness, lowest latency and lowest
 * cost, followed (when several providers compete) by an overall winner line.
 *
 * Nothing is printed when every result failed. With a single provider the
 * lines show that provider's averages instead of naming a winner.
 *
 * @param {Array<object>} results - All benchmark results.
 * @param {string[]} providers - Provider ids, in display order.
 * @param {Map<string, Array<object>>} byProvider - Results per provider id.
 * @returns {void}
 */
function printSummary(results, providers, byProvider) {
  const successResults = results.filter((r) => !r.error);
  if (successResults.length === 0) return;
  const successByProvider = /* @__PURE__ */ new Map();
  for (const id of providers) {
    const own = byProvider.get(id) ?? [];
    successByProvider.set(id, own.filter((r) => !r.error));
  }
  console.log(` ${bold("Summary")}`);
  console.log(` ${dim("\u2501".repeat(72))}`);
  console.log("");
  const single = providers.length === 1;
  const marker = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
  const highlight = (text) => `${brightGreen}${boldCode}${text}${reset}`;
  // Prints one summary line; the wording depends on whether providers are being compared.
  const announce = (singleLabel, multiLabel, id, valueText) => {
    if (single) {
      console.log(` ${marker} ${singleLabel} ${highlight(valueText)}`);
    } else {
      console.log(` ${marker} ${multiLabel} ${bold(id)} ${dim(providerLabel(id))} ${highlight(valueText)}`);
    }
  };

  // Correctness: use the LLM judge's scores when any valid one exists.
  const judged = successResults.some(
    (r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
  );
  const correctnessKey = judged ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
  if (byCorrectness) {
    announce("Avg correctness:", "Most correct:", byCorrectness.id, `${Math.round(byCorrectness.avg * 100)}%`);
  }

  // Latency: lowest average wins; providers without an average sort last.
  const latencyRanking = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    return { id, avg: average(runs.map((r) => r.raw.latencyMs)) ?? Infinity };
  });
  latencyRanking.sort((a, b) => a.avg - b.avg);
  const byLatency = latencyRanking[0];
  const hasLatencyWinner = Boolean(byLatency) && byLatency.avg !== Infinity;
  if (hasLatencyWinner) {
    announce("Avg latency:", "Fastest:", byLatency.id, `${Math.round(byLatency.avg)}ms`);
  }

  // Cost: lowest average of non-negative "cost" scores wins.
  const costRanking = [];
  for (const id of providers) {
    const costs = [];
    for (const run of successByProvider.get(id) ?? []) {
      const costScore = run.scores.find((s) => s.name === "cost");
      if (costScore && costScore.value >= 0) {
        costs.push(costScore.value);
      }
    }
    if (costs.length > 0) {
      costRanking.push({ id, avg: costs.reduce((a, b) => a + b, 0) / costs.length });
    }
  }
  costRanking.sort((a, b) => a.avg - b.avg);
  const byCost = costRanking[0];
  const hasCostWinner = byCost !== void 0;
  if (hasCostWinner) {
    announce("Avg cost:", "Cheapest:", byCost.id, formatCost(byCost.avg));
  }

  if (!single) {
    // One point per category won; the provider with the most points wins overall.
    const wins = /* @__PURE__ */ new Map();
    for (const id of providers) wins.set(id, 0);
    const credit = (id) => wins.set(id, (wins.get(id) ?? 0) + 1);
    if (byCorrectness) credit(byCorrectness.id);
    if (hasLatencyWinner) credit(byLatency.id);
    if (hasCostWinner) credit(byCost.id);
    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
      console.log("");
      if (topProviders.length === 1) {
        const [winnerId, winCount] = topProviders[0];
        console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
      } else {
        const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
        console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
      }
    }
  }
  console.log("");
}
|
|
1914
2424
|
|
|
1915
2425
|
// src/reporter/json.ts
|
|
1916
2426
|
function jsonReporter(results) {
|
|
@@ -1945,7 +2455,7 @@ function defineArena(config) {
|
|
|
1945
2455
|
throw new Error("At least one task is required");
|
|
1946
2456
|
}
|
|
1947
2457
|
const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
|
|
1948
|
-
const scorerFns = resolveScorers(scorerNames, config.judgeModel);
|
|
2458
|
+
const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
|
|
1949
2459
|
const runs = config.runs ?? 1;
|
|
1950
2460
|
return {
|
|
1951
2461
|
config,
|
|
@@ -1955,141 +2465,13 @@ function defineArena(config) {
|
|
|
1955
2465
|
tasks: config.tasks,
|
|
1956
2466
|
scorers: scorerFns,
|
|
1957
2467
|
runs,
|
|
2468
|
+
timeout: config.timeout,
|
|
1958
2469
|
onResult: options?.onResult
|
|
1959
2470
|
});
|
|
1960
2471
|
}
|
|
1961
2472
|
};
|
|
1962
2473
|
}
|
|
1963
2474
|
|
|
1964
|
-
// src/providers/openai.ts
|
|
1965
|
-
import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
|
|
1966
|
-
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
1967
|
-
function openai(model, options) {
|
|
1968
|
-
const client = new OpenAI2({
|
|
1969
|
-
apiKey: options?.apiKey ?? process.env.OPENAI_API_KEY,
|
|
1970
|
-
baseURL: options?.baseURL
|
|
1971
|
-
});
|
|
1972
|
-
return makeProvider(`openai/${model}`, "OpenAI", model, client, model);
|
|
1973
|
-
}
|
|
1974
|
-
function openaiCompatible(options) {
|
|
1975
|
-
const apiKey = options.apiKey ?? (options.apiKeyEnv ? process.env[options.apiKeyEnv] : void 0) ?? "no-key";
|
|
1976
|
-
const client = new OpenAI2({
|
|
1977
|
-
apiKey,
|
|
1978
|
-
baseURL: options.baseURL
|
|
1979
|
-
});
|
|
1980
|
-
if (options.free) {
|
|
1981
|
-
registerPricing(options.id, { inputPerToken: 0, outputPerToken: 0 });
|
|
1982
|
-
}
|
|
1983
|
-
return makeProvider(options.id, options.name, options.model, client, options.model, options.stripThinking);
|
|
1984
|
-
}
|
|
1985
|
-
function azureOpenai(model, options) {
|
|
1986
|
-
const deployment = options?.deployment ?? model;
|
|
1987
|
-
const client = new AzureOpenAI2({
|
|
1988
|
-
apiKey: options?.apiKey ?? process.env.AZURE_OPENAI_API_KEY,
|
|
1989
|
-
endpoint: options?.endpoint ?? process.env.AZURE_OPENAI_ENDPOINT,
|
|
1990
|
-
apiVersion: options?.apiVersion ?? process.env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
|
|
1991
|
-
deployment
|
|
1992
|
-
});
|
|
1993
|
-
return makeProvider(`azure/${model}`, "Azure OpenAI", model, client, deployment);
|
|
1994
|
-
}
|
|
1995
|
-
function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
1996
|
-
return {
|
|
1997
|
-
id,
|
|
1998
|
-
name,
|
|
1999
|
-
model,
|
|
2000
|
-
async run(input) {
|
|
2001
|
-
const start = Date.now();
|
|
2002
|
-
const params = {
|
|
2003
|
-
model: requestModel,
|
|
2004
|
-
messages: [{ role: "user", content: input.prompt }]
|
|
2005
|
-
};
|
|
2006
|
-
if (input.schema) {
|
|
2007
|
-
params.response_format = { type: "json_object" };
|
|
2008
|
-
params.messages = [
|
|
2009
|
-
{ role: "system", content: "Respond with valid JSON matching the requested schema." },
|
|
2010
|
-
...params.messages
|
|
2011
|
-
];
|
|
2012
|
-
}
|
|
2013
|
-
if (input.tools?.length) {
|
|
2014
|
-
params.tools = input.tools.map(toolDefToOpenAI);
|
|
2015
|
-
params.tool_choice = "auto";
|
|
2016
|
-
}
|
|
2017
|
-
const response = await client.chat.completions.create(params);
|
|
2018
|
-
let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
|
|
2019
|
-
let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
|
|
2020
|
-
const choice = response.choices[0];
|
|
2021
|
-
const toolCallsRaw = choice?.message?.tool_calls;
|
|
2022
|
-
const collectedToolCalls = [];
|
|
2023
|
-
let finalResponse = response;
|
|
2024
|
-
if (toolCallsRaw?.length && input.tools?.length) {
|
|
2025
|
-
const toolMessages = [
|
|
2026
|
-
...params.messages,
|
|
2027
|
-
choice.message
|
|
2028
|
-
];
|
|
2029
|
-
for (const tc of toolCallsRaw) {
|
|
2030
|
-
const toolDef = input.tools.find((t) => t.name === tc.function.name);
|
|
2031
|
-
let args;
|
|
2032
|
-
try {
|
|
2033
|
-
args = JSON.parse(tc.function.arguments);
|
|
2034
|
-
} catch {
|
|
2035
|
-
args = tc.function.arguments;
|
|
2036
|
-
}
|
|
2037
|
-
let result;
|
|
2038
|
-
if (toolDef?.handler) {
|
|
2039
|
-
result = await toolDef.handler(args);
|
|
2040
|
-
}
|
|
2041
|
-
collectedToolCalls.push({ name: tc.function.name, arguments: args, result });
|
|
2042
|
-
toolMessages.push({
|
|
2043
|
-
role: "tool",
|
|
2044
|
-
tool_call_id: tc.id,
|
|
2045
|
-
content: JSON.stringify(result ?? {})
|
|
2046
|
-
});
|
|
2047
|
-
}
|
|
2048
|
-
const followUp = await client.chat.completions.create({
|
|
2049
|
-
model: requestModel,
|
|
2050
|
-
messages: toolMessages
|
|
2051
|
-
});
|
|
2052
|
-
totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
|
|
2053
|
-
totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
|
|
2054
|
-
finalResponse = followUp;
|
|
2055
|
-
}
|
|
2056
|
-
const latencyMs = Date.now() - start;
|
|
2057
|
-
const finalChoice = finalResponse.choices[0];
|
|
2058
|
-
let rawContent = finalChoice?.message?.content ?? "";
|
|
2059
|
-
if (stripThinking) {
|
|
2060
|
-
rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
|
|
2061
|
-
}
|
|
2062
|
-
let output = rawContent;
|
|
2063
|
-
if (input.schema) {
|
|
2064
|
-
try {
|
|
2065
|
-
output = JSON.parse(rawContent);
|
|
2066
|
-
} catch {
|
|
2067
|
-
}
|
|
2068
|
-
}
|
|
2069
|
-
return {
|
|
2070
|
-
output,
|
|
2071
|
-
usage: {
|
|
2072
|
-
promptTokens: totalPromptTokens || void 0,
|
|
2073
|
-
completionTokens: totalCompletionTokens || void 0
|
|
2074
|
-
},
|
|
2075
|
-
latencyMs,
|
|
2076
|
-
raw: finalResponse,
|
|
2077
|
-
toolCalls: collectedToolCalls.length > 0 ? collectedToolCalls : void 0
|
|
2078
|
-
};
|
|
2079
|
-
}
|
|
2080
|
-
};
|
|
2081
|
-
}
|
|
2082
|
-
function toolDefToOpenAI(tool) {
|
|
2083
|
-
return {
|
|
2084
|
-
type: "function",
|
|
2085
|
-
function: {
|
|
2086
|
-
name: tool.name,
|
|
2087
|
-
description: tool.description,
|
|
2088
|
-
parameters: zodToJsonSchema(tool.parameters, { target: "openAi" })
|
|
2089
|
-
}
|
|
2090
|
-
};
|
|
2091
|
-
}
|
|
2092
|
-
|
|
2093
2475
|
// src/providers/anthropic.ts
|
|
2094
2476
|
import Anthropic from "@anthropic-ai/sdk";
|
|
2095
2477
|
function anthropic(model, options) {
|
|
@@ -2103,23 +2485,17 @@ function anthropic(model, options) {
|
|
|
2103
2485
|
model,
|
|
2104
2486
|
async run(input) {
|
|
2105
2487
|
const start = Date.now();
|
|
2106
|
-
const systemMessage = input.schema ?
|
|
2488
|
+
const systemMessage = input.schema ? SCHEMA_SYSTEM_MESSAGE : void 0;
|
|
2107
2489
|
const response = await client.messages.create({
|
|
2108
2490
|
model,
|
|
2109
2491
|
max_tokens: maxTokens,
|
|
2110
2492
|
system: systemMessage,
|
|
2111
2493
|
messages: [{ role: "user", content: input.prompt }]
|
|
2112
|
-
});
|
|
2494
|
+
}, { signal: input.signal });
|
|
2113
2495
|
const latencyMs = Date.now() - start;
|
|
2114
2496
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2115
2497
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
2116
|
-
|
|
2117
|
-
if (input.schema) {
|
|
2118
|
-
try {
|
|
2119
|
-
output = JSON.parse(rawContent);
|
|
2120
|
-
} catch {
|
|
2121
|
-
}
|
|
2122
|
-
}
|
|
2498
|
+
const output = parseSchemaOutput(rawContent, !!input.schema);
|
|
2123
2499
|
return {
|
|
2124
2500
|
output,
|
|
2125
2501
|
usage: {
|
|
@@ -2133,30 +2509,1024 @@ function anthropic(model, options) {
|
|
|
2133
2509
|
};
|
|
2134
2510
|
}
|
|
2135
2511
|
|
|
2136
|
-
// src/
|
|
2137
|
-
|
|
2138
|
-
function
|
|
2139
|
-
const
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
|
|
2143
|
-
|
|
2512
|
+
// src/reporter/markdown.ts
|
|
2513
|
+
// HTML comment emitted as the first line of every Markdown report.
var COMMENT_MARKER = "<!-- duelist-ci-report -->";

/**
 * Renders a CI report as Markdown.
 *
 * Sections appear in this order, each only when it has content: comparison
 * table, cost summary, flaky results, failure reasons. The report always
 * starts with `COMMENT_MARKER` and a pass/fail heading and ends with a
 * footer line.
 *
 * @param {object} report - CI report; reads `failed`, `comparisons`, `cost`,
 *   `flakyResults` and `failureReasons`.
 * @param {unknown} _current - Not used.
 * @returns {string} The Markdown document.
 */
function markdownReporter(report, _current) {
  const headline = report.failed ? "\u{1F534} Failed" : "\u{1F7E2} Passed";
  const out = [COMMENT_MARKER, "", `## \u2B21 Agent Duelist CI \u2014 ${headline}`, ""];
  if (report.comparisons.length > 0) {
    out.push(markdownComparisonTable(report.comparisons), "");
  }
  const { cost } = report;
  if (cost.totalUsd > 0 || cost.budget !== void 0) {
    out.push(markdownCostSummary(cost), "");
  }
  if (report.flakyResults.length > 0) {
    out.push(
      "### \u26A0\uFE0F Flaky Results",
      "",
      "These scorer/task combinations have high variance (CV > 0.3). Consider increasing `runs` or tightening prompts.",
      ""
    );
    for (const flaky of report.flakyResults) {
      out.push(`- **${flaky.providerId}** \xD7 ${flaky.taskName} \u2192 ${flaky.scorerName} (CV = ${flaky.current.cv.toFixed(2)})`);
    }
    out.push("");
  }
  if (report.failureReasons.length > 0) {
    out.push("### Failure Reasons", "");
    for (const reason of report.failureReasons) {
      out.push(`- ${reason}`);
    }
    out.push("");
  }
  out.push("---", "*Generated by [agent-duelist](https://github.com/DataGobes/agent-duelist) CI*");
  return out.join("\n");
}
|
|
2549
|
+
/**
 * Makes free text safe to place inside a Markdown table cell.
 *
 * A literal "|" would end the cell and a line break would end the row, so
 * pipes are backslash-escaped and line breaks are replaced by a space.
 *
 * @param {unknown} value - Cell content; converted to a string.
 * @returns {string} Text that stays within one table cell.
 */
function escapeMarkdownTableCell(value) {
  return String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}

/**
 * Renders baseline-vs-current comparisons as a Markdown table.
 *
 * Provider, task and scorer names are escaped so that a "|" or a line break
 * in a name cannot break the table layout.
 *
 * @param {Array<object>} comparisons - One entry per provider/task/scorer;
 *   reads `providerId`, `taskName`, `scorerName`, `baseline`, `current` and
 *   `delta` (a falsy `baseline` or a null `delta` is shown as an em dash).
 * @returns {string} Header, separator and one row per comparison, joined by newlines.
 */
function markdownComparisonTable(comparisons) {
  const lines = [];
  lines.push("| Provider | Task | Scorer | Baseline | Current | Delta | Status |");
  lines.push("|----------|------|--------|----------|---------|-------|--------|");
  for (const c of comparisons) {
    const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
    const currentStr = formatStats(c.current);
    const deltaStr = c.delta !== null ? formatDelta(c.delta, 3) : "\u2014";
    const status = statusIndicator(c);
    const provider = escapeMarkdownTableCell(c.providerId);
    const task = escapeMarkdownTableCell(c.taskName);
    const scorer = escapeMarkdownTableCell(c.scorerName);
    lines.push(`| ${provider} | ${task} | ${scorer} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
  }
  return lines.join("\n");
}
|
|
2562
|
+
/**
 * Renders the cost section of the Markdown report.
 *
 * @param {{totalUsd: number, budget?: number, overBudget?: boolean, perProvider: Map<string, number>}} cost
 *   Total spend, optional budget with its over-budget flag, and spend per provider.
 * @returns {string} Heading and total; a budget line when a budget is set;
 *   a per-provider table when more than one provider has an entry.
 */
function markdownCostSummary(cost) {
  const out = ["### \u{1F4B0} Cost Summary", "", `**Total:** $${cost.totalUsd.toFixed(4)}`];
  if (cost.budget !== void 0) {
    // A budget of zero (or less) cannot yield a percentage, so it is shown as the infinity sign.
    let usedPct = "\u221E";
    if (cost.budget > 0) {
      usedPct = (cost.totalUsd / cost.budget * 100).toFixed(0);
    }
    const light = cost.overBudget ? "\u{1F534}" : "\u{1F7E2}";
    out.push(`**Budget:** $${cost.budget.toFixed(2)} (${usedPct}% used) ${light}`);
  }
  if (cost.perProvider.size > 1) {
    out.push("", "| Provider | Cost |", "|----------|------|");
    cost.perProvider.forEach((usd, provider) => {
      out.push(`| ${provider} | $${usd.toFixed(4)} |`);
    });
  }
  return out.join("\n");
}
|
|
2582
|
+
/**
 * Formats summary statistics for a table cell.
 *
 * @param {{n: number, mean: number, ci95Lower?: number, ci95Upper?: number}} stats
 *   Sample count, mean and the bounds of the 95% interval.
 * @returns {string} The mean with three decimals, followed by "± half the
 *   interval width" when there is more than one sample.
 */
function formatStats(stats) {
  const mean = stats.mean.toFixed(3);
  if (!(stats.n > 1)) {
    return mean;
  }
  const halfWidth = (stats.ci95Upper - stats.ci95Lower) / 2;
  return `${mean} \xB1 ${halfWidth.toFixed(3)}`;
}
|
|
2589
|
+
/**
 * Picks the status label for one comparison row.
 *
 * Precedence: regressed, then improved, then new (no baseline), then unchanged.
 *
 * @param {{regressed?: boolean, improved?: boolean, baseline: object|null}} c - Comparison entry.
 * @returns {string} Emoji plus label.
 */
function statusIndicator(c) {
  return c.regressed
    ? "\u{1F534} regressed"
    : c.improved
      ? "\u{1F7E2} improved"
      : c.baseline === null
        ? "\u{1F195} new"
        : "\u26AA unchanged";
}
|
|
2595
|
+
|
|
2596
|
+
// src/reporter/html.ts
|
|
2597
|
+
/**
 * Escapes text for safe inclusion in HTML content and quoted attribute values.
 *
 * The five characters with special meaning in HTML (& < > " ') are replaced
 * by entity references in a single pass, so already-produced entities are not
 * escaped a second time within the same call.
 *
 * @param {string} s - Raw text.
 * @returns {string} The escaped text.
 */
function esc(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  return s.replace(/[&<>"']/g, (ch) => entities[ch]);
}
|
|
2600
|
+
/**
 * Renders benchmark results as a standalone HTML document.
 *
 * Per-task data (aggregates, column stats, medals, winner) is prepared first,
 * then the cross-task winners for correctness, latency and cost, then the
 * overall winner (only when several providers compete and exactly one of them
 * wins the most categories). The page itself is assembled from the `render*`
 * helpers.
 *
 * @param {Array<object>} results - Benchmark results; entries with a truthy
 *   `error` are treated as failed runs.
 * @returns {string} The HTML page; the result of `emptyReport()` when there are no results.
 */
function htmlReporter(results) {
  if (results.length === 0) {
    return emptyReport();
  }
  const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";

  const taskSections = [];
  for (const task of tasks) {
    const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    // A per-task winner is only named when providers are being compared.
    const winnerId = multi ? [...medals.entries()].find(([, m]) => m === "gold")?.[0] : void 0;
    taskSections.push({ task, providerData, columnStats, medals, winnerId });
  }

  const successResults = results.filter((r) => !r.error);
  const successByProvider = /* @__PURE__ */ new Map();
  for (const id of providers) {
    const own = byProvider.get(id) ?? [];
    successByProvider.set(id, own.filter((r) => !r.error));
  }

  // Correctness: use the LLM judge's scores when any valid one exists.
  const judged = successResults.some(
    (r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
  );
  const correctnessKey = judged ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);

  // Latency: lowest average wins; providers without an average sort last.
  const latencyRanking = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    return { id, avg: average(runs.map((r) => r.raw.latencyMs)) ?? Infinity };
  });
  latencyRanking.sort((a, b) => a.avg - b.avg);
  const byLatency = latencyRanking[0];

  // Cost: lowest average of non-negative "cost" scores wins.
  const costRanking = [];
  for (const id of providers) {
    const costs = [];
    for (const run of successByProvider.get(id) ?? []) {
      const costScore = run.scores.find((s) => s.name === "cost");
      if (costScore && costScore.value >= 0) {
        costs.push(costScore.value);
      }
    }
    if (costs.length > 0) {
      costRanking.push({ id, avg: costs.reduce((a, b) => a + b, 0) / costs.length });
    }
  }
  costRanking.sort((a, b) => a.avg - b.avg);
  const byCost = costRanking[0];

  let overallWinner;
  if (multi) {
    // One point per category won; a tie for the most points leaves no overall winner.
    const wins = /* @__PURE__ */ new Map();
    for (const id of providers) wins.set(id, 0);
    const credit = (id) => wins.set(id, (wins.get(id) ?? 0) + 1);
    if (byCorrectness) credit(byCorrectness.id);
    if (byLatency && byLatency.avg !== Infinity) credit(byLatency.id);
    if (byCost?.avg !== void 0) credit(byCost.id);
    const maxWins = Math.max(...wins.values());
    if (maxWins > 0) {
      const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
      if (tops.length === 1) overallWinner = tops[0][0];
    }
  }

  const errorResults = results.filter((r) => r.error);
  const deduped = dedupeErrors(errorResults);
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
<meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${providers.length !== 1 ? "s" : ""}, ${tasks.length} task${tasks.length !== 1 ? "s" : ""}">
<meta property="og:title" content="Agent Duelist Report">
<meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
<meta property="og:type" content="website">
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">

${renderHeader(runsLabel, providers.length, tasks.length)}

${tasks.length > 1 ? renderTabs(tasks) : ""}

<main>
${taskSections.map((s, i) => renderTaskSection(
    s.task,
    s.providerData,
    s.columnStats,
    s.medals,
    s.winnerId,
    scorerNames,
    hasCost,
    multi,
    i
  )).join("\n")}
</main>

${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}

${deduped.length > 0 ? renderErrors(deduped) : ""}

${renderFooter()}

</div>
${renderScript(tasks.length)}
</body>
</html>`;
}
|
|
2698
|
+
/**
 * Build the placeholder HTML document shown when there are no results.
 *
 * The page reuses the shared stylesheet, header (rendered with "0 runs" and
 * zero provider/task counts) and footer, and contains a single
 * "No results to display." message in place of the task sections.
 *
 * @returns {string} A complete HTML document.
 */
function emptyReport() {
  const styleBlock = renderStyle();
  const headerBlock = renderHeader("0 runs", 0, 0);
  const footerBlock = renderFooter();
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
${styleBlock}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">
${headerBlock}
<main><p class="empty-msg">No results to display.</p></main>
${footerBlock}
</div>
</body>
</html>`;
}
|
|
2717
|
+
/**
 * Collapse repeated failures into one entry per (provider, error message).
 *
 * @param {Array<{providerId: string, error?: string}>} errorResults
 *   Results that carry an error.
 * @returns {Array<{providerId: string, error: string, count: number, hint: *}>}
 *   One entry per distinct provider/error pair, in first-seen order. `count`
 *   is the number of occurrences; `hint` is whatever `apiKeyHint` returned
 *   for the first occurrence.
 */
function dedupeErrors(errorResults) {
  const buckets = /* @__PURE__ */ new Map();
  for (const { providerId, error } of errorResults) {
    const bucketKey = `${providerId}::${error}`;
    const bucket = buckets.get(bucketKey);
    if (bucket) {
      bucket.count++;
      continue;
    }
    buckets.set(bucketKey, {
      providerId,
      error: error ?? "Unknown error",
      count: 1,
      hint: apiKeyHint(providerId, error ?? "")
    });
  }
  return Array.from(buckets.values());
}
|
|
2735
|
+
/**
 * Return the report's inline stylesheet wrapped in a `<style>` element.
 *
 * The CSS is static: theme variables on `:root`, the animated background
 * mesh, header, task tabs, results table, rank colours, winner banner,
 * summary cards, error list, footer, empty state and a small-screen
 * breakpoint at 640px.
 *
 * @returns {string} `<style>…</style>` markup.
 */
function renderStyle() {
  const stylesheet = `<style>
:root {
--bg: #0f172a;
--bg-deep: #020617;
--panel: rgba(15, 23, 42, 0.85);
--accent: #f59e0b;
--accent-soft: rgba(245, 158, 11, 0.15);
--text: #e2e8f0;
--muted: #94a3b8;
--border: rgba(148, 163, 184, 0.15);
--green: #22c55e;
--red: #ef4444;
--yellow: #eab308;
--radius: 12px;
--mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
--sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
html, body {
font-family: var(--sans);
background: var(--bg);
color: var(--text);
min-height: 100vh;
}
body { padding: 24px; display: flex; justify-content: center; }

/* Animated gradient mesh */
.bg-mesh {
position: fixed; inset: 0; z-index: 0;
overflow: hidden; pointer-events: none;
}
.bg-mesh::before, .bg-mesh::after {
content: ""; position: absolute; border-radius: 50%;
filter: blur(120px); opacity: 0.4;
}
.bg-mesh::before {
width: 600px; height: 600px;
background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
top: -10%; left: -5%;
animation: meshDrift1 18s ease-in-out infinite alternate;
}
.bg-mesh::after {
width: 500px; height: 500px;
background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
bottom: -10%; right: -5%;
animation: meshDrift2 22s ease-in-out infinite alternate;
}
.bg-mesh-extra {
position: absolute; width: 400px; height: 400px;
border-radius: 50%; filter: blur(100px); opacity: 0.3;
background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
top: 50%; left: 60%;
animation: meshDrift3 15s ease-in-out infinite alternate;
}
@keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
@keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
@keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }

/* Report container */
.report {
position: relative; z-index: 1;
width: 100%; max-width: 960px;
}

/* Header */
.report-header {
display: flex; justify-content: space-between; align-items: center;
padding: 20px 0; margin-bottom: 8px;
}
.report-brand {
display: flex; align-items: center; gap: 10px;
text-decoration: none; color: var(--muted);
font-weight: 600; font-size: 14px;
letter-spacing: 0.04em; text-transform: uppercase;
}
.report-brand:hover { color: var(--text); }
.brand-icon {
width: 32px; height: 32px; border-radius: 8px;
background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
border: 1px solid rgba(245,158,11,0.3);
display: flex; align-items: center; justify-content: center;
font-size: 16px;
}
.report-meta {
font-size: 12px; color: var(--muted);
text-align: right; line-height: 1.6;
}

/* Task tabs */
.task-tabs {
display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
}
.task-tab {
padding: 6px 16px; border-radius: 999px;
border: 1px solid var(--border);
background: transparent; color: var(--muted);
font-size: 13px; font-weight: 500; cursor: pointer;
transition: all 150ms ease;
}
.task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
.task-tab.active {
background: var(--accent-soft);
border-color: rgba(245,158,11,0.4);
color: var(--accent);
}

/* Task sections */
.task-section { display: none; }
.task-section.active { display: block; }
.task-name {
font-size: 18px; font-weight: 600;
margin-bottom: 12px; letter-spacing: -0.01em;
}

/* Results table */
.results-table {
width: 100%; border-collapse: collapse;
font-size: 13px; margin-bottom: 16px;
border-radius: var(--radius); overflow: hidden;
border: 1px solid var(--border);
}
.results-table th, .results-table td {
padding: 10px 14px;
text-align: left;
border-bottom: 1px solid var(--border);
}
.results-table th {
background: rgba(0,0,0,0.3);
font-size: 11px; font-weight: 600;
text-transform: uppercase; letter-spacing: 0.05em;
color: var(--muted); cursor: pointer;
user-select: none; white-space: nowrap;
}
.results-table th:hover { color: var(--text); }
.results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
.results-table tbody tr {
background: var(--panel);
transition: background 120ms ease;
}
.results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
.results-table tbody tr:last-child td { border-bottom: none; }

/* Score cell with progress bar */
.score-cell { position: relative; min-width: 90px; }
.score-bar {
position: absolute; left: 0; bottom: 0;
height: 3px; border-radius: 2px;
transition: width 300ms ease;
}
.score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }

/* Color ranking */
.rank-best { color: var(--green); font-weight: 600; }
.rank-worst { color: var(--red); }
.rank-mid { color: var(--yellow); }
.rank-neutral { color: var(--text); }
.rank-error { color: var(--muted); }

/* Winner banner */
.task-winner {
display: flex; align-items: center; gap: 10px;
padding: 12px 18px; margin-bottom: 20px;
border-radius: var(--radius);
background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
border: 1px solid rgba(34,197,94,0.2);
font-size: 14px; font-weight: 500;
}
.task-winner .trophy { font-size: 20px; }
.task-winner .winner-name { color: var(--green); font-weight: 600; }
.task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }

/* Summary cards */
.summary-section { margin-top: 32px; }
.summary-title {
font-size: 16px; font-weight: 600;
margin-bottom: 12px; color: var(--text);
}
.summary-cards {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 12px;
}
.summary-card {
padding: 16px; border-radius: var(--radius);
border: 1px solid var(--border);
background: var(--panel);
}
.summary-card .card-label {
font-size: 11px; font-weight: 600;
text-transform: uppercase; letter-spacing: 0.05em;
color: var(--muted); margin-bottom: 6px;
}
.summary-card .card-value {
font-size: 20px; font-weight: 700;
color: var(--green); font-family: var(--mono);
}
.summary-card .card-provider {
font-size: 12px; color: var(--muted); margin-top: 4px;
}

/* Errors */
.errors-section { margin-top: 24px; }
.errors-title {
font-size: 16px; font-weight: 600;
margin-bottom: 8px; color: var(--red);
cursor: pointer;
}
.errors-list {
border-radius: var(--radius);
border: 1px solid rgba(239,68,68,0.2);
background: rgba(239,68,68,0.04);
overflow: hidden;
}
.error-item {
padding: 10px 16px;
border-bottom: 1px solid rgba(239,68,68,0.1);
font-size: 13px;
}
.error-item:last-child { border-bottom: none; }
.error-provider { font-weight: 600; color: var(--text); }
.error-msg { color: var(--muted); margin-left: 8px; }
.error-count { color: var(--muted); font-size: 11px; }
.error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }

/* Footer */
.report-footer {
margin-top: 40px; padding: 20px 0;
border-top: 1px solid var(--border);
display: flex; justify-content: space-between; align-items: center;
flex-wrap: wrap; gap: 12px;
}
.footer-brand {
font-size: 13px; color: var(--muted);
}
.footer-brand a {
color: var(--accent); text-decoration: none; font-weight: 500;
}
.footer-brand a:hover { text-decoration: underline; }
.footer-cta {
display: inline-flex; align-items: center; gap: 6px;
padding: 6px 14px; border-radius: 8px;
background: var(--accent-soft);
border: 1px solid rgba(245,158,11,0.3);
color: var(--accent); font-size: 12px; font-weight: 500;
text-decoration: none;
transition: transform 120ms ease, box-shadow 120ms ease;
}
.footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }

/* Empty state */
.empty-msg {
text-align: center; color: var(--muted);
padding: 60px 20px; font-size: 16px;
}

/* Responsive */
@media (max-width: 640px) {
body { padding: 12px; }
.report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
.report-meta { text-align: left; }
.summary-cards { grid-template-columns: 1fr; }
.results-table { font-size: 12px; }
.results-table th, .results-table td { padding: 8px 10px; }
.report-footer { flex-direction: column; align-items: flex-start; }
}
</style>`;
  return stylesheet;
}
|
|
3003
|
+
/**
 * Render the report header: brand link on the left, run metadata on the right.
 *
 * @param {string} runsLabel Text describing the number of runs (HTML-escaped here).
 * @param {number} providerCount Number of providers; pluralised as "provider(s)".
 * @param {number} taskCount Number of tasks; pluralised as "task(s)".
 * @returns {string} `<header class="report-header">…</header>` markup that
 *   also includes the generation time as `YYYY-MM-DD HH:MM:SS UTC`.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  const pluralSuffix = (count) => (count !== 1 ? "s" : "");
  // ISO 8601 "…T…" trimmed to seconds, with the "T" swapped for a space.
  const isoNow = (/* @__PURE__ */ new Date()).toISOString();
  const now = `${isoNow.replace("T", " ").slice(0, 19)} UTC`;
  const safeRuns = esc(runsLabel);
  const safeNow = esc(now);
  return `<header class="report-header">
<a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">
<div class="brand-icon">⬡</div>
<span>Agent Duelist</span>
</a>
<div class="report-meta">
${providerCount} provider${pluralSuffix(providerCount)} ·
${taskCount} task${pluralSuffix(taskCount)} ·
${safeRuns}<br>
${safeNow}
</div>
</header>`;
}
|
|
3018
|
+
/**
 * Render the task switcher: one pill button per task, the first one active.
 *
 * @param {string[]} tasks Task names, in display order (HTML-escaped here).
 * @returns {string} `<nav class="task-tabs">…</nav>` markup; each button
 *   carries its zero-based position in `data-task`.
 */
function renderTabs(tasks) {
  const buttonHtml = [];
  tasks.forEach((taskName, position) => {
    const activeClass = position === 0 ? " active" : "";
    buttonHtml.push(
      `<button class="task-tab${activeClass}" data-task="${position}">${esc(taskName)}</button>`
    );
  });
  const buttons = buttonHtml.join("\n ");
  return `<nav class="task-tabs">
${buttons}
</nav>`;
}
|
|
3026
|
+
/**
 * Render one task's section: heading, sortable results table and, when a
 * winner is known, the winner banner.
 *
 * Columns are "Provider" followed by one column per scorer name, except that
 * "cost" expands to two columns (Cost and Tokens). "latency", "cost" and
 * "tokens" are metric columns; every other scorer is a score column.
 *
 * @param {string} task Task name shown as the section heading.
 * @param {Array<object>} providerData Per-provider rows; each has `providerId`
 *   and `allErrors`, and is passed through to `renderDataCell`.
 * @param {Map<string, object>} columnStats Per-column statistics, forwarded to
 *   `renderDataCell`.
 * @param {Map<string, string>} medals Medal kind per provider id ("none" when absent).
 * @param {string|undefined} winnerId Provider id of the task winner, if any.
 * @param {Iterable<string>} scorerNames Scorer names in column order.
 * @param {boolean} _hasCost Unused.
 * @param {boolean} multi Whether several providers are being compared.
 * @param {number} index Zero-based task position; section 0 starts active.
 * @returns {string} `<section class="task-section">…</section>` markup.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  const metricColumns = [...scorerNames].flatMap((name) => {
    switch (name) {
      case "latency":
        return [{ label: "Latency", key: "latency", isScore: false }];
      case "cost":
        return [
          { label: "Cost", key: "cost", isScore: false },
          { label: "Tokens", key: "tokens", isScore: false }
        ];
      default:
        return [{ label: scorerLabel(name), key: name, isScore: true }];
    }
  });
  const allColumns = [{ label: "Provider", key: "provider", isScore: false }, ...metricColumns];

  const ths = allColumns
    .map(({ key, label }) => `<th data-col="${esc(key)}">${esc(label)}<span class="sort-arrow"></span></th>`)
    .join("");

  const rows = providerData
    .map((entry) => {
      const medal = medalEmoji(medals.get(entry.providerId) ?? "none");
      const medalPrefix = medal ? `${medal} ` : "";
      const nameCell = `<td>${medalPrefix}${esc(entry.providerId)}</td>`;
      // A provider whose runs all failed gets a dash in every metric column.
      const dataCells = metricColumns.map(
        (col) => entry.allErrors ? `<td class="rank-error">—</td>` : renderDataCell(col.key, col.isScore, entry, columnStats, multi)
      );
      return `<tr>${nameCell}${dataCells.join("")}</tr>`;
    })
    .join("\n");

  let winnerHtml = "";
  if (winnerId) {
    winnerHtml = `<div class="task-winner">
<span class="trophy">🏆</span>
<span>Winner: <span class="winner-name">${esc(winnerId)}</span>
<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>
</div>`;
  }

  const activeClass = index === 0 ? " active" : "";
  return `<section class="task-section${activeClass}" data-task-idx="${index}">
<h2 class="task-name">${esc(task)}</h2>
<table class="results-table">
<thead><tr>${ths}</tr></thead>
<tbody>${rows}</tbody>
</table>
${winnerHtml}
</section>`;
}
|
|
3073
|
+
/**
 * Render a single `<td>` for one provider row and one column.
 *
 * - `latency`: `pd.latencyMs`, shown rounded with an "ms" suffix.
 * - `cost`: `pd.avgDetails.costUsd`, shown via `formatCost`.
 * - `tokens`: `pd.avgDetails.totalTokens`, shown as-is.
 * - anything else: `pd.avgScores[key]`, shown as a percentage with a bar.
 *
 * A missing value renders as a muted dash. Cells with a value carry the raw
 * number in `data-sort-val`.
 *
 * Rank colouring: when `multi` is true and stats exist for the column, the
 * class comes from `rankClass_`. Otherwise metric cells are neutral and score
 * cells use absolute bands (>= 0.8 best, >= 0.5 mid, else worst).
 *
 * @param {string} key Column key.
 * @param {boolean} _isScore Unused; the column kind is derived from `key`.
 * @param {object} pd Provider row data (`latencyMs`, `avgDetails`, `avgScores`).
 * @param {Map<string, object>} columnStats Per-column stats, looked up by `key`.
 * @param {boolean} multi Whether several providers are being compared.
 * @returns {string} `<td>` markup.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const colStats = columnStats.get(key);
  const emptyCell = `<td class="rank-error">—</td>`;
  const relativeRank = (value) => (multi && colStats ? rankClass_(value, colStats) : "rank-neutral");

  if (key === "latency") {
    const ms = pd.latencyMs;
    if (ms === void 0) return emptyCell;
    return `<td class="${relativeRank(ms)}" data-sort-val="${ms}">${Math.round(ms)}ms</td>`;
  }
  if (key === "cost") {
    const cost = pd.avgDetails.costUsd;
    if (cost === void 0) return emptyCell;
    return `<td class="${relativeRank(cost)}" data-sort-val="${cost}">${esc(formatCost(cost))}</td>`;
  }
  if (key === "tokens") {
    const tokens = pd.avgDetails.totalTokens;
    if (tokens === void 0) return emptyCell;
    return `<td class="${relativeRank(tokens)}" data-sort-val="${tokens}">${tokens}</td>`;
  }

  const val = pd.avgScores[key];
  if (val === void 0) return emptyCell;
  const pct = Math.round(val * 100);
  let rankCls;
  if (multi && colStats) {
    rankCls = rankClass_(val, colStats);
  } else {
    rankCls = val >= 0.8 ? "rank-best" : val >= 0.5 ? "rank-mid" : "rank-worst";
  }
  const barColor = val >= 0.8 ? "var(--green)" : val >= 0.5 ? "var(--yellow)" : "var(--red)";
  // The label shows the true percentage, but the bar width must be a valid
  // CSS percentage inside the cell: clamp to 0-100 and treat NaN/Infinity as 0.
  const barWidth = Number.isFinite(pct) ? Math.min(100, Math.max(0, pct)) : 0;
  return `<td class="score-cell ${rankCls}" data-sort-val="${val}">
<span class="score-val">${pct}%</span>
<div class="score-bar" style="width:${barWidth}%;background:${barColor}"></div>
</td>`;
}
|
|
3108
|
+
/**
 * Map a value to a rank CSS class relative to a column's best/worst values.
 *
 * @param {number} value The cell's value.
 * @param {{best?: number, worst?: number}} colStats Column extremes.
 * @returns {string} "rank-neutral" when either extreme is missing or both are
 *   equal; otherwise "rank-best", "rank-worst" or "rank-mid".
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  const hasSpread = best !== void 0 && worst !== void 0 && best !== worst;
  if (!hasSpread) return "rank-neutral";
  switch (value) {
    case best:
      return "rank-best";
    case worst:
      return "rank-worst";
    default:
      return "rank-mid";
  }
}
|
|
3115
|
+
/**
 * Render the "Summary" section: up to four cards (correctness, latency, cost,
 * overall winner).
 *
 * With several providers (`multi`), the first three cards are titled
 * "Most Correct" / "Fastest" / "Cheapest" and name the leading provider; with
 * a single provider they are titled "Avg …" and omit the provider line.
 *
 * @param {{id: string, avg: number}|undefined} byCorrectness Correctness leader.
 * @param {{id: string, avg: number}|undefined} byLatency Latency leader; skipped
 *   when its average is `Infinity`.
 * @param {{id: string, avg?: number}|undefined} byCost Cost leader; skipped when
 *   absent or without an average.
 * @param {string|undefined} overallWinner Provider id of the overall winner.
 * @param {boolean} multi Whether several providers are being compared.
 * @returns {string} Section markup, or "" when no card applies.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  const providerLine = (id) => `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  const leaderLine = (id) => (multi ? providerLine(id) : "");
  const buildCard = (label, value, footer) => `<div class="summary-card">
<div class="card-label">${label}</div>
<div class="card-value">${value}</div>
${footer}
</div>`;

  const cards = [];
  if (byCorrectness) {
    const pct = `${Math.round(byCorrectness.avg * 100)}%`;
    cards.push(buildCard(multi ? "Most Correct" : "Avg Correctness", pct, leaderLine(byCorrectness.id)));
  }
  if (byLatency && byLatency.avg !== Infinity) {
    const ms = `${Math.round(byLatency.avg)}ms`;
    cards.push(buildCard(multi ? "Fastest" : "Avg Latency", ms, leaderLine(byLatency.id)));
  }
  if (byCost?.avg !== void 0) {
    const cost = esc(formatCost(byCost.avg));
    cards.push(buildCard(multi ? "Cheapest" : "Avg Cost", cost, leaderLine(byCost.id)));
  }
  if (overallWinner) {
    // The winner card always names the provider, regardless of `multi`.
    cards.push(buildCard("Overall Winner", "🏆", providerLine(overallWinner)));
  }
  if (cards.length === 0) return "";

  return `<section class="summary-section">
<h2 class="summary-title">Summary</h2>
<div class="summary-cards">
${cards.join("\n ")}
</div>
</section>`;
}
|
|
3159
|
+
/**
 * Render the collapsible "Errors" section.
 *
 * Each entry shows the provider id, the error message, a "(×N)" suffix when
 * the same error occurred more than once, and an optional hint line.
 * Clicking the section title toggles the list between hidden and visible.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   De-duplicated error entries.
 * @returns {string} `<section class="errors-section">…</section>` markup.
 */
function renderErrors(errors) {
  const items = errors.map((e) => {
    const suffix = e.count > 1 ? ` <span class="error-count">(×${e.count})</span>` : "";
    const hint = e.hint ? `<div class="error-hint">${esc(e.hint)}</div>` : "";
    return `<div class="error-item">
<span class="error-provider">${esc(e.providerId)}:</span>
<span class="error-msg">${esc(e.error)}</span>${suffix}
${hint}
</div>`;
  }).join("\n");
  // Toggle: a hidden list ('none') becomes 'block', anything else becomes
  // 'none'. Both branches used to yield 'block', so the list could never be
  // collapsed.
  return `<section class="errors-section">
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
<div class="errors-list">
${items}
</div>
</section>`;
}
|
|
3176
|
+
/**
 * Render the report footer: a "Powered by Agent Duelist" credit and a
 * "Star on GitHub" call-to-action, both linking to the project repository.
 *
 * @returns {string} `<footer class="report-footer">…</footer>` markup.
 */
function renderFooter() {
  const repoUrl = "https://github.com/DataGobes/agent-duelist";
  const linkAttrs = `href="${repoUrl}" target="_blank" rel="noopener"`;
  return `<footer class="report-footer">
<div class="footer-brand">
Powered by <a ${linkAttrs}>Agent Duelist</a>
</div>
<a class="footer-cta" ${linkAttrs}>
⭐ Star on GitHub
</a>
</footer>`;
}
|
|
3186
|
+
/**
 * Return the report's inline `<script>`: task-tab switching (only when there
 * is more than one task) and click-to-sort for every results table.
 *
 * The emitted code is plain ES5 (`var`, function expressions).
 *
 * Sorting compares the `data-sort-val` attributes numerically when both cells
 * have one and falls back to a locale-aware text comparison otherwise. Each
 * click on a header flips that header's sort direction.
 *
 * @param {number} taskCount Number of task sections in the report.
 * @returns {string} `<script>…</script>` markup.
 */
function renderScript(taskCount) {
  const tabScript = taskCount > 1 ? `
  var tabs = document.querySelectorAll('.task-tab');
  var sections = document.querySelectorAll('.task-section');
  tabs.forEach(function(tab) {
    tab.addEventListener('click', function() {
      var idx = parseInt(tab.getAttribute('data-task'), 10);
      tabs.forEach(function(t) { t.classList.remove('active'); });
      sections.forEach(function(s) { s.classList.remove('active'); });
      tab.classList.add('active');
      sections[idx].classList.add('active');
    });
  });` : "";
  return `<script>
(function() {
  /* Tab switching */
  ${tabScript}

  /* Column sorting */
  document.querySelectorAll('.results-table th').forEach(function(th) {
    var table = th.closest('table');
    var asc = true;
    th.addEventListener('click', function() {
      /* Column position inside this table's header row. The index of the th
         among all headers in the document is wrong for every table after the
         first one. */
      var colIdx = th.cellIndex;
      var tbody = table.querySelector('tbody');
      var rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort(function(a, b) {
        var aCell = a.children[colIdx];
        var bCell = b.children[colIdx];
        var aVal = aCell.getAttribute('data-sort-val');
        var bVal = bCell.getAttribute('data-sort-val');
        if (aVal !== null && bVal !== null) {
          return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
        }
        var aText = aCell.textContent || '';
        var bText = bCell.textContent || '';
        return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
      });
      rows.forEach(function(row) { tbody.appendChild(row); });

      /* Update sort arrows */
      table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
      th.querySelector('.sort-arrow').textContent = asc ? ' \\u25B2' : ' \\u25BC';
      asc = !asc;
    });
  });
})();
</script>`;
}
|
|
3233
|
+
|
|
3234
|
+
// src/ci.ts
|
|
3235
|
+
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
3236
|
+
import { dirname } from "path";
|
|
3237
|
+
// Scorers for which a smaller value is the better outcome.
var LOWER_IS_BETTER = /* @__PURE__ */ new Set(["cost"]);
// A result is flagged flaky when its coefficient of variation exceeds this.
var FLAKY_CV_THRESHOLD = 0.3;
// Two-sided 95% critical values of Student's t, keyed by degrees of freedom.
var T_CRITICAL_95 = {
  1: 12.706,
  2: 4.303,
  3: 3.182,
  4: 2.776,
  5: 2.571,
  6: 2.447,
  7: 2.365,
  8: 2.306,
  9: 2.262,
  10: 2.228,
  15: 2.131,
  20: 2.086,
  25: 2.06,
  30: 2.042
};
// Tabulated degrees of freedom, ascending.
var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);

/**
 * Look up the 95% t critical value for the given degrees of freedom.
 *
 * Tabulated values are returned directly; values between two table entries
 * are linearly interpolated. 1.96 is returned for `df <= 0`, for `df` above
 * the largest table entry, and for anything else that falls outside the table.
 *
 * @param {number} df Degrees of freedom.
 * @returns {number} Critical value.
 */
function tCritical(df) {
  const fallback = 1.96;
  if (df <= 0) return fallback;
  const tabulated = T_CRITICAL_95[df];
  if (tabulated !== void 0) return tabulated;
  const dfs = T_CRITICAL_KEYS;
  if (df > dfs[dfs.length - 1]) return fallback;
  // Index of the first table entry that, together with its predecessor,
  // strictly brackets df.
  const upperIdx = dfs.findIndex((upper, i) => i > 0 && dfs[i - 1] < df && df < upper);
  if (upperIdx === -1) return fallback;
  const low = dfs[upperIdx - 1];
  const high = dfs[upperIdx];
  const ratio = (df - low) / (high - low);
  return T_CRITICAL_95[low] + ratio * (T_CRITICAL_95[high] - T_CRITICAL_95[low]);
}
|
|
3270
|
+
/**
 * Summarise a list of score samples.
 *
 * @param {number[]} samples Score values for one provider/task/scorer group.
 * @returns {{mean: number, stddev: number, cv: number, n: number, ci95Lower: number, ci95Upper: number}}
 *   Mean, sample standard deviation (n - 1 denominator), coefficient of
 *   variation (0 when the mean is 0) and a 95% confidence interval of the mean
 *   built from `tCritical(n - 1)`. An empty list yields all zeros; a single
 *   sample yields zero spread and an interval collapsed onto the value.
 */
function computeScorerStats(samples) {
  const n = samples.length;
  if (n === 0) {
    return { mean: 0, stddev: 0, cv: 0, n: 0, ci95Lower: 0, ci95Upper: 0 };
  }

  let total = 0;
  for (const sample of samples) total += sample;
  const mean = total / n;
  if (n === 1) {
    return { mean, stddev: 0, cv: 0, n: 1, ci95Lower: mean, ci95Upper: mean };
  }

  let squaredDeviations = 0;
  for (const sample of samples) squaredDeviations += (sample - mean) ** 2;
  const stddev = Math.sqrt(squaredDeviations / (n - 1));
  const cv = mean !== 0 ? stddev / Math.abs(mean) : 0;
  // Half-width of the interval: t critical value times the standard error.
  const standardError = stddev / Math.sqrt(n);
  const t = tCritical(n - 1);
  return {
    mean,
    stddev,
    cv,
    n,
    ci95Lower: mean - t * standardError,
    ci95Upper: mean + t * standardError
  };
}
|
|
3293
|
+
/**
 * Build the map key identifying one provider/task/scorer group.
 *
 * @param {string} providerId
 * @param {string} taskName
 * @param {string} scorerName
 * @returns {string} The three parts joined with "::".
 */
function groupKey(providerId, taskName, scorerName) {
  const separator = "::";
  return `${providerId}${separator}${taskName}${separator}${scorerName}`;
}
|
|
3296
|
+
/**
 * Compute per-group statistics for a set of benchmark results.
 *
 * Results with an error are ignored, as are individual scores with a negative
 * value. The remaining score values are grouped by
 * `groupKey(providerId, taskName, scoreName)` and each group is summarised
 * with `computeScorerStats`.
 *
 * @param {Array<object>} results Benchmark results (`providerId`, `taskName`,
 *   `scores`, optional `error`).
 * @returns {Map<string, object>} Stats per group key, in first-seen order.
 */
function computeStats(results) {
  const samplesByKey = /* @__PURE__ */ new Map();
  for (const result of results) {
    if (result.error) continue;
    for (const { name, value } of result.scores) {
      if (value < 0) continue;
      const key = groupKey(result.providerId, result.taskName, name);
      const bucket = samplesByKey.get(key);
      if (bucket) {
        bucket.push(value);
      } else {
        samplesByKey.set(key, [value]);
      }
    }
  }
  return new Map(
    Array.from(samplesByKey, ([key, samples]) => [key, computeScorerStats(samples)])
  );
}
|
|
3313
|
+
/**
 * Total the estimated USD cost of a run, overall and per provider.
 *
 * A result contributes only when it has no error, has a "cost" score with a
 * non-negative value, and that score's `details.estimatedUsd` is greater
 * than zero.
 *
 * @param {Array<object>} results Benchmark results.
 * @param {number|undefined} budget Optional spending limit in USD.
 * @returns {{totalUsd: number, perProvider: Map<string, number>, budget: (number|undefined), overBudget: boolean}}
 *   `overBudget` is true only when a budget is given and the total exceeds it.
 */
function computeCostSummary(results, budget) {
  const perProvider = /* @__PURE__ */ new Map();
  let totalUsd = 0;
  results.forEach((result) => {
    if (result.error) return;
    const costScore = result.scores.find(({ name }) => name === "cost");
    if (!costScore || costScore.value < 0) return;
    const usd = costScore.details?.estimatedUsd ?? 0;
    if (usd <= 0) return;
    totalUsd += usd;
    const spentSoFar = perProvider.get(result.providerId) ?? 0;
    perProvider.set(result.providerId, spentSoFar + usd);
  });
  const overBudget = budget !== void 0 && totalUsd > budget;
  return { totalUsd, perProvider, budget, overBudget };
}
|
|
3333
|
+
/**
 * Compare the current run's statistics against a baseline and decide whether
 * the run fails.
 *
 * For every group in `currentStats` a comparison entry is produced. When a
 * baseline exists for the same key, `delta` is the change in mean, and, if a
 * threshold is configured for the scorer, `regressed` / `improved` are
 * evaluated with `detectRegression` / `detectImprovement`. A group is `flaky`
 * when it has more than one sample and its coefficient of variation exceeds
 * `FLAKY_CV_THRESHOLD`.
 *
 * The run fails when any group regressed or the total cost exceeds `budget`.
 *
 * @param {Map<string, object>|null|undefined} baselineStats Baseline stats by group key.
 * @param {Map<string, object>} currentStats Current stats by group key.
 * @param {Map<string, number>} thresholds Allowed change per scorer name.
 * @param {number|undefined} budget Optional spending limit in USD.
 * @param {Array<object>|undefined} currentResults Raw results used for the cost summary.
 * @returns {{comparisons: object[], cost: object, failed: boolean, flakyResults: object[], failureReasons: string[]}}
 */
function compareResults(baselineStats, currentStats, thresholds, budget, currentResults) {
  const comparisons = [];
  for (const [key, current] of currentStats) {
    const [providerId, taskName, scorerName] = splitGroupKey(key);
    const baseline = baselineStats?.get(key) ?? null;
    let delta = null;
    let regressed = false;
    let improved = false;
    if (baseline) {
      delta = current.mean - baseline.mean;
      const threshold = thresholds.get(scorerName);
      if (threshold !== void 0) {
        const lowerIsBetter = LOWER_IS_BETTER.has(scorerName);
        regressed = detectRegression(baseline, current, threshold, lowerIsBetter);
        improved = detectImprovement(baseline, current, threshold, lowerIsBetter);
      }
    }
    const flaky = current.n > 1 && current.cv > FLAKY_CV_THRESHOLD;
    comparisons.push({
      providerId,
      taskName,
      scorerName,
      baseline,
      current,
      delta,
      regressed,
      improved,
      flaky
    });
  }

  const cost = computeCostSummary(currentResults ?? [], budget);
  const failureReasons = comparisons.filter((c) => c.regressed).map(
    (r) => `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta(r.delta)}`
  );
  if (cost.overBudget) {
    failureReasons.push(
      `Total cost $${cost.totalUsd.toFixed(4)} exceeds budget $${cost.budget.toFixed(2)}`
    );
  }
  const flakyResults = comparisons.filter((c) => c.flaky);
  const failed = failureReasons.length > 0;
  return { comparisons, cost, failed, flakyResults, failureReasons };

  // Split a "provider::task::scorer" key into its three parts. The provider
  // is everything before the first "::" and the scorer everything after the
  // last one, so a task name that itself contains "::" stays intact instead
  // of shifting the scorer name. Keys with fewer than two separators fall
  // back to a plain split.
  function splitGroupKey(groupKeyText) {
    const firstSep = groupKeyText.indexOf("::");
    const lastSep = groupKeyText.lastIndexOf("::");
    if (firstSep === -1 || firstSep === lastSep) return groupKeyText.split("::");
    return [
      groupKeyText.slice(0, firstSep),
      groupKeyText.slice(firstSep + 2, lastSep),
      groupKeyText.slice(lastSep + 2)
    ];
  }
}
|
|
3382
|
+
/**
 * Decide whether `current` is a regression relative to `baseline`.
 *
 * When both sides are single runs (n === 1) the raw mean delta is compared
 * with the threshold. Otherwise a regression is reported only when the 95%
 * confidence intervals are separated, in the "worse" direction, by more than
 * the threshold.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold - Minimum gap that counts as a regression.
 * @param {boolean} lowerIsBetter - True for metrics where smaller values are better.
 * @returns {boolean} True when the metric regressed.
 */
function detectRegression(baseline, current, threshold, lowerIsBetter) {
  if (baseline.n === 1 && current.n === 1) {
    const delta = current.mean - baseline.mean;
    return lowerIsBetter ? delta > threshold : delta < -threshold;
  }
  if (lowerIsBetter) {
    // Worse means higher: current's lowest plausible value must clear
    // baseline's highest plausible value by more than the threshold.
    return current.ci95Lower - baseline.ci95Upper > threshold;
  }
  // Worse means lower: baseline's lowest plausible value must clear current's
  // highest plausible value by more than the threshold. Measuring from
  // baseline.ci95Upper to current.ci95Lower instead would take the widest
  // distance between the intervals and flag noisy, overlapping runs as
  // regressions on any small dip of the mean.
  return baseline.ci95Lower - current.ci95Upper > threshold;
}
|
|
3393
|
+
/**
 * Decide whether `current` is an improvement over `baseline`.
 *
 * Single-run pairs (both n === 1) compare the raw mean delta with the
 * threshold. Otherwise the lower CI bound of the side expected to hold the
 * larger values must exceed the upper CI bound of the other side by more
 * than the threshold.
 *
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} baseline
 * @param {{n: number, mean: number, ci95Lower: number, ci95Upper: number}} current
 * @param {number} threshold - Minimum gap that counts as an improvement.
 * @param {boolean} lowerIsBetter - True for metrics where smaller values are better.
 * @returns {boolean} True when the metric improved.
 */
function detectImprovement(baseline, current, threshold, lowerIsBetter) {
  const bothSingleRun = baseline.n === 1 && current.n === 1;
  if (bothSingleRun) {
    const meanShift = current.mean - baseline.mean;
    return lowerIsBetter ? meanShift < -threshold : meanShift > threshold;
  }
  // For lower-is-better metrics an improvement puts baseline above current;
  // for higher-is-better metrics it puts current above baseline.
  const [upperSide, lowerSide] = lowerIsBetter ? [baseline, current] : [current, baseline];
  return upperSide.ci95Lower - lowerSide.ci95Upper > threshold;
}
|
|
3404
|
+
/**
 * Read a baseline file from disk.
 *
 * Accepts either an object with a `results` array (and optional `timestamp`)
 * or a bare JSON array of results.
 *
 * @param {string} path - Location of the baseline JSON file.
 * @returns {{timestamp: string, results: Array} | null} The baseline, or null
 *   when the file cannot be read, is not valid JSON, or holds no result array.
 */
function loadBaseline(path) {
  try {
    const parsed = JSON.parse(readFileSync(path, "utf-8"));
    // A bare array is treated as the results list itself.
    const results = parsed.results ?? parsed;
    return Array.isArray(results)
      ? { timestamp: parsed.timestamp ?? "unknown", results }
      : null;
  } catch {
    // Best-effort: a missing or malformed baseline is reported as "no baseline".
    return null;
  }
}
|
|
3418
|
+
/**
 * Write results to a baseline file, creating parent directories as needed.
 *
 * The file holds a JSON object with an ISO timestamp taken at write time and
 * the results, pretty-printed with two-space indentation.
 *
 * @param {string} path - Destination file path.
 * @param {*} results - Results to store; must be JSON-serializable.
 * @returns {void}
 */
function saveBaseline(path, results) {
  const parentDir = dirname(path);
  mkdirSync(parentDir, { recursive: true });
  const payload = JSON.stringify(
    { timestamp: (/* @__PURE__ */ new Date()).toISOString(), results },
    null,
    2
  );
  writeFileSync(path, payload);
}
|
|
3426
|
+
|
|
3427
|
+
// src/github.ts
|
|
3428
|
+
import { readFileSync as readFileSync2 } from "fs";
|
|
3429
|
+
/**
 * Build the GitHub pull-request context from environment variables.
 *
 * Reads GITHUB_TOKEN and GITHUB_REPOSITORY ("owner/repo"). The PR number is
 * taken from the event payload at GITHUB_EVENT_PATH (either `pull_request.number`
 * or, for an issue that carries a `pull_request` field, `issue.number`),
 * falling back to DUELIST_PR_NUMBER.
 *
 * @returns {{token: string, owner: string, repo: string, prNumber: number} | null}
 *   The context, or null when the token, repository, or PR number is missing.
 */
function detectGitHubContext() {
  const {
    GITHUB_TOKEN: token,
    GITHUB_REPOSITORY: repository,
    GITHUB_EVENT_PATH: eventPath,
    DUELIST_PR_NUMBER: prOverride
  } = process.env;
  if (!(token && repository)) return null;

  const [owner, repo] = repository.split("/");
  if (!(owner && repo)) return null;

  let prNumber;
  if (eventPath) {
    try {
      const { pull_request: pullRequest, issue } = JSON.parse(readFileSync2(eventPath, "utf-8"));
      if (pullRequest && typeof pullRequest === "object") {
        prNumber = pullRequest.number;
      }
      // An issue payload only counts when it has a `pull_request` field.
      const issueIsPr = issue && typeof issue === "object" && issue.pull_request;
      if (!prNumber && issueIsPr) {
        prNumber = issue.number;
      }
    } catch {
      // Unreadable or malformed event payload: fall through to the override.
    }
  }

  if (!prNumber && prOverride) {
    prNumber = parseInt(prOverride, 10);
  }
  // Also rejects NaN produced by a non-numeric override.
  return prNumber ? { token, owner, repo, prNumber } : null;
}
|
|
3459
|
+
// Base URL prefixed to every GitHub REST API request path built in this module.
var API_BASE = "https://api.github.com";
|
|
3460
|
+
/**
 * Build the request headers for a GitHub API call.
 *
 * @param {string} token - Token sent as a Bearer credential.
 * @param {Record<string, string>} [extra] - Additional headers; these override
 *   the defaults on key collision.
 * @returns {Record<string, string>} The merged header object.
 */
function ghHeaders(token, extra) {
  const defaults = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };
  return Object.assign(defaults, extra);
}
|
|
3468
|
+
/**
 * Look through the comments on the PR, page by page, for one whose body
 * contains `marker`.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} marker - Substring identifying the comment.
 * @returns {Promise<*>} The `id` of the first matching comment, or null when
 *   none matches or a page request returns a non-OK response.
 */
async function findExistingComment(ctx, marker) {
  const perPage = 50;
  for (let page = 1; ; page++) {
    const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
    const response = await fetch(url, { headers: ghHeaders(ctx.token) });
    if (!response.ok) return null;

    const pageComments = await response.json();
    if (pageComments.length === 0) return null;

    for (const entry of pageComments) {
      if (entry.body?.includes(marker)) return entry.id;
    }

    // A page shorter than the page size is the last one.
    if (pageComments.length < perPage) return null;
  }
}
|
|
3487
|
+
/**
 * Create the PR comment, or update it in place when a comment containing
 * `marker` already exists.
 *
 * A non-OK response is logged with console.warn and not thrown.
 *
 * @param {{token: string, owner: string, repo: string, prNumber: number}} ctx
 * @param {string} body - Comment body to publish.
 * @param {string} marker - Substring used to find a previously posted comment.
 * @returns {Promise<void>}
 */
async function upsertPrComment(ctx, body, marker) {
  const commentId = await findExistingComment(ctx, marker);
  const isUpdate = Boolean(commentId);

  const endpoint = isUpdate
    ? `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/comments/${commentId}`
    : `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;

  const response = await fetch(endpoint, {
    method: isUpdate ? "PATCH" : "POST",
    headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
    body: JSON.stringify({ body })
  });
  if (response.ok) return;

  const text = await response.text();
  console.warn(
    isUpdate
      ? `Failed to update PR comment: ${response.status} ${text}`
      : `Failed to create PR comment: ${response.status} ${text}`
  );
}
|
|
2151
3513
|
export {
|
|
2152
3514
|
anthropic,
|
|
2153
3515
|
azureOpenai,
|
|
3516
|
+
compareResults,
|
|
3517
|
+
computeStats,
|
|
2154
3518
|
consoleReporter,
|
|
2155
3519
|
defineArena,
|
|
3520
|
+
detectGitHubContext,
|
|
2156
3521
|
gemini,
|
|
3522
|
+
htmlReporter,
|
|
2157
3523
|
jsonReporter,
|
|
3524
|
+
loadBaseline,
|
|
3525
|
+
markdownReporter,
|
|
2158
3526
|
openai,
|
|
2159
3527
|
openaiCompatible,
|
|
2160
|
-
registerPricing
|
|
3528
|
+
registerPricing,
|
|
3529
|
+
saveBaseline,
|
|
3530
|
+
upsertPrComment
|
|
2161
3531
|
};
|
|
2162
3532
|
//# sourceMappingURL=index.js.map
|