agent-duelist 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +251 -133
- package/dist/cli.js +4945 -2351
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1405 -468
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +35 -9
- package/dist/index.d.ts +35 -9
- package/dist/index.js +1402 -468
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -38,8 +38,11 @@ __export(index_exports, {
|
|
|
38
38
|
defineArena: () => defineArena,
|
|
39
39
|
detectGitHubContext: () => detectGitHubContext,
|
|
40
40
|
gemini: () => gemini,
|
|
41
|
+
htmlReporter: () => htmlReporter,
|
|
41
42
|
jsonReporter: () => jsonReporter,
|
|
43
|
+
listPacks: () => listPacks,
|
|
42
44
|
loadBaseline: () => loadBaseline,
|
|
45
|
+
loadPack: () => loadPack,
|
|
43
46
|
markdownReporter: () => markdownReporter,
|
|
44
47
|
openai: () => openai,
|
|
45
48
|
openaiCompatible: () => openaiCompatible,
|
|
@@ -1400,33 +1403,42 @@ var correctnessScorer = ({ task, result }) => {
|
|
|
1400
1403
|
if (task.expected === void 0) {
|
|
1401
1404
|
return { name: "correctness", value: 0.5, details: { reason: "no expected value" } };
|
|
1402
1405
|
}
|
|
1403
|
-
const
|
|
1406
|
+
const actual = normalizeOutput(task.expected, result.output);
|
|
1407
|
+
const match = deepEqual(task.expected, actual);
|
|
1404
1408
|
return {
|
|
1405
1409
|
name: "correctness",
|
|
1406
1410
|
value: match ? 1 : 0,
|
|
1407
1411
|
details: { expected: task.expected, actual: result.output }
|
|
1408
1412
|
};
|
|
1409
1413
|
};
|
|
1410
|
-
function
|
|
1411
|
-
if (
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
if (
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1414
|
+
/**
 * Unwraps an object-wrapped array so it can be compared with an expected array.
 *
 * When `expected` is an array and `actual` is a non-null, non-array object that
 * has exactly one array-valued property, that property's value is returned.
 * In every other case `actual` is returned untouched.
 *
 * @param {*} expected - Reference value the output will be compared against.
 * @param {*} actual - Output to normalize.
 * @returns {*} The single wrapped array, or `actual` as given.
 */
function normalizeOutput(expected, actual) {
  const mayBeWrapper =
    Array.isArray(expected) &&
    !Array.isArray(actual) &&
    typeof actual === "object" &&
    actual !== null;
  if (!mayBeWrapper) return actual;
  const arrayValues = Object.values(actual).filter((value) => Array.isArray(value));
  // Only unwrap when the choice is unambiguous.
  return arrayValues.length === 1 ? arrayValues[0] : actual;
}
|
|
1424
|
+
/**
 * Lenient structural comparison used for scoring.
 *
 * - Strings are compared after trimming and lower-casing both sides.
 * - Arrays must have the same length and pairwise-equal elements.
 * - Plain objects match when every key of `expected` exists in `actual`
 *   with an equal value; extra keys on `actual` are ignored.
 * - An array never equals a non-array object (previously `[]` matched `{}`
 *   because both fell through to the key-subset branch).
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value under test.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Strict equality was ruled out above, so a null on either side is a mismatch.
  if (expected === null || actual === null) return false;
  const expectedIsArray = Array.isArray(expected);
  // typeof reports "object" for both arrays and plain objects; keep them apart.
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((val, i) => deepEqual(val, actual[i]));
  }
  if (typeof expected === "object") {
    return Object.keys(expected).every(
      (key) => key in actual && deepEqual(expected[key], actual[key])
    );
  }
  // Same primitive type but not strictly equal.
  return false;
}
|
|
1431
1443
|
|
|
1432
1444
|
// src/scorers/schema-correctness.ts
|
|
@@ -1446,7 +1458,14 @@ var schemaCorrectnessScorer = ({ task, result }) => {
|
|
|
1446
1458
|
};
|
|
1447
1459
|
}
|
|
1448
1460
|
}
|
|
1449
|
-
|
|
1461
|
+
let parsed = task.schema.safeParse(data);
|
|
1462
|
+
if (!parsed.success && !Array.isArray(data) && typeof data === "object" && data !== null) {
|
|
1463
|
+
const arrayEntries = Object.entries(data).filter(([, v]) => Array.isArray(v));
|
|
1464
|
+
if (arrayEntries.length === 1) {
|
|
1465
|
+
const unwrapped = task.schema.safeParse(arrayEntries[0][1]);
|
|
1466
|
+
if (unwrapped.success) parsed = unwrapped;
|
|
1467
|
+
}
|
|
1468
|
+
}
|
|
1450
1469
|
return {
|
|
1451
1470
|
name: "schema-correctness",
|
|
1452
1471
|
value: parsed.success ? 1 : 0,
|
|
@@ -1461,11 +1480,13 @@ var fuzzySimilarityScorer = ({ task, result }) => {
|
|
|
1461
1480
|
}
|
|
1462
1481
|
const a = stringify(task.expected);
|
|
1463
1482
|
const b = stringify(result.output);
|
|
1464
|
-
const
|
|
1483
|
+
const setA = tokenize(a);
|
|
1484
|
+
const setB = tokenize(b);
|
|
1485
|
+
const similarity = jaccardSimilarity(setA, setB);
|
|
1465
1486
|
return {
|
|
1466
1487
|
name: "fuzzy-similarity",
|
|
1467
1488
|
value: Math.round(similarity * 100) / 100,
|
|
1468
|
-
details: { method: "jaccard", expectedTokens:
|
|
1489
|
+
details: { method: "jaccard", expectedTokens: setA.size, actualTokens: setB.size }
|
|
1469
1490
|
};
|
|
1470
1491
|
};
|
|
1471
1492
|
function stringify(value) {
|
|
@@ -1490,7 +1511,38 @@ var import_openai2 = __toESM(require("openai"), 1);
|
|
|
1490
1511
|
|
|
1491
1512
|
// src/providers/openai.ts
|
|
1492
1513
|
var import_openai = __toESM(require("openai"), 1);
|
|
1514
|
+
var import_zod_to_json_schema2 = require("zod-to-json-schema");
|
|
1515
|
+
|
|
1516
|
+
// src/providers/shared.ts
|
|
1493
1517
|
var import_zod_to_json_schema = require("zod-to-json-schema");
|
|
1518
|
+
/**
 * Builds the system message that asks a model for JSON output.
 *
 * Without a schema the message is a single generic sentence. With a schema,
 * the schema is converted through `zodToJsonSchema` (target "openAi"),
 * pretty-printed, and embedded between instructions telling the model to
 * return data values rather than the schema definition.
 *
 * @param {*} schema - Schema handed to `zodToJsonSchema`; falsy for none.
 * @returns {string} The system prompt text.
 */
function buildSchemaSystemMessage(schema) {
  if (!schema) {
    return "Respond with valid JSON.";
  }
  const schemaText = JSON.stringify(
    (0, import_zod_to_json_schema.zodToJsonSchema)(schema, { target: "openAi" }),
    null,
    2
  );
  const promptLines = [
    "Respond with ONLY valid JSON data. No markdown, no code fences, no explanation.",
    "",
    "Your output must conform to this JSON Schema:",
    schemaText,
    "",
    "IMPORTANT: Output the actual data values, NOT the schema definition itself.",
    'Do NOT include keys like "type", "$schema", or "items" from the schema definition in your response.'
  ];
  return promptLines.join("\n");
}
|
|
1531
|
+
/**
 * Converts raw model text into structured output when a schema was requested.
 *
 * @param {string} rawContent - Text returned by the model.
 * @param {boolean} hasSchema - Whether the request carried a schema.
 * @returns {*} `rawContent` unchanged when no schema was requested or when the
 *   (fence-stripped) text is not valid JSON; otherwise the parsed JSON value.
 */
function parseSchemaOutput(rawContent, hasSchema) {
  if (!hasSchema) return rawContent;
  const candidate = stripCodeFences(rawContent);
  let output = rawContent;
  try {
    output = JSON.parse(candidate);
  } catch {
    // Best effort: invalid JSON falls back to the raw text.
  }
  return output;
}
|
|
1540
|
+
/**
 * Extracts the body of the first markdown code fence found in `content`.
 *
 * The opening fence may carry an optional `json` language tag, matched
 * case-insensitively so that "```JSON" is handled like "```json"
 * (previously the upper-case tag leaked into the extracted payload).
 *
 * @param {string} content - Raw model output.
 * @returns {string} The fenced body, or `content` unchanged when no fence matches.
 */
function stripCodeFences(content) {
  // `m` lets ^/$ anchor on the fence lines; `i` makes the language tag case-insensitive.
  const match = content.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?\s*```$/im);
  return match ? match[1] : content;
}
|
|
1544
|
+
|
|
1545
|
+
// src/providers/openai.ts
|
|
1494
1546
|
var REQUEST_TIMEOUT_MS = 6e4;
|
|
1495
1547
|
function openai(model, options) {
|
|
1496
1548
|
const client = new import_openai.default({
|
|
@@ -1537,7 +1589,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1537
1589
|
if (input.schema) {
|
|
1538
1590
|
params.response_format = { type: "json_object" };
|
|
1539
1591
|
params.messages = [
|
|
1540
|
-
{ role: "system", content:
|
|
1592
|
+
{ role: "system", content: buildSchemaSystemMessage(input.schema) },
|
|
1541
1593
|
...params.messages
|
|
1542
1594
|
];
|
|
1543
1595
|
}
|
|
@@ -1545,7 +1597,9 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1545
1597
|
params.tools = input.tools.map(toolDefToOpenAI);
|
|
1546
1598
|
params.tool_choice = "auto";
|
|
1547
1599
|
}
|
|
1548
|
-
const
|
|
1600
|
+
const reqOpts = { signal: input.signal };
|
|
1601
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
1602
|
+
const response = await client.chat.completions.create(params, reqOpts);
|
|
1549
1603
|
let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
|
|
1550
1604
|
let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
|
|
1551
1605
|
const choice = response.choices[0];
|
|
@@ -1579,7 +1633,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1579
1633
|
const followUp = await client.chat.completions.create({
|
|
1580
1634
|
model: requestModel,
|
|
1581
1635
|
messages: toolMessages
|
|
1582
|
-
},
|
|
1636
|
+
}, reqOpts);
|
|
1583
1637
|
totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
|
|
1584
1638
|
totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
|
|
1585
1639
|
finalResponse = followUp;
|
|
@@ -1590,13 +1644,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1590
1644
|
if (stripThinking) {
|
|
1591
1645
|
rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
|
|
1592
1646
|
}
|
|
1593
|
-
|
|
1594
|
-
if (input.schema) {
|
|
1595
|
-
try {
|
|
1596
|
-
output = JSON.parse(rawContent);
|
|
1597
|
-
} catch {
|
|
1598
|
-
}
|
|
1599
|
-
}
|
|
1647
|
+
const output = parseSchemaOutput(rawContent, !!input.schema);
|
|
1600
1648
|
return {
|
|
1601
1649
|
output,
|
|
1602
1650
|
usage: {
|
|
@@ -1610,13 +1658,27 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1610
1658
|
}
|
|
1611
1659
|
};
|
|
1612
1660
|
}
|
|
1661
|
+
/**
 * Creates a provider for a Google model through the OpenAI client, pointed at
 * the base URL below.
 *
 * @param {string} model - Model name; also used to form the `google/<model>` id.
 * @param {{ apiKey?: string, timeoutMs?: number }} [options] - Optional API key
 *   (falls back to the GOOGLE_API_KEY environment variable) and client timeout
 *   (falls back to REQUEST_TIMEOUT_MS).
 * @returns {*} Whatever `makeProvider` returns for this client.
 * @throws {Error} When no API key is available.
 */
function gemini(model, options) {
  const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
  if (apiKey) {
    const timeout = options?.timeoutMs ?? REQUEST_TIMEOUT_MS;
    const client = new import_openai.default({
      apiKey,
      baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
      timeout
    });
    return makeProvider(`google/${model}`, "Google AI", model, client, model);
  }
  throw new Error(
    `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
  );
}
|
|
1613
1675
|
/**
 * Converts a tool definition into the `{ type: "function", function: {...} }`
 * shape, with the tool's parameters run through `zodToJsonSchema`
 * (target "openAi").
 *
 * @param {{ name: string, description: string, parameters: * }} tool - Tool definition.
 * @returns {{ type: "function", function: { name: string, description: string, parameters: * } }}
 */
function toolDefToOpenAI(tool) {
  const { name, description } = tool;
  const parameters = (0, import_zod_to_json_schema2.zodToJsonSchema)(tool.parameters, { target: "openAi" });
  return {
    type: "function",
    function: { name, description, parameters }
  };
}
|
|
@@ -1665,8 +1727,14 @@ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1665
1727
|
if (!apiKey) return void 0;
|
|
1666
1728
|
return { client: new import_openai2.default({ apiKey, timeout: timeoutMs }), model };
|
|
1667
1729
|
}
|
|
1730
|
+
/**
 * Heuristically detects an error caused by sending a `temperature` parameter.
 *
 * The lower-cased error text must mention "temperature" together with one of
 * the rejection phrases listed below.
 *
 * @param {*} err - Caught value; non-Error values are stringified.
 * @returns {boolean} Whether the message looks like a temperature rejection.
 */
function isTemperatureError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  if (!text.includes("temperature")) return false;
  const rejectionPhrases = ["not supported", "is not allowed", "unsupported", "invalid"];
  return rejectionPhrases.some((phrase) => text.includes(phrase));
}
|
|
1668
1735
|
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1669
1736
|
let cached = void 0;
|
|
1737
|
+
let useTemperature = true;
|
|
1670
1738
|
return async ({ task, result }) => {
|
|
1671
1739
|
if (task.expected === void 0) {
|
|
1672
1740
|
return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
|
|
@@ -1683,36 +1751,24 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1683
1751
|
}
|
|
1684
1752
|
const { client, model } = cached;
|
|
1685
1753
|
const prompt = JUDGE_PROMPT.replace("{task}", task.prompt).replace("{expected}", JSON.stringify(task.expected)).replace("{actual}", JSON.stringify(result.output));
|
|
1754
|
+
const messages = [{ role: "user", content: prompt }];
|
|
1686
1755
|
try {
|
|
1687
|
-
const response = await client
|
|
1688
|
-
|
|
1689
|
-
messages: [{ role: "user", content: prompt }],
|
|
1690
|
-
temperature: 0,
|
|
1691
|
-
max_tokens: 2048
|
|
1692
|
-
});
|
|
1693
|
-
const content = response.choices[0]?.message?.content?.trim() ?? "";
|
|
1694
|
-
const parsed = {};
|
|
1695
|
-
for (const line of content.split("\n")) {
|
|
1696
|
-
const match = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
|
|
1697
|
-
if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
|
|
1698
|
-
}
|
|
1699
|
-
const accuracy = parsed.accuracy;
|
|
1700
|
-
const completeness = parsed.completeness;
|
|
1701
|
-
const conciseness = parsed.conciseness;
|
|
1702
|
-
if (accuracy == null || completeness == null || conciseness == null || [accuracy, completeness, conciseness].some((s) => isNaN(s) || s < 0 || s > 1)) {
|
|
1703
|
-
return {
|
|
1704
|
-
name: "llm-judge-correctness",
|
|
1705
|
-
value: -1,
|
|
1706
|
-
details: { reason: `judge returned unparseable scores: "${content}"`, model }
|
|
1707
|
-
};
|
|
1708
|
-
}
|
|
1709
|
-
const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
|
|
1710
|
-
return {
|
|
1711
|
-
name: "llm-judge-correctness",
|
|
1712
|
-
value: composite,
|
|
1713
|
-
details: { model, accuracy, completeness, conciseness }
|
|
1714
|
-
};
|
|
1756
|
+
const response = await callJudge(client, model, messages, useTemperature);
|
|
1757
|
+
return parseJudgeResponse(response, model);
|
|
1715
1758
|
} catch (err) {
|
|
1759
|
+
if (useTemperature && isTemperatureError(err)) {
|
|
1760
|
+
useTemperature = false;
|
|
1761
|
+
try {
|
|
1762
|
+
const response = await callJudge(client, model, messages, false);
|
|
1763
|
+
return parseJudgeResponse(response, model);
|
|
1764
|
+
} catch (retryErr) {
|
|
1765
|
+
return {
|
|
1766
|
+
name: "llm-judge-correctness",
|
|
1767
|
+
value: -1,
|
|
1768
|
+
details: { reason: `judge call failed: ${retryErr instanceof Error ? retryErr.message : String(retryErr)}` }
|
|
1769
|
+
};
|
|
1770
|
+
}
|
|
1771
|
+
}
|
|
1716
1772
|
return {
|
|
1717
1773
|
name: "llm-judge-correctness",
|
|
1718
1774
|
value: -1,
|
|
@@ -1721,6 +1777,38 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1721
1777
|
}
|
|
1722
1778
|
};
|
|
1723
1779
|
}
|
|
1780
|
+
/**
 * Sends the judge prompt through `client.chat.completions.create`.
 *
 * The request always carries `max_completion_tokens: 2048`; `temperature: 0`
 * is added only when `withTemperature` is truthy.
 *
 * @param {*} client - Client exposing `chat.completions.create`.
 * @param {string} model - Judge model name.
 * @param {Array<{ role: string, content: string }>} messages - Chat messages.
 * @param {boolean} withTemperature - Whether to pin the temperature to 0.
 * @returns {Promise<*>} The completion response.
 */
async function callJudge(client, model, messages, withTemperature) {
  const request = { model, messages, max_completion_tokens: 2048 };
  if (withTemperature) {
    request.temperature = 0;
  }
  return client.chat.completions.create(request);
}
|
|
1788
|
+
/**
 * Extracts accuracy / completeness / conciseness scores from a judge reply and
 * combines them into one composite score.
 *
 * Each score is read from a line of the form `name: number`. Leading
 * whitespace and common markdown decoration (list bullets, `>`, `#`, `*`, `_`)
 * around the name or the colon are tolerated, so replies such as
 * "- **Accuracy:** 0.9" parse instead of being rejected. If a name appears on
 * several lines, the last occurrence wins.
 *
 * @param {*} response - Chat completion response; the text is taken from
 *   `choices[0].message.content`. A missing `choices` array is treated as an
 *   empty reply rather than throwing.
 * @param {string} model - Judge model name, echoed in `details`.
 * @returns {{ name: string, value: number, details: object }} A score whose
 *   value is the mean of the three scores rounded to two decimals, or -1 with
 *   a reason when any score is missing, not a number, or outside [0, 1].
 */
function parseJudgeResponse(response, model) {
  const content = response?.choices?.[0]?.message?.content?.trim() ?? "";
  const scoreLine = /^[\s>*_#-]*(accuracy|completeness|conciseness)[\s*_]*:[\s*_]*([\d.]+)/i;
  const parsed = {};
  for (const line of content.split("\n")) {
    const match = line.match(scoreLine);
    if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
  }
  const { accuracy, completeness, conciseness } = parsed;
  const scores = [accuracy, completeness, conciseness];
  const unusable = scores.some((s) => s == null || Number.isNaN(s) || s < 0 || s > 1);
  if (unusable) {
    return {
      name: "llm-judge-correctness",
      value: -1,
      details: { reason: `judge returned unparseable scores: "${content}"`, model }
    };
  }
  const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
  return {
    name: "llm-judge-correctness",
    value: composite,
    details: { model, accuracy, completeness, conciseness }
  };
}
|
|
1724
1812
|
|
|
1725
1813
|
// src/scorers/tool-usage.ts
|
|
1726
1814
|
var toolUsageScorer = ({ task, result }) => {
|
|
@@ -1784,118 +1872,174 @@ async function runBenchmarks(options) {
|
|
|
1784
1872
|
const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
|
|
1785
1873
|
const results = [];
|
|
1786
1874
|
for (const task of tasks) {
|
|
1787
|
-
for (
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1807
|
-
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
|
|
1875
|
+
for (let run = 1; run <= runs; run++) {
|
|
1876
|
+
const runResults = await Promise.all(
|
|
1877
|
+
providers.map(async (provider) => {
|
|
1878
|
+
let result;
|
|
1879
|
+
try {
|
|
1880
|
+
const taskResult = await withTimeout((signal) => provider.run({
|
|
1881
|
+
prompt: task.prompt,
|
|
1882
|
+
schema: task.schema,
|
|
1883
|
+
tools: task.tools,
|
|
1884
|
+
signal,
|
|
1885
|
+
timeout
|
|
1886
|
+
}), timeout);
|
|
1887
|
+
const scores = await Promise.all(
|
|
1888
|
+
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
1889
|
+
);
|
|
1890
|
+
result = {
|
|
1891
|
+
providerId: provider.id,
|
|
1892
|
+
taskName: task.name,
|
|
1893
|
+
run,
|
|
1894
|
+
scores,
|
|
1895
|
+
raw: {
|
|
1896
|
+
output: taskResult.output,
|
|
1897
|
+
latencyMs: taskResult.latencyMs,
|
|
1898
|
+
usage: taskResult.usage,
|
|
1899
|
+
toolCalls: taskResult.toolCalls
|
|
1900
|
+
}
|
|
1901
|
+
};
|
|
1902
|
+
} catch (err) {
|
|
1903
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1904
|
+
result = {
|
|
1905
|
+
providerId: provider.id,
|
|
1906
|
+
taskName: task.name,
|
|
1907
|
+
run,
|
|
1908
|
+
scores: [],
|
|
1909
|
+
error: message,
|
|
1910
|
+
raw: { output: "", latencyMs: 0 }
|
|
1911
|
+
};
|
|
1912
|
+
}
|
|
1913
|
+
onResult?.(result);
|
|
1914
|
+
return result;
|
|
1915
|
+
})
|
|
1916
|
+
);
|
|
1917
|
+
results.push(...runResults);
|
|
1826
1918
|
}
|
|
1827
1919
|
}
|
|
1828
1920
|
return results;
|
|
1829
1921
|
}
|
|
1830
1922
|
|
|
1831
|
-
// src/
|
|
1832
|
-
var
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
return `${boldCode}${s}${reset}`;
|
|
1843
|
-
}
|
|
1844
|
-
function dim(s) {
|
|
1845
|
-
return `${dimCode}${s}${reset}`;
|
|
1923
|
+
// src/utils/format.ts
|
|
1924
|
+
// Upper bound accepted by Number.prototype.toFixed.
var MAX_FRACTION_DIGITS = 100;
/**
 * Formats an estimated USD cost for display.
 *
 * - `undefined`, `null`, and non-finite numbers render as an em dash.
 * - Exactly zero renders as "$0.00".
 * - Amounts of at least one cent render with two decimals ("~$1.23").
 * - Smaller amounts keep enough decimals to show the leading significant
 *   digits, with trailing zeros removed ("~$0.005").
 *
 * A decimal point left dangling after zero-trimming is removed as well, so
 * values that round to zero or negative whole amounts no longer render as
 * "~$0." / "~$-5.".
 *
 * @param {number|undefined|null} usd - Estimated cost in US dollars.
 * @returns {string} Human-readable cost.
 */
function formatCost(usd) {
  if (usd == null || !Number.isFinite(usd)) return "\u2014";
  if (usd === 0) return "$0.00";
  if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
  const digits = Math.min(
    MAX_FRACTION_DIGITS,
    Math.max(4, -Math.floor(Math.log10(Math.abs(usd))) + 1)
  );
  const trimmed = usd.toFixed(digits).replace(/0+$/, "").replace(/\.$/, "");
  return `~$${trimmed}`;
}
|
|
1847
|
-
function
|
|
1848
|
-
|
|
1935
|
+
/**
 * Formats a numeric difference with an explicit "+" for non-negative values.
 *
 * @param {number} delta - Difference to format.
 * @param {number} [precision=4] - Number of decimals passed to `toFixed`.
 * @returns {string} e.g. "+0.5000" or "-0.25".
 */
function formatDelta(delta, precision = 4) {
  const prefix = delta >= 0 ? "+" : "";
  const fixed = delta.toFixed(precision);
  return prefix + fixed;
}
|
|
1850
|
-
|
|
1851
|
-
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1939
|
+
|
|
1940
|
+
// src/reporter/shared.ts
|
|
1941
|
+
/**
 * Indexes a flat list of benchmark results for the reporters.
 *
 * @param {Array<{ taskName: string, providerId: string, run: number, scores: Array<{ name: string }>, error?: * }>} results
 * @returns {{
 *   tasks: string[], providers: string[], scorerNames: string[],
 *   grouped: Map<string, object[]>, byProvider: Map<string, object[]>,
 *   hasErrors: boolean, maxRun: number
 * }} Distinct task / provider / scorer names in first-seen order, results
 *   keyed by `"<taskName>::<providerId>"` and by provider id, whether any
 *   result has a truthy `error`, and the highest `run` value seen (0 if none).
 */
function groupResults(results) {
  const taskNames = new Set();
  const providerIds = new Set();
  const scorerNames = new Set();
  const grouped = new Map();
  const byProvider = new Map();
  let hasErrors = false;
  let maxRun = 0;

  // Appends `entry` to the bucket stored under `key`, creating it on first use.
  const appendTo = (index, key, entry) => {
    const bucket = index.get(key);
    if (bucket) {
      bucket.push(entry);
    } else {
      index.set(key, [entry]);
    }
  };

  for (const entry of results) {
    const { taskName, providerId } = entry;
    taskNames.add(taskName);
    providerIds.add(providerId);
    for (const score of entry.scores) {
      scorerNames.add(score.name);
    }
    if (entry.error) hasErrors = true;
    if (entry.run > maxRun) maxRun = entry.run;
    appendTo(grouped, `${taskName}::${providerId}`, entry);
    appendTo(byProvider, providerId, entry);
  }

  return {
    tasks: [...taskNames],
    providers: [...providerIds],
    scorerNames: [...scorerNames],
    grouped,
    byProvider,
    hasErrors,
    maxRun
  };
}
|
|
1874
|
-
function
|
|
1875
|
-
const
|
|
1876
|
-
|
|
1877
|
-
|
|
1979
|
+
/**
 * Summarizes every run of one provider on one task.
 *
 * Runs are looked up under `"<task>::<providerId>"`. Runs with a truthy
 * `error` are only counted; the averages come from the remaining runs.
 *
 * @param {string} providerId - Provider identifier.
 * @param {Map<string, object[]>} grouped - Results keyed by `"<task>::<providerId>"`.
 * @param {string} task - Task name.
 * @returns {{ providerId: string, avgScores: object, avgDetails: object,
 *   latencyMs: (number|undefined), allErrors: boolean, errorCount: number }}
 *   With no successful run the averages are empty/undefined and `allErrors`
 *   is true only if at least one run failed.
 */
function aggregateProviderTask(providerId, grouped, task) {
  const failed = [];
  const succeeded = [];
  for (const run of grouped.get(`${task}::${providerId}`) ?? []) {
    (run.error ? failed : succeeded).push(run);
  }
  const errorCount = failed.length;
  if (succeeded.length > 0) {
    return {
      providerId,
      avgScores: averageScores(succeeded),
      avgDetails: averageDetails(succeeded),
      latencyMs: average(succeeded.map((run) => run.raw.latencyMs)),
      allErrors: false,
      errorCount
    };
  }
  return {
    providerId,
    avgScores: {},
    avgDetails: { costUsd: void 0, totalTokens: void 0 },
    latencyMs: void 0,
    allErrors: errorCount > 0,
    errorCount
  };
}
|
|
2002
|
+
/**
 * Averages each scorer's value across results.
 *
 * Scores with a negative value are left out of both the sum and the count, so
 * a scorer that only produced negative values does not appear in the output.
 *
 * @param {Array<{ scores: Array<{ name: string, value: number }> }>} results
 * @returns {Object<string, number>} Mean value per scorer name.
 */
function averageScores(results) {
  const sums = {};
  const counts = {};
  results.forEach(({ scores }) => {
    for (const { name, value } of scores) {
      if (value < 0) continue;
      sums[name] = (sums[name] ?? 0) + value;
      counts[name] = (counts[name] ?? 0) + 1;
    }
  });
  return Object.fromEntries(
    Object.keys(sums).map((name) => [name, sums[name] / counts[name]])
  );
}
|
|
1888
|
-
function
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
1892
|
-
|
|
2018
|
+
/**
 * Averages the cost and token figures carried by each result's "cost" score.
 *
 * `estimatedUsd` and `totalTokens` are read from that score's `details`;
 * results where a figure is null/undefined do not contribute to that figure.
 *
 * @param {Array<{ scores: Array<{ name: string, details?: object }> }>} results
 * @returns {{ costUsd: (number|undefined), totalTokens: (number|undefined) }}
 *   Mean cost and rounded mean token count; undefined when nothing contributed.
 */
function averageDetails(results) {
  const costs = [];
  const tokens = [];
  for (const { scores } of results) {
    const details = scores.find((score) => score.name === "cost")?.details;
    if (details?.estimatedUsd != null) costs.push(details.estimatedUsd);
    if (details?.totalTokens != null) tokens.push(details.totalTokens);
  }
  const mean = (values) => values.reduce((sum, v) => sum + v, 0) / values.length;
  return {
    costUsd: costs.length > 0 ? mean(costs) : void 0,
    totalTokens: tokens.length > 0 ? Math.round(mean(tokens)) : void 0
  };
}
|
|
1894
|
-
function
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
const padding = Math.max(0, totalInner - dw - 1);
|
|
1898
|
-
return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
|
|
2040
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} nums - Values to average.
 * @returns {number|undefined} The mean, or undefined for an empty list.
 */
function average(nums) {
  if (nums.length === 0) return void 0;
  let total = 0;
  for (const value of nums) {
    total += value;
  }
  return total / nums.length;
}
|
|
1900
2044
|
function computeColumnStats(providerData, scorerNames) {
|
|
1901
2045
|
const stats = /* @__PURE__ */ new Map();
|
|
@@ -1947,62 +2091,274 @@ function computeColumnStats(providerData, scorerNames) {
|
|
|
1947
2091
|
}
|
|
1948
2092
|
return stats;
|
|
1949
2093
|
}
|
|
1950
|
-
|
|
1951
|
-
|
|
1952
|
-
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1957
|
-
|
|
2094
|
+
// Scorer names counted as "quality" columns.
var QUALITY_SCORERS = new Set([
  "correctness",
  "schema-correctness",
  "fuzzy-similarity",
  "llm-judge-correctness",
  "tool-usage"
]);
/**
 * Decides whether a provider has any positive quality score.
 *
 * @param {string} providerId - Provider to check.
 * @param {Map<string, { values: Map<string, (number|undefined)> }>} columnStats
 *   Per-column statistics keyed by scorer name.
 * @returns {boolean} True when no quality column is present at all, or when
 *   the provider has a value greater than 0 in at least one quality column.
 */
function passesQualityGate(providerId, columnStats) {
  let sawQualityColumn = false;
  for (const [column, stats] of columnStats) {
    if (!QUALITY_SCORERS.has(column)) continue;
    sawQualityColumn = true;
    const score = stats?.values.get(providerId);
    if (score !== void 0 && score > 0) return true;
  }
  // Nothing to gate on means everyone passes.
  return !sawQualityColumn;
}
|
|
1959
2109
|
/**
 * Assigns "gold" / "silver" / "bronze" / "none" to each provider.
 *
 * A column counts as a win only when exactly one provider holds its `best`
 * value; wins are tallied separately for quality columns (QUALITY_SCORERS)
 * and all other ("efficiency") columns. Providers that pass the quality gate
 * are ordered by quality wins, then efficiency wins, then id. Providers with
 * identical win counts share a rank, and the next distinct provider takes its
 * positional rank. Everyone gets "none" when there are fewer than two
 * providers or no wins at all; providers failing the gate always get "none".
 *
 * @param {Map<string, { best: (number|undefined), values: Map<string, (number|undefined)> }>} columnStats
 * @param {string[]} providerIds - Providers to rank.
 * @returns {Map<string, string>} Medal name per provider id.
 */
function computeMedals(columnStats, providerIds) {
  const medals = new Map();
  const everyoneNone = () => {
    for (const id of providerIds) medals.set(id, "none");
    return medals;
  };
  if (providerIds.length < 2) return everyoneNone();

  const eligible = new Set(providerIds.filter((id) => passesQualityGate(id, columnStats)));
  const qualityWins = new Map(providerIds.map((id) => [id, 0]));
  const efficiencyWins = new Map(providerIds.map((id) => [id, 0]));

  for (const [colName, colStats] of columnStats) {
    if (colStats.best === void 0) continue;
    const holders = [];
    for (const [id, value] of colStats.values) {
      if (value !== void 0 && value === colStats.best) holders.push(id);
    }
    // Shared bests award nobody.
    if (holders.length !== 1) continue;
    const tally = QUALITY_SCORERS.has(colName) ? qualityWins : efficiencyWins;
    tally.set(holders[0], (tally.get(holders[0]) ?? 0) + 1);
  }

  let totalWins = 0;
  for (const count of qualityWins.values()) totalWins += count;
  for (const count of efficiencyWins.values()) totalWins += count;
  if (totalWins === 0) return everyoneNone();

  const winsOf = (id) => ({
    quality: qualityWins.get(id) ?? 0,
    efficiency: efficiencyWins.get(id) ?? 0
  });
  const eligibleSorted = providerIds.filter((id) => eligible.has(id)).sort((a, b) => {
    const left = winsOf(a);
    const right = winsOf(b);
    if (right.quality !== left.quality) return right.quality - left.quality;
    if (right.efficiency !== left.efficiency) return right.efficiency - left.efficiency;
    return a.localeCompare(b);
  });

  const medalList = ["gold", "silver", "bronze"];
  let rank = 0;
  eligibleSorted.forEach((id, i) => {
    if (i > 0) {
      const prev = winsOf(eligibleSorted[i - 1]);
      const curr = winsOf(id);
      const strictlyWorse =
        curr.quality < prev.quality ||
        (curr.quality === prev.quality && curr.efficiency < prev.efficiency);
      if (strictlyWorse) rank = i;
    }
    medals.set(id, rank < medalList.length ? medalList[rank] : "none");
  });

  for (const id of providerIds) {
    if (!eligible.has(id)) medals.set(id, "none");
  }
  return medals;
}
|
|
2167
|
+
/**
 * Returns a parenthesised vendor label for a provider id.
 *
 * The vendor is the part of the id before the first "/". Known vendors map to
 * a fixed label; anything else is echoed back as `(<prefix>)`.
 *
 * @param {string} providerId - Id such as "openai/gpt-4o".
 * @returns {string} Label such as "(OpenAI)".
 */
function providerLabel(providerId) {
  const prefix = providerId.split("/")[0];
  // Kept inside the function so the hoisted declaration never depends on
  // module initialisation order.
  const knownLabels = new Map([
    ["azure", "(OpenAI via Azure)"],
    ["openai", "(OpenAI)"],
    ["anthropic", "(Anthropic)"],
    ["google", "(Google)"],
    ["mistral", "(Mistral)"],
    ["meta", "(Meta)"],
    ["deepseek", "(DeepSeek)"],
    ["cohere", "(Cohere)"],
    ["qwen", "(Qwen)"],
    ["xai", "(xAI)"],
    ["minimax", "(MiniMax)"],
    ["moonshot", "(Moonshot / Kimi)"],
    ["perplexity", "(Perplexity)"],
    ["amazon", "(Amazon)"],
    ["nvidia", "(NVIDIA)"],
    ["microsoft", "(Microsoft)"],
    ["ai21", "(AI21 Labs)"],
    ["bytedance", "(ByteDance)"],
    ["together", "(Together AI)"],
    ["fireworks", "(Fireworks AI)"],
    ["groq", "(Groq)"],
    ["cerebras", "(Cerebras)"]
  ]);
  return knownLabels.get(prefix) ?? `(${prefix})`;
}
|
|
2218
|
+
/**
 * Suggests how to configure credentials when an error looks auth-related.
 *
 * The error text is matched case-insensitively against the marker substrings
 * below; without a match no hint is produced. The hint is chosen from the
 * part of the provider id before the first "/".
 *
 * @param {string} providerId - Id such as "anthropic/claude".
 * @param {string} error - Error message text.
 * @returns {string|undefined} A one-line hint, or undefined for non-auth errors.
 */
function apiKeyHint(providerId, error) {
  const text = error.toLowerCase();
  const authMarkers = [
    "api key",
    "401",
    "unauthorized",
    "authentication",
    "incorrect api key",
    "apikey"
  ];
  if (!authMarkers.some((marker) => text.includes(marker))) return void 0;
  const prefix = providerId.split("/")[0];
  if (prefix === "openai") return "Set: export OPENAI_API_KEY=sk-...";
  if (prefix === "azure") return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
  if (prefix === "anthropic") return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
  if (prefix === "google") return "Set: export GOOGLE_API_KEY=...";
  return `Check the API key for ${providerId}`;
}
|
|
2236
|
+
/**
 * Finds the provider with the highest average for one scorer.
 *
 * For each provider, the non-negative values of `scorerName` across its runs
 * are averaged; providers with no such value are ignored. On equal averages
 * the provider listed first in `providers` is kept.
 *
 * @param {Map<string, Array<{ scores: Array<{ name: string, value: number }> }>>} successByProvider
 * @param {string[]} providers - Provider ids in display order.
 * @param {string} scorerName - Scorer to rank by.
 * @returns {{ id: string, avg: number }|undefined} The leader, if any.
 */
function rankProviders(successByProvider, providers, scorerName) {
  let leader;
  for (const id of providers) {
    let total = 0;
    let count = 0;
    for (const run of successByProvider.get(id) ?? []) {
      for (const score of run.scores) {
        if (score.name === scorerName && score.value >= 0) {
          total += score.value;
          count += 1;
        }
      }
    }
    if (count === 0) continue;
    const avg = total / count;
    // Strict comparison keeps the earlier provider on ties.
    if (leader === void 0 || avg > leader.avg) {
      leader = { id, avg };
    }
  }
  return leader;
}
|
|
2245
|
+
/**
 * Maps a scorer name to the short column heading used in reports.
 *
 * @param {string} name - Scorer name.
 * @returns {string} The short label, or `name` itself when no label is known.
 */
function scorerLabel(name) {
  const shortLabels = new Map([
    ["correctness", "Match"],
    ["schema-correctness", "Schema"],
    ["fuzzy-similarity", "Fuzzy"],
    ["llm-judge-correctness", "Judge"],
    ["tool-usage", "Tool"],
  ]);
  return shortLabels.has(name) ? shortLabels.get(name) : name;
}
|
|
2261
|
+
/**
 * Converts a medal tier into the emoji shown next to a provider.
 *
 * @param {string} medal - One of "gold", "silver", "bronze" or "none".
 * @returns {string | undefined} The emoji, an empty string for "none", or
 *   `undefined` for any other input.
 */
function medalEmoji(medal) {
  const emojiByMedal = new Map([
    ["gold", "\u{1F947}"],
    ["silver", "\u{1F948}"],
    ["bronze", "\u{1F949}"],
    ["none", ""],
  ]);
  return emojiByMedal.get(medal);
}
|
|
2273
|
+
|
|
2274
|
+
// src/reporter/console.ts
|
|
2275
|
+
// ANSI SGR escape sequences used to style console output.
var reset = "\x1B[0m"; // clears every active attribute
var boldCode = "\x1B[1m"; // bold / increased intensity
var dimCode = "\x1B[2m"; // faint / decreased intensity
var green = "\x1B[32m"; // green foreground
var red = "\x1B[31m"; // red foreground
var yellow = "\x1B[33m"; // yellow foreground
var cyan = "\x1B[36m"; // cyan foreground
var brightGreen = "\x1B[92m"; // bright green foreground
var brightWhite = "\x1B[97m"; // bright white foreground
|
|
2284
|
+
/**
 * Wraps text in the bold escape sequence and resets styling afterwards.
 *
 * @param {string} s - Text to emphasise.
 * @returns {string} The styled text.
 */
function bold(s) {
  return "".concat(boldCode, s, reset);
}
|
|
2287
|
+
/**
 * Wraps text in the dim escape sequence and resets styling afterwards.
 *
 * @param {string} s - Text to de-emphasise.
 * @returns {string} The styled text.
 */
function dim(s) {
  return "".concat(dimCode, s, reset);
}
|
|
2290
|
+
/**
 * Removes ANSI colour/style (SGR) escape sequences from a string.
 *
 * @param {string} s - Text that may contain `ESC [ ... m` sequences.
 * @returns {string} The text with those sequences deleted.
 */
function stripAnsi(s) {
  const sgrSequence = /\u001b\[[\d;]*m/g;
  return s.replace(sgrSequence, "");
}
|
|
2293
|
+
/**
 * Estimates how many terminal columns a string occupies.
 *
 * Styling escape sequences are removed first. Code points at or above
 * U+1F000 and those in U+2600..U+27BF count as two columns; every other code
 * point counts as one.
 *
 * @param {string} s - Text, possibly containing ANSI styling.
 * @returns {number} Estimated column count.
 */
function displayWidth(s) {
  let columns = 0;
  // for...of walks code points, so surrogate pairs are counted once.
  for (const glyph of stripAnsi(s)) {
    const cp = glyph.codePointAt(0) ?? 0;
    const isWide = cp >= 0x1f000 || (cp >= 0x2600 && cp <= 0x27bf);
    columns += isWide ? 2 : 1;
  }
  return columns;
}
|
|
2304
|
+
/**
 * Pads a cell with spaces until it reaches the requested display width.
 *
 * @param {string} str - Cell content (may include ANSI styling).
 * @param {number} targetWidth - Desired width in terminal columns.
 * @param {string} align - "right" pads on the left; anything else pads on the right.
 * @returns {string} The padded cell; content wider than the target is returned unpadded.
 */
function padCell(str, targetWidth, align) {
  const gap = " ".repeat(Math.max(0, targetWidth - displayWidth(str)));
  return align === "right" ? gap + str : str + gap;
}
|
|
2310
|
+
/**
 * Builds the two halves of a fixed-width progress bar.
 *
 * @param {number} ratio - Fill fraction; clamped to the range 0..1. A value
 *   that is not a number (NaN, undefined) is treated as 0 so the bar keeps
 *   its full width.
 * @param {number} [width=8] - Total bar width in characters.
 * @returns {{ fill: string, track: string }} Filled and unfilled segments whose
 *   combined length is `width`.
 */
function sparkBar(ratio, width = 8) {
  // Without this guard NaN reaches repeat(), which yields two empty strings
  // and collapses the column the bar is rendered in.
  const numeric = Number(ratio);
  const safeRatio = Number.isNaN(numeric) ? 0 : numeric;
  const clamped = Math.max(0, Math.min(1, safeRatio));
  const fillLen = Math.round(clamped * width);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(width - fillLen);
  return { fill, track };
}
|
|
2317
|
+
/**
 * Draws one horizontal rule of a box-drawing table.
 *
 * @param {number[]} widths - Content width of each column (cell padding excluded).
 * @param {string} position - "top", "bottom" or "merge"; any other value
 *   produces a mid-table rule with column crossings.
 * @returns {string} The dimmed rule.
 */
function drawTableLine(widths, position) {
  const dash = "\u2500";
  if (position === "bottom" || position === "merge") {
    // These rules have no column junctions, so they span the full inner
    // width: every column plus its two padding spaces plus the separators.
    let span = 0;
    for (const w of widths) span += w + 2;
    span += widths.length - 1;
    const [left, right] = position === "bottom" ? ["\u2514", "\u2518"] : ["\u251C", "\u2524"];
    return dim(`${left}${dash.repeat(span)}${right}`);
  }
  const runs = widths.map((w) => dash.repeat(w + 2));
  return position === "top"
    ? dim(`\u250C${runs.join("\u252C")}\u2510`)
    : dim(`\u251C${runs.join("\u253C")}\u2524`);
}
|
|
2331
|
+
/**
 * Draws one content row of a box-drawing table.
 *
 * @param {string[]} cells - Cell contents, one per column.
 * @param {number[]} widths - Target display width of each column.
 * @param {string[]} aligns - Alignment of each column, passed to `padCell`.
 * @returns {string} The row with dimmed vertical borders.
 */
function drawTableRow(cells, widths, aligns) {
  const bar = dim("\u2502");
  const padded = cells.map((cell, col) => {
    const body = padCell(cell, widths[col], aligns[col]);
    return " " + body + " ";
  });
  return bar + padded.join(bar) + bar;
}
|
|
2337
|
+
/**
 * Draws a row whose single cell spans every column of the table.
 *
 * @param {string} content - Text to show (may include ANSI styling).
 * @param {number[]} widths - Content width of each column.
 * @returns {string} The row with dimmed left and right borders.
 */
function drawSpanRow(content, widths) {
  let inner = 0;
  for (const w of widths) inner += w + 2;
  inner += widths.length - 1;
  // One column of the inner width is taken by the leading space.
  const fillerLen = Math.max(0, inner - displayWidth(content) - 1);
  const edge = dim("\u2502");
  return edge + " " + content + " ".repeat(fillerLen) + edge;
}
|
|
2343
|
+
/**
 * Colours a cell according to how its value ranks within its column.
 *
 * @param {string} text - Already formatted cell text.
 * @param {number | undefined} value - Raw value behind the cell.
 * @param {{ best?: number, worst?: number }} colStats - Best and worst values of the column.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} A dimmed em dash when there is no value; the plain text
 *   when ranking is not meaningful; otherwise the text in bright green bold
 *   (best), red (worst) or yellow (in between).
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === undefined) return dim("\u2014");
  if (providerCount < 2) return text;
  const { best, worst } = colStats;
  const comparable = best !== undefined && worst !== undefined && best !== worst;
  if (!comparable) return text;
  let prefix = yellow;
  if (value === best) {
    prefix = `${brightGreen}${boldCode}`;
  } else if (value === worst) {
    prefix = red;
  }
  return `${prefix}${text}${reset}`;
}
|
|
1993
2352
|
function consoleReporter(results, options) {
|
|
1994
2353
|
const showSparklines = options?.sparklines ?? true;
|
|
1995
2354
|
if (results.length === 0) {
|
|
1996
2355
|
console.log("\nNo results to display.\n");
|
|
1997
2356
|
return;
|
|
1998
2357
|
}
|
|
1999
|
-
const tasks =
|
|
2000
|
-
const providers = [...new Set(results.map((r) => r.providerId))];
|
|
2001
|
-
const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
|
|
2358
|
+
const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
|
|
2002
2359
|
const hasCost = scorerNames.includes("cost");
|
|
2003
|
-
const hasErrors = results.some((r) => r.error);
|
|
2004
2360
|
const multi = providers.length >= 2;
|
|
2005
|
-
const runsPerCell =
|
|
2361
|
+
const runsPerCell = maxRun;
|
|
2006
2362
|
const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
|
|
2007
2363
|
console.log("");
|
|
2008
2364
|
console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
|
|
@@ -2011,29 +2367,9 @@ function consoleReporter(results, options) {
|
|
|
2011
2367
|
for (const task of tasks) {
|
|
2012
2368
|
console.log(` ${bold(`Task: ${task}`)}`);
|
|
2013
2369
|
console.log("");
|
|
2014
|
-
const providerData = providers.map(
|
|
2015
|
-
|
|
2016
|
-
|
|
2017
|
-
const successResults = taskResults.filter((r) => !r.error);
|
|
2018
|
-
if (successResults.length === 0) {
|
|
2019
|
-
return {
|
|
2020
|
-
providerId,
|
|
2021
|
-
avgScores: {},
|
|
2022
|
-
avgDetails: { costUsd: void 0, totalTokens: void 0 },
|
|
2023
|
-
latencyMs: void 0,
|
|
2024
|
-
allErrors: errorResults2.length > 0,
|
|
2025
|
-
errorCount: errorResults2.length
|
|
2026
|
-
};
|
|
2027
|
-
}
|
|
2028
|
-
return {
|
|
2029
|
-
providerId,
|
|
2030
|
-
avgScores: averageScores(successResults),
|
|
2031
|
-
avgDetails: averageDetails(successResults),
|
|
2032
|
-
latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
|
|
2033
|
-
allErrors: false,
|
|
2034
|
-
errorCount: errorResults2.length
|
|
2035
|
-
};
|
|
2036
|
-
});
|
|
2370
|
+
const providerData = providers.map(
|
|
2371
|
+
(providerId) => aggregateProviderTask(providerId, grouped, task)
|
|
2372
|
+
);
|
|
2037
2373
|
const columnStats = computeColumnStats(providerData, scorerNames);
|
|
2038
2374
|
const medals = computeMedals(columnStats, providers);
|
|
2039
2375
|
const maxProviderLen = Math.max(...providers.map((id) => id.length));
|
|
@@ -2048,8 +2384,7 @@ function consoleReporter(results, options) {
|
|
|
2048
2384
|
cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
|
|
2049
2385
|
cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
|
|
2050
2386
|
} else {
|
|
2051
|
-
|
|
2052
|
-
cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
2387
|
+
cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
2053
2388
|
}
|
|
2054
2389
|
}
|
|
2055
2390
|
if (hasErrors) {
|
|
@@ -2062,7 +2397,7 @@ function consoleReporter(results, options) {
|
|
|
2062
2397
|
console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
|
|
2063
2398
|
console.log(` ${drawTableLine(widths, "header")}`);
|
|
2064
2399
|
for (const pd of providerData) {
|
|
2065
|
-
const medal = medals.get(pd.providerId) ?? "";
|
|
2400
|
+
const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
|
|
2066
2401
|
const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
|
|
2067
2402
|
const cells = [providerCell];
|
|
2068
2403
|
if (pd.allErrors) {
|
|
@@ -2135,7 +2470,7 @@ function consoleReporter(results, options) {
|
|
|
2135
2470
|
console.log(` ${drawTableRow(cells, widths, aligns)}`);
|
|
2136
2471
|
}
|
|
2137
2472
|
if (multi && providerData.some((p) => !p.allErrors)) {
|
|
2138
|
-
const winnerId = [...medals.entries()].find(([, m]) => m === "
|
|
2473
|
+
const winnerId = [...medals.entries()].find(([, m]) => m === "gold")?.[0];
|
|
2139
2474
|
if (winnerId) {
|
|
2140
2475
|
console.log(` ${drawTableLine(widths, "merge")}`);
|
|
2141
2476
|
const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
|
|
@@ -2145,7 +2480,7 @@ function consoleReporter(results, options) {
|
|
|
2145
2480
|
console.log(` ${drawTableLine(widths, "bottom")}`);
|
|
2146
2481
|
console.log("");
|
|
2147
2482
|
}
|
|
2148
|
-
printSummary(results, providers);
|
|
2483
|
+
printSummary(results, providers, byProvider);
|
|
2149
2484
|
const errorResults = results.filter((r) => r.error);
|
|
2150
2485
|
if (errorResults.length > 0) {
|
|
2151
2486
|
console.log(` ${bold("Errors")}`);
|
|
@@ -2168,203 +2503,66 @@ function consoleReporter(results, options) {
|
|
|
2168
2503
|
console.log("");
|
|
2169
2504
|
}
|
|
2170
2505
|
}
|
|
2171
|
-
function printSummary(results, providers) {
|
|
2506
|
+
function printSummary(results, providers, byProvider) {
|
|
2172
2507
|
const successResults = results.filter((r) => !r.error);
|
|
2173
2508
|
if (successResults.length === 0) return;
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
}
|
|
2186
|
-
|
|
2187
|
-
|
|
2188
|
-
}
|
|
2189
|
-
|
|
2190
|
-
|
|
2191
|
-
const avg = average(runs.map((r) => r.raw.latencyMs));
|
|
2192
|
-
return { id, avg: avg ?? Infinity };
|
|
2193
|
-
}).sort((a, b) => a.avg - b.avg)[0];
|
|
2194
|
-
if (byLatency && byLatency.avg !== Infinity) {
|
|
2195
|
-
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2196
|
-
const msStr = `${Math.round(byLatency.avg)}ms`;
|
|
2197
|
-
if (single) {
|
|
2198
|
-
console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2199
|
-
} else {
|
|
2200
|
-
console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2201
|
-
}
|
|
2202
|
-
}
|
|
2203
|
-
const byCost = providers.map((id) => {
|
|
2204
|
-
const runs = successResults.filter((r) => r.providerId === id);
|
|
2205
|
-
const costs = runs.map((r) => {
|
|
2206
|
-
const s = r.scores.find((s2) => s2.name === "cost");
|
|
2207
|
-
return s && s.value >= 0 ? s.value : void 0;
|
|
2208
|
-
}).filter((c) => c !== void 0);
|
|
2209
|
-
const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
|
|
2210
|
-
return { id, avg };
|
|
2211
|
-
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
2212
|
-
if (byCost?.avg !== void 0) {
|
|
2213
|
-
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2214
|
-
const costStr = formatCost(byCost.avg);
|
|
2215
|
-
if (single) {
|
|
2216
|
-
console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2217
|
-
} else {
|
|
2218
|
-
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2219
|
-
}
|
|
2220
|
-
}
|
|
2221
|
-
if (!single) {
|
|
2222
|
-
const wins = /* @__PURE__ */ new Map();
|
|
2223
|
-
for (const id of providers) wins.set(id, 0);
|
|
2224
|
-
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
2225
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2226
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2227
|
-
const maxWins = Math.max(...wins.values());
|
|
2228
|
-
if (maxWins > 0) {
|
|
2229
|
-
const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2230
|
-
console.log("");
|
|
2231
|
-
if (topProviders.length === 1) {
|
|
2232
|
-
const [winnerId, winCount] = topProviders[0];
|
|
2233
|
-
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
|
|
2234
|
-
} else {
|
|
2235
|
-
const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
|
|
2236
|
-
console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
|
|
2237
|
-
}
|
|
2238
|
-
}
|
|
2239
|
-
}
|
|
2240
|
-
console.log("");
|
|
2241
|
-
}
|
|
2242
|
-
function rankProviders(results, providers, scorerName) {
|
|
2243
|
-
const ranked = providers.map((id) => {
|
|
2244
|
-
const runs = results.filter((r) => r.providerId === id);
|
|
2245
|
-
const scores = runs.flatMap((r) => r.scores.filter((s) => s.name === scorerName && s.value >= 0)).map((s) => s.value);
|
|
2246
|
-
const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
|
|
2247
|
-
return { id, avg };
|
|
2248
|
-
}).filter((p) => p.avg !== void 0).sort((a, b) => b.avg - a.avg);
|
|
2249
|
-
return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
|
|
2250
|
-
}
|
|
2251
|
-
function averageScores(results) {
|
|
2252
|
-
const sums = {};
|
|
2253
|
-
const counts = {};
|
|
2254
|
-
for (const result of results) {
|
|
2255
|
-
for (const score of result.scores) {
|
|
2256
|
-
if (score.value < 0) continue;
|
|
2257
|
-
sums[score.name] = (sums[score.name] ?? 0) + score.value;
|
|
2258
|
-
counts[score.name] = (counts[score.name] ?? 0) + 1;
|
|
2259
|
-
}
|
|
2260
|
-
}
|
|
2261
|
-
const avgs = {};
|
|
2262
|
-
for (const name of Object.keys(sums)) {
|
|
2263
|
-
avgs[name] = sums[name] / counts[name];
|
|
2264
|
-
}
|
|
2265
|
-
return avgs;
|
|
2266
|
-
}
|
|
2267
|
-
function averageDetails(results) {
|
|
2268
|
-
let costSum = 0;
|
|
2269
|
-
let costCount = 0;
|
|
2270
|
-
let tokenSum = 0;
|
|
2271
|
-
let tokenCount = 0;
|
|
2272
|
-
for (const result of results) {
|
|
2273
|
-
const costScore = result.scores.find((s) => s.name === "cost");
|
|
2274
|
-
const details = costScore?.details;
|
|
2275
|
-
if (details?.estimatedUsd != null) {
|
|
2276
|
-
costSum += details.estimatedUsd;
|
|
2277
|
-
costCount++;
|
|
2509
|
+
const successByProvider = /* @__PURE__ */ new Map();
|
|
2510
|
+
for (const id of providers) {
|
|
2511
|
+
successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
|
|
2512
|
+
}
|
|
2513
|
+
console.log(` ${bold("Summary")}`);
|
|
2514
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
2515
|
+
console.log("");
|
|
2516
|
+
const single = providers.length === 1;
|
|
2517
|
+
const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
|
|
2518
|
+
const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
|
|
2519
|
+
if (byCorrectness) {
|
|
2520
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2521
|
+
const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
2522
|
+
if (single) {
|
|
2523
|
+
console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
|
|
2524
|
+
} else {
|
|
2525
|
+
console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
|
|
2278
2526
|
}
|
|
2279
|
-
|
|
2280
|
-
|
|
2281
|
-
|
|
2527
|
+
}
|
|
2528
|
+
const byLatency = providers.map((id) => {
|
|
2529
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2530
|
+
const avg = average(runs.map((r) => r.raw.latencyMs));
|
|
2531
|
+
return { id, avg: avg ?? Infinity };
|
|
2532
|
+
}).sort((a, b) => a.avg - b.avg)[0];
|
|
2533
|
+
if (byLatency && byLatency.avg !== Infinity) {
|
|
2534
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2535
|
+
const msStr = `${Math.round(byLatency.avg)}ms`;
|
|
2536
|
+
if (single) {
|
|
2537
|
+
console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2538
|
+
} else {
|
|
2539
|
+
console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2282
2540
|
}
|
|
2283
2541
|
}
|
|
2284
|
-
|
|
2285
|
-
|
|
2286
|
-
|
|
2287
|
-
|
|
2288
|
-
|
|
2289
|
-
|
|
2290
|
-
|
|
2291
|
-
|
|
2292
|
-
}
|
|
2293
|
-
|
|
2294
|
-
|
|
2295
|
-
|
|
2296
|
-
|
|
2297
|
-
|
|
2298
|
-
|
|
2299
|
-
}
|
|
2300
|
-
|
|
2301
|
-
const lower = error.toLowerCase();
|
|
2302
|
-
const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
|
|
2303
|
-
if (!isAuthError) return void 0;
|
|
2304
|
-
const prefix = providerId.split("/")[0];
|
|
2305
|
-
switch (prefix) {
|
|
2306
|
-
case "openai":
|
|
2307
|
-
return "Set: export OPENAI_API_KEY=sk-...";
|
|
2308
|
-
case "azure":
|
|
2309
|
-
return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
|
|
2310
|
-
case "anthropic":
|
|
2311
|
-
return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
|
|
2312
|
-
case "google":
|
|
2313
|
-
return "Set: export GOOGLE_API_KEY=...";
|
|
2314
|
-
default:
|
|
2315
|
-
return `Check the API key for ${providerId}`;
|
|
2542
|
+
const byCost = providers.map((id) => {
|
|
2543
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2544
|
+
const costs = runs.map((r) => {
|
|
2545
|
+
const s = r.scores.find((s2) => s2.name === "cost");
|
|
2546
|
+
return s && s.value >= 0 ? s.value : void 0;
|
|
2547
|
+
}).filter((c) => c !== void 0);
|
|
2548
|
+
const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
|
|
2549
|
+
return { id, avg };
|
|
2550
|
+
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
2551
|
+
if (byCost?.avg !== void 0) {
|
|
2552
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2553
|
+
const costStr = formatCost(byCost.avg);
|
|
2554
|
+
if (single) {
|
|
2555
|
+
console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2556
|
+
} else {
|
|
2557
|
+
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2558
|
+
}
|
|
2316
2559
|
}
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
case "azure":
|
|
2322
|
-
return "(OpenAI via Azure)";
|
|
2323
|
-
case "openai":
|
|
2324
|
-
return "(OpenAI)";
|
|
2325
|
-
case "anthropic":
|
|
2326
|
-
return "(Anthropic)";
|
|
2327
|
-
case "google":
|
|
2328
|
-
return "(Google)";
|
|
2329
|
-
case "mistral":
|
|
2330
|
-
return "(Mistral)";
|
|
2331
|
-
case "meta":
|
|
2332
|
-
return "(Meta)";
|
|
2333
|
-
case "deepseek":
|
|
2334
|
-
return "(DeepSeek)";
|
|
2335
|
-
case "cohere":
|
|
2336
|
-
return "(Cohere)";
|
|
2337
|
-
case "qwen":
|
|
2338
|
-
return "(Qwen)";
|
|
2339
|
-
case "xai":
|
|
2340
|
-
return "(xAI)";
|
|
2341
|
-
case "minimax":
|
|
2342
|
-
return "(MiniMax)";
|
|
2343
|
-
case "moonshot":
|
|
2344
|
-
return "(Moonshot / Kimi)";
|
|
2345
|
-
case "perplexity":
|
|
2346
|
-
return "(Perplexity)";
|
|
2347
|
-
case "amazon":
|
|
2348
|
-
return "(Amazon)";
|
|
2349
|
-
case "nvidia":
|
|
2350
|
-
return "(NVIDIA)";
|
|
2351
|
-
case "microsoft":
|
|
2352
|
-
return "(Microsoft)";
|
|
2353
|
-
case "ai21":
|
|
2354
|
-
return "(AI21 Labs)";
|
|
2355
|
-
case "bytedance":
|
|
2356
|
-
return "(ByteDance)";
|
|
2357
|
-
case "together":
|
|
2358
|
-
return "(Together AI)";
|
|
2359
|
-
case "fireworks":
|
|
2360
|
-
return "(Fireworks AI)";
|
|
2361
|
-
case "groq":
|
|
2362
|
-
return "(Groq)";
|
|
2363
|
-
case "cerebras":
|
|
2364
|
-
return "(Cerebras)";
|
|
2365
|
-
default:
|
|
2366
|
-
return `(${prefix})`;
|
|
2560
|
+
if (!single && byCorrectness && byCorrectness.avg > 0) {
|
|
2561
|
+
console.log("");
|
|
2562
|
+
const pct = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
2563
|
+
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${byCorrectness.id}${reset} ${dim(providerLabel(byCorrectness.id))} ${dim(`(${pct} avg correctness)`)}`);
|
|
2367
2564
|
}
|
|
2565
|
+
console.log("");
|
|
2368
2566
|
}
|
|
2369
2567
|
|
|
2370
2568
|
// src/reporter/json.ts
|
|
@@ -2396,15 +2594,15 @@ function defineArena(config) {
|
|
|
2396
2594
|
if (config.providers.length === 0) {
|
|
2397
2595
|
throw new Error("At least one provider is required");
|
|
2398
2596
|
}
|
|
2399
|
-
if (config.tasks.length === 0) {
|
|
2400
|
-
throw new Error("At least one task is required");
|
|
2401
|
-
}
|
|
2402
2597
|
const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
|
|
2403
2598
|
const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
|
|
2404
2599
|
const runs = config.runs ?? 1;
|
|
2405
2600
|
return {
|
|
2406
2601
|
config,
|
|
2407
2602
|
async run(options) {
|
|
2603
|
+
if (config.tasks.length === 0) {
|
|
2604
|
+
throw new Error("At least one task is required");
|
|
2605
|
+
}
|
|
2408
2606
|
return runBenchmarks({
|
|
2409
2607
|
providers: config.providers,
|
|
2410
2608
|
tasks: config.tasks,
|
|
@@ -2430,23 +2628,19 @@ function anthropic(model, options) {
|
|
|
2430
2628
|
model,
|
|
2431
2629
|
async run(input) {
|
|
2432
2630
|
const start = Date.now();
|
|
2433
|
-
const systemMessage = input.schema ?
|
|
2631
|
+
const systemMessage = input.schema ? buildSchemaSystemMessage(input.schema) : void 0;
|
|
2632
|
+
const reqOpts = { signal: input.signal };
|
|
2633
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
2434
2634
|
const response = await client.messages.create({
|
|
2435
2635
|
model,
|
|
2436
2636
|
max_tokens: maxTokens,
|
|
2437
2637
|
system: systemMessage,
|
|
2438
2638
|
messages: [{ role: "user", content: input.prompt }]
|
|
2439
|
-
},
|
|
2639
|
+
}, reqOpts);
|
|
2440
2640
|
const latencyMs = Date.now() - start;
|
|
2441
2641
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2442
2642
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
2443
|
-
|
|
2444
|
-
if (input.schema) {
|
|
2445
|
-
try {
|
|
2446
|
-
output = JSON.parse(rawContent);
|
|
2447
|
-
} catch {
|
|
2448
|
-
}
|
|
2449
|
-
}
|
|
2643
|
+
const output = parseSchemaOutput(rawContent, !!input.schema);
|
|
2450
2644
|
return {
|
|
2451
2645
|
output,
|
|
2452
2646
|
usage: {
|
|
@@ -2460,23 +2654,6 @@ function anthropic(model, options) {
|
|
|
2460
2654
|
};
|
|
2461
2655
|
}
|
|
2462
2656
|
|
|
2463
|
-
// src/providers/gemini.ts
|
|
2464
|
-
var import_openai4 = __toESM(require("openai"), 1);
|
|
2465
|
-
function gemini(model, options) {
|
|
2466
|
-
const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
|
|
2467
|
-
if (!apiKey) {
|
|
2468
|
-
throw new Error(
|
|
2469
|
-
`Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
|
|
2470
|
-
);
|
|
2471
|
-
}
|
|
2472
|
-
const client = new import_openai4.default({
|
|
2473
|
-
apiKey,
|
|
2474
|
-
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
|
|
2475
|
-
timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
|
|
2476
|
-
});
|
|
2477
|
-
return makeProvider(`google/${model}`, "Google AI", model, client, model);
|
|
2478
|
-
}
|
|
2479
|
-
|
|
2480
2657
|
// src/reporter/markdown.ts
|
|
2481
2658
|
var COMMENT_MARKER = "<!-- duelist-ci-report -->";
|
|
2482
2659
|
function markdownReporter(report, _current) {
|
|
@@ -2521,7 +2698,7 @@ function markdownComparisonTable(comparisons) {
|
|
|
2521
2698
|
for (const c of comparisons) {
|
|
2522
2699
|
const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
|
|
2523
2700
|
const currentStr = formatStats(c.current);
|
|
2524
|
-
const deltaStr = c.delta !== null ? formatDelta(c.delta) : "\u2014";
|
|
2701
|
+
const deltaStr = c.delta !== null ? formatDelta(c.delta, 3) : "\u2014";
|
|
2525
2702
|
const status = statusIndicator(c);
|
|
2526
2703
|
lines.push(`| ${c.providerId} | ${c.taskName} | ${c.scorerName} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
|
|
2527
2704
|
}
|
|
@@ -2554,10 +2731,6 @@ function formatStats(stats) {
|
|
|
2554
2731
|
}
|
|
2555
2732
|
return stats.mean.toFixed(3);
|
|
2556
2733
|
}
|
|
2557
|
-
function formatDelta(delta) {
|
|
2558
|
-
const sign = delta >= 0 ? "+" : "";
|
|
2559
|
-
return `${sign}${delta.toFixed(3)}`;
|
|
2560
|
-
}
|
|
2561
2734
|
function statusIndicator(c) {
|
|
2562
2735
|
if (c.regressed) return "\u{1F534} regressed";
|
|
2563
2736
|
if (c.improved) return "\u{1F7E2} improved";
|
|
@@ -2565,6 +2738,778 @@ function statusIndicator(c) {
|
|
|
2565
2738
|
return "\u26AA unchanged";
|
|
2566
2739
|
}
|
|
2567
2740
|
|
|
2741
|
+
// src/reporter/html.ts
|
|
2742
|
+
/**
 * Escapes the characters that are significant in HTML text and attribute
 * values so that arbitrary strings can be embedded in the report.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function esc(s) {
  // "&" must be handled first, otherwise the entities produced by the later
  // replacements would themselves be escaped.
  return s
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
|
|
2745
|
+
/**
 * Renders benchmark results as a complete HTML document.
 *
 * @param {Array} results - Benchmark results. This function reads `error`,
 *   `scores` (entries with `name` and `value`) and `raw.latencyMs` from each
 *   entry; grouping is delegated to `groupResults`.
 * @returns {string} The HTML markup; the empty-report page when `results` is empty.
 */
function htmlReporter(results) {
  if (results.length === 0) {
    return emptyReport();
  }
  const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  // Winners are only declared when at least two providers compete.
  const multi = providers.length >= 2;
  const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";
  // Per-task aggregates: one entry per provider plus column stats and medals.
  const taskSections = tasks.map((task) => {
    const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    const winnerId = multi ? [...medals.entries()].find(([, m]) => m === "gold")?.[0] : void 0;
    return { task, providerData, columnStats, medals, winnerId };
  });
  const successResults = results.filter((r) => !r.error);
  // Error-free runs per provider, used for the cross-task summary below.
  const successByProvider = /* @__PURE__ */ new Map();
  for (const id of providers) {
    successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
  }
  // Rank by the LLM-judge score when any successful run has a non-negative
  // one; otherwise rank by the plain correctness score.
  const correctnessKey = successResults.some(
    (r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
  ) ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
  // Lowest mean latency first; a nullish average becomes Infinity so that
  // provider sorts last.
  const byLatency = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const avg = average(runs.map((r) => r.raw.latencyMs));
    return { id, avg: avg ?? Infinity };
  }).sort((a, b) => a.avg - b.avg)[0];
  // Lowest mean cost first; providers without a non-negative cost score are dropped.
  const byCost = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    const costs = runs.map((r) => {
      const s = r.scores.find((s2) => s2.name === "cost");
      return s && s.value >= 0 ? s.value : void 0;
    }).filter((c) => c !== void 0);
    const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
    return { id, avg };
  }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
  // The overall winner is the most correct provider, and only when its
  // average correctness is above zero.
  let overallWinner;
  if (multi && byCorrectness && byCorrectness.avg > 0) {
    overallWinner = byCorrectness.id;
  }
  const errorResults = results.filter((r) => r.error);
  const deduped = dedupeErrors(errorResults);
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
<meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${providers.length !== 1 ? "s" : ""}, ${tasks.length} task${tasks.length !== 1 ? "s" : ""}">
<meta property="og:title" content="Agent Duelist Report">
<meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
<meta property="og:type" content="website">
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">

${renderHeader(runsLabel, providers.length, tasks.length)}

${tasks.length > 1 ? renderTabs(tasks) : ""}

<main>
${taskSections.map((s, i) => renderTaskSection(
    s.task,
    s.providerData,
    s.columnStats,
    s.medals,
    s.winnerId,
    scorerNames,
    hasCost,
    multi,
    i
  )).join("\n")}
</main>

${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}

${deduped.length > 0 ? renderErrors(deduped) : ""}

${renderFooter()}

</div>
${renderScript(tasks.length)}
</body>
</html>`;
}
|
|
2834
|
+
/**
 * Renders the HTML document shown when there are no results.
 *
 * Uses the same style, header and footer helpers as the full report and a
 * single "No results to display." message as the body.
 *
 * @returns {string} The HTML markup.
 */
function emptyReport() {
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">
${renderHeader("0 runs", 0, 0)}
<main><p class="empty-msg">No results to display.</p></main>
${renderFooter()}
</div>
</body>
</html>`;
}
|
|
2853
|
+
/**
 * Collapses repeated failures into one entry per provider and message.
 *
 * @param {Array} errorResults - Results that carry an `error`; `providerId`
 *   and `error` are read from each.
 * @returns {Array<{ providerId: string, error: string, count: number, hint: (string | undefined) }>}
 *   Unique errors in first-seen order, each with its occurrence count and an
 *   optional API-key hint.
 */
function dedupeErrors(errorResults) {
  const buckets = new Map();
  for (const { providerId, error } of errorResults) {
    const key = `${providerId}::${error}`;
    const bucket = buckets.get(key);
    if (bucket) {
      bucket.count += 1;
      continue;
    }
    buckets.set(key, {
      providerId,
      error: error ?? "Unknown error",
      count: 1,
      hint: apiKeyHint(providerId, error ?? "")
    });
  }
  return Array.from(buckets.values());
}
|
|
2871
|
+
/**
 * Returns the report's inline stylesheet as one `<style>…</style>` string.
 *
 * The sheet is static (no interpolation). It declares the colour, radius and
 * font custom properties on `:root`, then styles, in order: the animated
 * background mesh, the report container, header, task tabs, task sections,
 * results table, score cells, rank colour classes, winner banner, summary
 * cards, error list, footer, empty state, and a `max-width: 640px` media
 * query for narrow screens.
 *
 * @returns {string} The `<style>` element markup.
 */
function renderStyle() {
  return `<style>
:root {
--bg: #0f172a;
--bg-deep: #020617;
--panel: rgba(15, 23, 42, 0.85);
--accent: #f59e0b;
--accent-soft: rgba(245, 158, 11, 0.15);
--text: #e2e8f0;
--muted: #94a3b8;
--border: rgba(148, 163, 184, 0.15);
--green: #22c55e;
--red: #ef4444;
--yellow: #eab308;
--radius: 12px;
--mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
--sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
html, body {
font-family: var(--sans);
background: var(--bg);
color: var(--text);
min-height: 100vh;
}
body { padding: 24px; display: flex; justify-content: center; }

/* Animated gradient mesh */
.bg-mesh {
position: fixed; inset: 0; z-index: 0;
overflow: hidden; pointer-events: none;
}
.bg-mesh::before, .bg-mesh::after {
content: ""; position: absolute; border-radius: 50%;
filter: blur(120px); opacity: 0.4;
}
.bg-mesh::before {
width: 600px; height: 600px;
background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
top: -10%; left: -5%;
animation: meshDrift1 18s ease-in-out infinite alternate;
}
.bg-mesh::after {
width: 500px; height: 500px;
background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
bottom: -10%; right: -5%;
animation: meshDrift2 22s ease-in-out infinite alternate;
}
.bg-mesh-extra {
position: absolute; width: 400px; height: 400px;
border-radius: 50%; filter: blur(100px); opacity: 0.3;
background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
top: 50%; left: 60%;
animation: meshDrift3 15s ease-in-out infinite alternate;
}
@keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
@keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
@keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }

/* Report container */
.report {
position: relative; z-index: 1;
width: 100%; max-width: 960px;
}

/* Header */
.report-header {
display: flex; justify-content: space-between; align-items: center;
padding: 20px 0; margin-bottom: 8px;
}
.report-brand {
display: flex; align-items: center; gap: 10px;
text-decoration: none; color: var(--muted);
font-weight: 600; font-size: 14px;
letter-spacing: 0.04em; text-transform: uppercase;
}
.report-brand:hover { color: var(--text); }
.brand-icon {
width: 32px; height: 32px; border-radius: 8px;
background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
border: 1px solid rgba(245,158,11,0.3);
display: flex; align-items: center; justify-content: center;
font-size: 16px;
}
.report-meta {
font-size: 12px; color: var(--muted);
text-align: right; line-height: 1.6;
}

/* Task tabs */
.task-tabs {
display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
}
.task-tab {
padding: 6px 16px; border-radius: 999px;
border: 1px solid var(--border);
background: transparent; color: var(--muted);
font-size: 13px; font-weight: 500; cursor: pointer;
transition: all 150ms ease;
}
.task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
.task-tab.active {
background: var(--accent-soft);
border-color: rgba(245,158,11,0.4);
color: var(--accent);
}

/* Task sections */
.task-section { display: none; }
.task-section.active { display: block; }
.task-name {
font-size: 18px; font-weight: 600;
margin-bottom: 12px; letter-spacing: -0.01em;
}

/* Results table */
.results-table {
width: 100%; border-collapse: collapse;
font-size: 13px; margin-bottom: 16px;
border-radius: var(--radius); overflow: hidden;
border: 1px solid var(--border);
}
.results-table th, .results-table td {
padding: 10px 14px;
text-align: left;
border-bottom: 1px solid var(--border);
}
.results-table th {
background: rgba(0,0,0,0.3);
font-size: 11px; font-weight: 600;
text-transform: uppercase; letter-spacing: 0.05em;
color: var(--muted); cursor: pointer;
user-select: none; white-space: nowrap;
}
.results-table th:hover { color: var(--text); }
.results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
.results-table tbody tr {
background: var(--panel);
transition: background 120ms ease;
}
.results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
.results-table tbody tr:last-child td { border-bottom: none; }

/* Score cell with progress bar */
.score-cell { position: relative; min-width: 90px; }
.score-bar {
position: absolute; left: 0; bottom: 0;
height: 3px; border-radius: 2px;
transition: width 300ms ease;
}
.score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }

/* Color ranking */
.rank-best { color: var(--green); font-weight: 600; }
.rank-worst { color: var(--red); }
.rank-mid { color: var(--yellow); }
.rank-neutral { color: var(--text); }
.rank-error { color: var(--muted); }

/* Winner banner */
.task-winner {
display: flex; align-items: center; gap: 10px;
padding: 12px 18px; margin-bottom: 20px;
border-radius: var(--radius);
background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
border: 1px solid rgba(34,197,94,0.2);
font-size: 14px; font-weight: 500;
}
.task-winner .trophy { font-size: 20px; }
.task-winner .winner-name { color: var(--green); font-weight: 600; }
.task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }

/* Summary cards */
.summary-section { margin-top: 32px; }
.summary-title {
font-size: 16px; font-weight: 600;
margin-bottom: 12px; color: var(--text);
}
.summary-cards {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
gap: 12px;
}
.summary-card {
padding: 16px; border-radius: var(--radius);
border: 1px solid var(--border);
background: var(--panel);
}
.summary-card .card-label {
font-size: 11px; font-weight: 600;
text-transform: uppercase; letter-spacing: 0.05em;
color: var(--muted); margin-bottom: 6px;
}
.summary-card .card-value {
font-size: 20px; font-weight: 700;
color: var(--green); font-family: var(--mono);
}
.summary-card .card-provider {
font-size: 12px; color: var(--muted); margin-top: 4px;
}

/* Errors */
.errors-section { margin-top: 24px; }
.errors-title {
font-size: 16px; font-weight: 600;
margin-bottom: 8px; color: var(--red);
cursor: pointer;
}
.errors-list {
border-radius: var(--radius);
border: 1px solid rgba(239,68,68,0.2);
background: rgba(239,68,68,0.04);
overflow: hidden;
}
.error-item {
padding: 10px 16px;
border-bottom: 1px solid rgba(239,68,68,0.1);
font-size: 13px;
}
.error-item:last-child { border-bottom: none; }
.error-provider { font-weight: 600; color: var(--text); }
.error-msg { color: var(--muted); margin-left: 8px; }
.error-count { color: var(--muted); font-size: 11px; }
.error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }

/* Footer */
.report-footer {
margin-top: 40px; padding: 20px 0;
border-top: 1px solid var(--border);
display: flex; justify-content: space-between; align-items: center;
flex-wrap: wrap; gap: 12px;
}
.footer-brand {
font-size: 13px; color: var(--muted);
}
.footer-brand a {
color: var(--accent); text-decoration: none; font-weight: 500;
}
.footer-brand a:hover { text-decoration: underline; }
.footer-cta {
display: inline-flex; align-items: center; gap: 6px;
padding: 6px 14px; border-radius: 8px;
background: var(--accent-soft);
border: 1px solid rgba(245,158,11,0.3);
color: var(--accent); font-size: 12px; font-weight: 500;
text-decoration: none;
transition: transform 120ms ease, box-shadow 120ms ease;
}
.footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }

/* Empty state */
.empty-msg {
text-align: center; color: var(--muted);
padding: 60px 20px; font-size: 16px;
}

/* Responsive */
@media (max-width: 640px) {
body { padding: 12px; }
.report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
.report-meta { text-align: left; }
.summary-cards { grid-template-columns: 1fr; }
.results-table { font-size: 12px; }
.results-table th, .results-table td { padding: 8px 10px; }
.report-footer { flex-direction: column; align-items: flex-start; }
}
</style>`;
}
|
|
3139
|
+
/**
 * Renders the report header: the brand link plus a meta block with the
 * provider count, task count, runs label and the current UTC timestamp.
 *
 * @param {string} runsLabel - Text describing the number of runs (escaped before output).
 * @param {number} providerCount - Number of providers; pluralises "provider".
 * @param {number} taskCount - Number of tasks; pluralises "task".
 * @returns {string} `<header>` markup.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  // "s" for every count except exactly 1 (so 0 reads "0 providers").
  const pluralSuffix = (count) => (count !== 1 ? "s" : "");
  // ISO timestamp trimmed to "YYYY-MM-DD HH:MM:SS", seconds precision.
  const isoNow = (/* @__PURE__ */ new Date()).toISOString();
  const stamp = `${isoNow.replace("T", " ").slice(0, 19)} UTC`;
  return [
    '<header class="report-header">',
    '<a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">',
    '<div class="brand-icon">⬡</div>',
    "<span>Agent Duelist</span>",
    "</a>",
    '<div class="report-meta">',
    `${providerCount} provider${pluralSuffix(providerCount)} ·`,
    `${taskCount} task${pluralSuffix(taskCount)} ·`,
    `${esc(runsLabel)}<br>`,
    `${esc(stamp)}`,
    "</div>",
    "</header>"
  ].join("\n");
}
|
|
3154
|
+
/**
 * Renders the task tab bar: one `<button>` per task, the first marked active.
 *
 * Each button carries its zero-based position in `data-task`, which the
 * page script reads to pick the section to show.
 *
 * @param {string[]} tasks - Task names, in display order (escaped before output).
 * @returns {string} `<nav class="task-tabs">` markup.
 */
function renderTabs(tasks) {
  const tabMarkup = tasks.map((title, position) => {
    const activeSuffix = position === 0 ? " active" : "";
    return `<button class="task-tab${activeSuffix}" data-task="${position}">${esc(title)}</button>`;
  });
  return ['<nav class="task-tabs">', tabMarkup.join("\n "), "</nav>"].join("\n");
}
|
|
3162
|
+
/**
 * Renders one task's section: heading, results table and optional winner banner.
 *
 * Columns are derived from `scorerNames`: "latency" adds a Latency column,
 * "cost" adds Cost and Tokens columns, and any other scorer adds one score
 * column labelled via `scorerLabel`. A Provider column always comes first.
 *
 * @param {string} task - Task name shown as the section heading.
 * @param {Array<object>} providerData - One record per provider. Records with
 *   `allErrors` set get a dash in every data column.
 * @param {Map} columnStats - Per-column stats, forwarded to `renderDataCell`.
 * @param {Map} medals - providerId → medal key; missing entries are treated as "none".
 * @param {string} [winnerId] - Winning provider id; no banner is rendered when falsy.
 * @param {string[]} scorerNames - Scorer names that determine the columns.
 * @param {boolean} _hasCost - Unused.
 * @param {boolean} multi - Forwarded to `renderDataCell`.
 * @param {number} index - Section position; section 0 is rendered active.
 * @returns {string} `<section class="task-section">` markup.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  const column = (label, key, isScore) => ({ label, key, isScore });
  const columns = [column("Provider", "provider", false)];
  for (const scorer of scorerNames) {
    switch (scorer) {
      case "latency":
        columns.push(column("Latency", "latency", false));
        break;
      case "cost":
        // The cost scorer contributes two columns.
        columns.push(column("Cost", "cost", false), column("Tokens", "tokens", false));
        break;
      default:
        columns.push(column(scorerLabel(scorer), scorer, true));
    }
  }

  let headerCells = "";
  for (const { key, label } of columns) {
    headerCells += `<th data-col="${esc(key)}">${esc(label)}<span class="sort-arrow"></span></th>`;
  }

  // Everything after the leading Provider column holds data.
  const dataColumns = columns.slice(1);
  const bodyRows = providerData.map((entry) => {
    const medal = medalEmoji(medals.get(entry.providerId) ?? "none");
    const providerCell = `<td>${medal ? `${medal} ` : ""}${esc(entry.providerId)}</td>`;
    const dataCells = entry.allErrors
      ? dataColumns.map(() => `<td class="rank-error">—</td>`)
      : dataColumns.map(({ key, isScore }) => renderDataCell(key, isScore, entry, columnStats, multi));
    return `<tr>${[providerCell, ...dataCells].join("")}</tr>`;
  }).join("\n");

  let winnerBanner = "";
  if (winnerId) {
    winnerBanner = [
      '<div class="task-winner">',
      '<span class="trophy">🏆</span>',
      `<span>Winner: <span class="winner-name">${esc(winnerId)}</span>`,
      `<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>`,
      "</div>"
    ].join("\n");
  }

  const activeSuffix = index === 0 ? " active" : "";
  return [
    `<section class="task-section${activeSuffix}" data-task-idx="${index}">`,
    `<h2 class="task-name">${esc(task)}</h2>`,
    '<table class="results-table">',
    `<thead><tr>${headerCells}</tr></thead>`,
    `<tbody>${bodyRows}</tbody>`,
    "</table>",
    winnerBanner,
    "</section>"
  ].join("\n");
}
|
|
3209
|
+
/**
 * Renders a single `<td>` for one provider in one column.
 *
 * - "latency" reads `pd.latencyMs` and shows it rounded, with an "ms" suffix.
 * - "cost" reads `pd.avgDetails.costUsd` and shows it via `formatCost`.
 * - "tokens" reads `pd.avgDetails.totalTokens` and shows it as-is.
 * - any other key reads `pd.avgScores[key]` and shows a percentage with a
 *   coloured bar (green ≥ 0.8, yellow ≥ 0.5, red otherwise).
 *
 * A missing value renders a dash cell. Every other cell carries the raw value
 * in `data-sort-val`. Rank colouring uses `rankClass_` only when `multi` is
 * truthy and stats exist for the column.
 *
 * @param {string} key - Column key.
 * @param {boolean} _isScore - Unused.
 * @param {object} pd - Provider record.
 * @param {Map} columnStats - Column key → stats object passed to `rankClass_`.
 * @param {boolean} multi - Whether relative ranking applies.
 * @returns {string} `<td>` markup.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const stats = columnStats.get(key);
  const missingCell = `<td class="rank-error">—</td>`;
  // Relative rank for plain metric columns; neutral when ranking does not apply.
  const metricRank = (amount) => (multi && stats ? rankClass_(amount, stats) : "rank-neutral");

  switch (key) {
    case "latency": {
      const { latencyMs } = pd;
      if (latencyMs === undefined) return missingCell;
      return `<td class="${metricRank(latencyMs)}" data-sort-val="${latencyMs}">${Math.round(latencyMs)}ms</td>`;
    }
    case "cost": {
      const { costUsd } = pd.avgDetails;
      if (costUsd === undefined) return missingCell;
      return `<td class="${metricRank(costUsd)}" data-sort-val="${costUsd}">${esc(formatCost(costUsd))}</td>`;
    }
    case "tokens": {
      const { totalTokens } = pd.avgDetails;
      if (totalTokens === undefined) return missingCell;
      return `<td class="${metricRank(totalTokens)}" data-sort-val="${totalTokens}">${totalTokens}</td>`;
    }
    default:
      break;
  }

  const score = pd.avgScores[key];
  if (score === undefined) return missingCell;
  const percent = Math.round(score * 100);
  // 0 = good, 1 = middling, 2 = poor — absolute thresholds on the raw score.
  const band = score >= 0.8 ? 0 : score >= 0.5 ? 1 : 2;
  const rank = multi && stats
    ? rankClass_(score, stats)
    : ["rank-best", "rank-mid", "rank-worst"][band];
  // The bar colour always follows the absolute band, even when the text is ranked relatively.
  const barFill = ["var(--green)", "var(--yellow)", "var(--red)"][band];
  return [
    `<td class="score-cell ${rank}" data-sort-val="${score}">`,
    `<span class="score-val">${percent}%</span>`,
    `<div class="score-bar" style="width:${percent}%;background:${barFill}"></div>`,
    "</td>"
  ].join("\n");
}
|
|
3244
|
+
/**
 * Maps a value to a rank CSS class relative to its column's best and worst.
 *
 * @param {number} value - The cell's value.
 * @param {{best?: number, worst?: number}} colStats - Column extremes.
 * @returns {"rank-neutral"|"rank-best"|"rank-worst"|"rank-mid"}
 *   "rank-neutral" when either extreme is missing or both are equal;
 *   "rank-best" / "rank-worst" on an exact match with that extreme;
 *   "rank-mid" otherwise.
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  // No meaningful ranking without two distinct extremes.
  const unrankable = best === undefined || worst === undefined || best === worst;
  if (unrankable) return "rank-neutral";
  switch (value) {
    case best:
      return "rank-best";
    case worst:
      return "rank-worst";
    default:
      return "rank-mid";
  }
}
|
|
3251
|
+
/**
 * Renders the "Summary" section as a grid of cards.
 *
 * Cards appear in this order, each only when its input qualifies:
 * correctness (when `byCorrectness` is truthy), latency (when `byLatency` is
 * truthy and its average is not Infinity), cost (when `byCost.avg` is
 * defined) and overall winner (when `overallWinner` is truthy).
 * When `multi` is truthy the metric cards use the "Most Correct" / "Fastest" /
 * "Cheapest" labels and name the provider. Otherwise they use the "Avg …"
 * labels and omit the provider line.
 *
 * @param {{id: string, avg: number}} [byCorrectness] - Top provider by correctness.
 * @param {{id: string, avg: number}} [byLatency] - Top provider by latency (ms).
 * @param {{id: string, avg?: number}} [byCost] - Top provider by cost.
 * @param {string} [overallWinner] - Overall winning provider id.
 * @param {boolean} multi - Whether several providers are being compared.
 * @returns {string} Section markup, or "" when no card qualifies.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  const providerLine = (id) => `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  const card = (label, value, footer) => [
    '<div class="summary-card">',
    `<div class="card-label">${label}</div>`,
    `<div class="card-value">${value}</div>`,
    `${footer}`,
    "</div>"
  ].join("\n");
  // Metric cards switch their label and provider line on `multi`.
  const metricCard = (leader, value, comparedLabel, soloLabel) =>
    card(multi ? comparedLabel : soloLabel, value, multi ? providerLine(leader.id) : "");

  const cards = [];
  if (byCorrectness) {
    cards.push(metricCard(byCorrectness, `${Math.round(byCorrectness.avg * 100)}%`, "Most Correct", "Avg Correctness"));
  }
  if (byLatency && byLatency.avg !== Infinity) {
    cards.push(metricCard(byLatency, `${Math.round(byLatency.avg)}ms`, "Fastest", "Avg Latency"));
  }
  if (byCost?.avg !== undefined) {
    cards.push(metricCard(byCost, esc(formatCost(byCost.avg)), "Cheapest", "Avg Cost"));
  }
  if (overallWinner) {
    cards.push(card("Overall Winner", "🏆", providerLine(overallWinner)));
  }
  if (cards.length === 0) return "";

  return [
    '<section class="summary-section">',
    '<h2 class="summary-title">Summary</h2>',
    '<div class="summary-cards">',
    cards.join("\n "),
    "</div>",
    "</section>"
  ].join("\n");
}
|
|
3295
|
+
/**
 * Renders the "Errors" section listing each deduplicated error.
 *
 * Every item shows the provider id and message, a "(×N)" suffix when the
 * error occurred more than once, and the hint when one is present. The
 * section title has an inline click handler that toggles the list's display.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   Deduplicated error records.
 * @returns {string} `<section class="errors-section">` markup.
 */
function renderErrors(errors) {
  // Inline handler: flips the list (the title's next sibling) between hidden and shown.
  const toggleList = "this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'";
  const itemMarkup = errors.map(({ providerId, error, count, hint }) => {
    const repeatBadge = count > 1 ? ` <span class="error-count">(×${count})</span>` : "";
    const hintLine = hint ? `<div class="error-hint">${esc(hint)}</div>` : "";
    return [
      '<div class="error-item">',
      `<span class="error-provider">${esc(providerId)}:</span>`,
      `<span class="error-msg">${esc(error)}</span>${repeatBadge}`,
      hintLine,
      "</div>"
    ].join("\n");
  });
  return [
    '<section class="errors-section">',
    `<h2 class="errors-title" onclick="${toggleList}">Errors</h2>`,
    '<div class="errors-list">',
    itemMarkup.join("\n"),
    "</div>",
    "</section>"
  ].join("\n");
}
|
|
3312
|
+
/**
 * Renders the static report footer: a "Powered by Agent Duelist" credit and
 * a "Star on GitHub" call-to-action, both linking to the project repository.
 *
 * @returns {string} `<footer class="report-footer">` markup.
 */
function renderFooter() {
  const lines = [
    '<footer class="report-footer">',
    '<div class="footer-brand">',
    'Powered by <a href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">Agent Duelist</a>',
    "</div>",
    '<a class="footer-cta" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">',
    "⭐ Star on GitHub",
    "</a>",
    "</footer>"
  ];
  return lines.join("\n");
}
|
|
3322
|
+
/**
 * Returns the inline `<script>` that makes the report interactive.
 *
 * The emitted script (an IIFE, written in ES5 style) does two things:
 *  - Tab switching, only when `taskCount > 1`: clicking a `.task-tab` makes
 *    it and the `.task-section` at the same position active.
 *  - Column sorting: clicking a `.results-table th` sorts that table's body
 *    rows by that column, toggling ascending/descending on each click.
 *    Cells with a `data-sort-val` attribute sort numerically; cells without
 *    one (error/missing cells) always sort after them; if neither cell has a
 *    value the rows are compared by text content.
 *
 * Fixes over the previous version:
 *  - The sort column is taken from `th.cellIndex`. The old code used the
 *    `forEach` index over every header on the page, which is only correct
 *    for the first table; clicking a header in any later table indexed past
 *    the row's cells and threw a TypeError.
 *  - The comparator now returns a consistent ordering when only one of the
 *    two cells carries `data-sort-val`.
 *  - `parseInt` is given an explicit radix, and a tab whose index has no
 *    matching section no longer throws.
 *
 * @param {number} taskCount - Number of task sections in the report.
 * @returns {string} `<script>…</script>` markup.
 */
function renderScript(taskCount) {
  const tabSwitching = taskCount > 1 ? `
  var tabs = document.querySelectorAll('.task-tab');
  var sections = document.querySelectorAll('.task-section');
  tabs.forEach(function(tab) {
    tab.addEventListener('click', function() {
      var idx = parseInt(tab.getAttribute('data-task'), 10);
      tabs.forEach(function(t) { t.classList.remove('active'); });
      sections.forEach(function(s) { s.classList.remove('active'); });
      tab.classList.add('active');
      if (sections[idx]) { sections[idx].classList.add('active'); }
    });
  });` : "";
  return `<script>
(function() {
  /* Tab switching */
  ${tabSwitching}

  /* Column sorting */
  document.querySelectorAll('.results-table th').forEach(function(th) {
    var table = th.closest('table');
    var asc = true;
    th.addEventListener('click', function() {
      /* Column position inside this header row, not across all tables */
      var colIdx = th.cellIndex;
      var tbody = table.querySelector('tbody');
      var rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort(function(a, b) {
        var aCell = a.children[colIdx];
        var bCell = b.children[colIdx];
        if (!aCell || !bCell) { return 0; }
        var aVal = aCell.getAttribute('data-sort-val');
        var bVal = bCell.getAttribute('data-sort-val');
        if (aVal !== null && bVal !== null) {
          return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
        }
        /* Cells without a numeric value stay at the bottom in both directions */
        if (aVal !== null) { return -1; }
        if (bVal !== null) { return 1; }
        var aText = aCell.textContent || '';
        var bText = bCell.textContent || '';
        return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
      });
      rows.forEach(function(row) { tbody.appendChild(row); });

      /* Update sort arrows */
      table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
      th.querySelector('.sort-arrow').textContent = asc ? ' \\u25B2' : ' \\u25BC';
      asc = !asc;
    });
  });
})();
</script>`;
}
|
|
3369
|
+
|
|
3370
|
+
// src/packs/structured-output.ts
|
|
3371
|
+
var import_zod = require("zod");
|
|
3372
|
+
/**
 * Built-in "structured-output" task pack.
 *
 * Each task pairs a natural-language extraction prompt with the exact
 * `expected` JSON value and a zod `schema` describing its shape. The pack
 * requests four scorers: correctness, schema-correctness, latency and cost.
 */
var structuredOutputPack = {
  name: "structured-output",
  label: "Structured Output",
  description: "Zod schema stress test \u2014 flat objects, nesting, arrays, enums, empty arrays, and adversarial input",
  tasks: [
    // Flat object: six scalar fields pulled from a single sentence.
    {
      name: "so:flat-entity",
      prompt: "Extract the person's details from this text: 'Maria Garcia, age 34, works as a software architect in Barcelona, Spain. Her employee ID is EMP-2847.' Return as JSON.",
      expected: {
        name: "Maria Garcia",
        age: 34,
        role: "software architect",
        city: "Barcelona",
        country: "Spain",
        employeeId: "EMP-2847"
      },
      schema: import_zod.z.object({
        name: import_zod.z.string(),
        age: import_zod.z.number(),
        role: import_zod.z.string(),
        city: import_zod.z.string(),
        country: import_zod.z.string(),
        employeeId: import_zod.z.string()
      })
    },
    // Nested objects: recipient / address / order, with an enum for shippingMethod.
    {
      name: "so:nested-object",
      prompt: "Parse this shipping label into structured JSON: 'Ship to: Acme Corp, Attn: John Lee, 4th Floor, 742 Evergreen Terrace, Springfield, IL 62704, USA. Order #ORD-9912, 3 items, 2.4kg, express shipping.' Use shippingMethod values: standard, express, or overnight. Return as JSON.",
      expected: {
        recipient: { company: "Acme Corp", contact: "John Lee", floor: "4th Floor" },
        address: { street: "742 Evergreen Terrace", city: "Springfield", state: "IL", zip: "62704", country: "USA" },
        order: { id: "ORD-9912", itemCount: 3, weightKg: 2.4, shippingMethod: "express" }
      },
      schema: import_zod.z.object({
        recipient: import_zod.z.object({ company: import_zod.z.string(), contact: import_zod.z.string(), floor: import_zod.z.string() }),
        address: import_zod.z.object({
          street: import_zod.z.string(),
          city: import_zod.z.string(),
          state: import_zod.z.string(),
          zip: import_zod.z.string(),
          country: import_zod.z.string()
        }),
        order: import_zod.z.object({
          id: import_zod.z.string(),
          itemCount: import_zod.z.number(),
          weightKg: import_zod.z.number(),
          shippingMethod: import_zod.z.enum(["standard", "express", "overnight"])
        })
      })
    },
    // Top-level array of objects: four products in order of mention.
    {
      name: "so:array-of-objects",
      prompt: "Extract all mentioned products with their prices and categories from this text: 'Our summer sale includes the UltraWidget Pro ($49.99, Electronics), ComfortMax Chair ($199.00, Furniture), and AquaPure Filter ($24.50, Home & Kitchen). The SmartLamp Mini is also available at $34.99 in the Electronics category.' Return as a JSON array.",
      expected: [
        { name: "UltraWidget Pro", price: 49.99, category: "Electronics" },
        { name: "ComfortMax Chair", price: 199, category: "Furniture" },
        { name: "AquaPure Filter", price: 24.5, category: "Home & Kitchen" },
        { name: "SmartLamp Mini", price: 34.99, category: "Electronics" }
      ],
      schema: import_zod.z.array(import_zod.z.object({ name: import_zod.z.string(), price: import_zod.z.number(), category: import_zod.z.string() }))
    },
    // Empty arrays: the log contains no errors or warnings, so both lists must be [].
    {
      name: "so:empty-arrays",
      prompt: "Extract all error codes and their severity levels from this log message: 'System health check completed at 14:32 UTC. All services operational. No warnings or errors detected. Uptime: 99.97%.' Classify status as one of: healthy, degraded, or down. Return as JSON.",
      expected: { errors: [], warnings: [], status: "healthy", uptimePercent: 99.97 },
      schema: import_zod.z.object({
        errors: import_zod.z.array(import_zod.z.object({ code: import_zod.z.string(), severity: import_zod.z.string() })),
        warnings: import_zod.z.array(import_zod.z.string()),
        status: import_zod.z.enum(["healthy", "degraded", "down"]),
        uptimePercent: import_zod.z.number()
      })
    },
    // Enum classification: each ticket gets a priority and a category from fixed sets.
    {
      name: "so:enum-classification",
      prompt: "Classify each of these support tickets by priority (low/medium/high/critical) and category (billing/technical/account/general). Use just the letter (A, B, C, D) as the id.\nTicket A: 'My account was charged twice for the same subscription.'\nTicket B: 'The API returns 500 errors intermittently.'\nTicket C: 'How do I update my display name?'\nTicket D: 'Production database is completely unresponsive, all services down.'\nReturn as a JSON array.",
      expected: [
        { id: "A", priority: "high", category: "billing" },
        { id: "B", priority: "high", category: "technical" },
        { id: "C", priority: "low", category: "account" },
        { id: "D", priority: "critical", category: "technical" }
      ],
      schema: import_zod.z.array(
        import_zod.z.object({
          id: import_zod.z.string(),
          priority: import_zod.z.enum(["low", "medium", "high", "critical"]),
          category: import_zod.z.enum(["billing", "technical", "account", "general"])
        })
      )
    },
    // Adversarial input: the review text embeds JSON-like fragments that must be ignored,
    // and the date must be converted from MM/DD/YYYY to ISO 8601.
    {
      name: "so:adversarial-input",
      prompt: `Extract the actual product review data from this messy input. Ignore any JSON-like noise in the text.

User said: 'I bought the {product: "fake"} headphones for $59.99 and they're great! Rating: 5/5. The "noise-cancelling" feature works well even in {"noisy": true} environments. Would recommend to friend=true. Purchased on 01/15/2026.'
Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
      expected: {
        product: "headphones",
        price: 59.99,
        rating: 5,
        maxRating: 5,
        features: ["noise-cancelling"],
        recommended: true,
        purchaseDate: "2026-01-15"
      },
      schema: import_zod.z.object({
        product: import_zod.z.string(),
        price: import_zod.z.number(),
        rating: import_zod.z.number(),
        maxRating: import_zod.z.number(),
        features: import_zod.z.array(import_zod.z.string()),
        recommended: import_zod.z.boolean(),
        purchaseDate: import_zod.z.string()
      })
    }
  ],
  // Scorers this pack asks to be run against every task.
  scorers: ["correctness", "schema-correctness", "latency", "cost"]
};
|
|
3489
|
+
|
|
3490
|
+
// src/packs/index.ts
|
|
3491
|
+
// Pack name → pack definition, for every built-in pack.
var registry = /* @__PURE__ */ new Map();
/**
 * Stores a pack in the registry under its own `name`.
 * A later registration with the same name replaces the earlier one.
 *
 * @param {{name: string}} pack - Pack definition to register.
 */
function register(pack) {
  const { name } = pack;
  registry.set(name, pack);
}
register(structuredOutputPack);
|
|
3496
|
+
/**
 * Looks up a registered pack by name.
 *
 * @param {string} name - Pack name, e.g. "structured-output".
 * @returns {object} The registered pack definition.
 * @throws {Error} When no pack has that name; the message lists the
 *   registered pack names, comma-separated.
 */
function loadPack(name) {
  const found = registry.get(name);
  if (found) {
    return found;
  }
  const knownNames = Array.from(registry.keys());
  throw new Error(`Unknown pack "${name}". Available packs: ${knownNames.join(", ")}`);
}
|
|
3504
|
+
/**
 * Summarises every registered pack, in registration order.
 *
 * @returns {Array<{name: string, label: string, description: string, taskCount: number}>}
 *   One summary per pack; `taskCount` is the length of the pack's `tasks` array.
 */
function listPacks() {
  const summaries = [];
  for (const pack of registry.values()) {
    summaries.push({
      name: pack.name,
      label: pack.label,
      description: pack.description,
      taskCount: pack.tasks.length
    });
  }
  return summaries;
}
|
|
3512
|
+
|
|
2568
3513
|
// src/ci.ts
|
|
2569
3514
|
var import_node_fs = require("fs");
|
|
2570
3515
|
var import_node_path = require("path");
|
|
@@ -2586,10 +3531,11 @@ var T_CRITICAL_95 = {
|
|
|
2586
3531
|
25: 2.06,
|
|
2587
3532
|
30: 2.042
|
|
2588
3533
|
};
|
|
3534
|
+
// Keys of T_CRITICAL_95 converted to numbers and sorted ascending, built once
// at module load; tCritical reads this list as its `keys`.
var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
|
|
2589
3535
|
function tCritical(df) {
|
|
2590
3536
|
if (df <= 0) return 1.96;
|
|
2591
3537
|
if (T_CRITICAL_95[df] !== void 0) return T_CRITICAL_95[df];
|
|
2592
|
-
const keys =
|
|
3538
|
+
const keys = T_CRITICAL_KEYS;
|
|
2593
3539
|
if (df > keys[keys.length - 1]) return 1.96;
|
|
2594
3540
|
for (let i = 0; i < keys.length - 1; i++) {
|
|
2595
3541
|
if (df > keys[i] && df < keys[i + 1]) {
|
|
@@ -2699,7 +3645,7 @@ function compareResults(baselineStats, currentStats, thresholds, budget, current
|
|
|
2699
3645
|
if (regressions.length > 0) {
|
|
2700
3646
|
for (const r of regressions) {
|
|
2701
3647
|
failureReasons.push(
|
|
2702
|
-
`${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${
|
|
3648
|
+
`${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta(r.delta)}`
|
|
2703
3649
|
);
|
|
2704
3650
|
}
|
|
2705
3651
|
}
|
|
@@ -2734,10 +3680,6 @@ function detectImprovement(baseline, current, threshold, lowerIsBetter) {
|
|
|
2734
3680
|
}
|
|
2735
3681
|
return current.ci95Lower - baseline.ci95Upper > threshold;
|
|
2736
3682
|
}
|
|
2737
|
-
/**
 * Formats a numeric delta with four decimal places and an explicit sign.
 * Non-negative values get a leading "+"; negative values keep the "-" that
 * `toFixed` already produces.
 * NOTE(review): these lines are on the removed (`-`) side of this diff; the added
 * line in compareResults calls `formatDelta` instead — confirm that helper is equivalent.
 *
 * @param delta - Number to format.
 * @returns The signed, fixed-precision string.
 */
function formatDelta2(delta) {
|
|
2738
|
-
const sign = delta >= 0 ? "+" : "";
|
|
2739
|
-
return `${sign}${delta.toFixed(4)}`;
|
|
2740
|
-
}
|
|
2741
3683
|
function loadBaseline(path) {
|
|
2742
3684
|
try {
|
|
2743
3685
|
const raw = (0, import_node_fs.readFileSync)(path, "utf-8");
|
|
@@ -2794,18 +3736,20 @@ function detectGitHubContext() {
|
|
|
2794
3736
|
return { token, owner, repo, prNumber };
|
|
2795
3737
|
}
|
|
2796
3738
|
var API_BASE = "https://api.github.com";
|
|
3739
|
+
/**
 * Builds the request-header object used for the GitHub API calls in this file.
 *
 * @param token - Value placed after "Bearer " in the Authorization header.
 * @param extra - Optional object of additional headers. It is spread last, so its
 *   keys are added to the defaults and override any default with the same name.
 *   When omitted, spreading `undefined` contributes nothing.
 * @returns A new plain object containing Authorization, Accept,
 *   X-GitHub-Api-Version, and any headers from `extra`.
 */
function ghHeaders(token, extra) {
|
|
3740
|
+
return {
|
|
3741
|
+
Authorization: `Bearer ${token}`,
|
|
3742
|
+
Accept: "application/vnd.github+json",
|
|
3743
|
+
"X-GitHub-Api-Version": "2022-11-28",
|
|
3744
|
+
...extra
|
|
3745
|
+
};
|
|
3746
|
+
}
|
|
2797
3747
|
async function findExistingComment(ctx, marker) {
|
|
2798
3748
|
let page = 1;
|
|
2799
3749
|
const perPage = 50;
|
|
2800
3750
|
while (true) {
|
|
2801
3751
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
|
|
2802
|
-
const res = await fetch(url, {
|
|
2803
|
-
headers: {
|
|
2804
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2805
|
-
Accept: "application/vnd.github+json",
|
|
2806
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2807
|
-
}
|
|
2808
|
-
});
|
|
3752
|
+
const res = await fetch(url, { headers: ghHeaders(ctx.token) });
|
|
2809
3753
|
if (!res.ok) return null;
|
|
2810
3754
|
const comments = await res.json();
|
|
2811
3755
|
if (comments.length === 0) break;
|
|
@@ -2825,12 +3769,7 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
2825
3769
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/comments/${existingId}`;
|
|
2826
3770
|
const res = await fetch(url, {
|
|
2827
3771
|
method: "PATCH",
|
|
2828
|
-
headers: {
|
|
2829
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2830
|
-
Accept: "application/vnd.github+json",
|
|
2831
|
-
"Content-Type": "application/json",
|
|
2832
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2833
|
-
},
|
|
3772
|
+
headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
|
|
2834
3773
|
body: JSON.stringify({ body })
|
|
2835
3774
|
});
|
|
2836
3775
|
if (!res.ok) {
|
|
@@ -2841,12 +3780,7 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
2841
3780
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;
|
|
2842
3781
|
const res = await fetch(url, {
|
|
2843
3782
|
method: "POST",
|
|
2844
|
-
headers: {
|
|
2845
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2846
|
-
Accept: "application/vnd.github+json",
|
|
2847
|
-
"Content-Type": "application/json",
|
|
2848
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2849
|
-
},
|
|
3783
|
+
headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
|
|
2850
3784
|
body: JSON.stringify({ body })
|
|
2851
3785
|
});
|
|
2852
3786
|
if (!res.ok) {
|
|
@@ -2865,8 +3799,11 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
2865
3799
|
defineArena,
|
|
2866
3800
|
detectGitHubContext,
|
|
2867
3801
|
gemini,
|
|
3802
|
+
htmlReporter,
|
|
2868
3803
|
jsonReporter,
|
|
3804
|
+
listPacks,
|
|
2869
3805
|
loadBaseline,
|
|
3806
|
+
loadPack,
|
|
2870
3807
|
markdownReporter,
|
|
2871
3808
|
openai,
|
|
2872
3809
|
openaiCompatible,
|