agent-duelist 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +246 -142
- package/dist/cli.js +2004 -62
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +334 -105
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +28 -3
- package/dist/index.d.ts +28 -3
- package/dist/index.js +332 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -40,7 +40,9 @@ __export(index_exports, {
|
|
|
40
40
|
gemini: () => gemini,
|
|
41
41
|
htmlReporter: () => htmlReporter,
|
|
42
42
|
jsonReporter: () => jsonReporter,
|
|
43
|
+
listPacks: () => listPacks,
|
|
43
44
|
loadBaseline: () => loadBaseline,
|
|
45
|
+
loadPack: () => loadPack,
|
|
44
46
|
markdownReporter: () => markdownReporter,
|
|
45
47
|
openai: () => openai,
|
|
46
48
|
openaiCompatible: () => openaiCompatible,
|
|
@@ -1401,33 +1403,42 @@ var correctnessScorer = ({ task, result }) => {
|
|
|
1401
1403
|
if (task.expected === void 0) {
|
|
1402
1404
|
return { name: "correctness", value: 0.5, details: { reason: "no expected value" } };
|
|
1403
1405
|
}
|
|
1404
|
-
const
|
|
1406
|
+
const actual = normalizeOutput(task.expected, result.output);
|
|
1407
|
+
const match = deepEqual(task.expected, actual);
|
|
1405
1408
|
return {
|
|
1406
1409
|
name: "correctness",
|
|
1407
1410
|
value: match ? 1 : 0,
|
|
1408
1411
|
details: { expected: task.expected, actual: result.output }
|
|
1409
1412
|
};
|
|
1410
1413
|
};
|
|
1411
|
-
function
|
|
1412
|
-
if (
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
if (
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1414
|
+
/**
 * Unwraps a response that nests the requested array under a single key.
 *
 * When an array is expected but the actual output is a non-array object that
 * has exactly one array-valued property, that array is returned in its place.
 * In every other situation `actual` is returned untouched.
 *
 * @param {unknown} expected - The task's expected value.
 * @param {unknown} actual - The provider's output.
 * @returns {unknown} The unwrapped array, or `actual` unchanged.
 */
function normalizeOutput(expected, actual) {
  const isWrapperObject = actual !== null && typeof actual === "object" && !Array.isArray(actual);
  if (!Array.isArray(expected) || !isWrapperObject) {
    return actual;
  }
  const arrayValues = Object.values(actual).filter((value) => Array.isArray(value));
  // Only unwrap when the choice is unambiguous.
  return arrayValues.length === 1 ? arrayValues[0] : actual;
}
|
|
1424
|
+
/**
 * Lenient structural comparison used by the correctness scorer.
 *
 * Rules:
 * - identical values (`===`) are equal;
 * - two strings are compared after trimming and lower-casing;
 * - arrays must have the same length and pairwise-equal elements;
 * - for plain objects only the keys present in `expected` are checked, so
 *   extra keys in `actual` are tolerated;
 * - an array never equals a non-array object (and vice versa).
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value under test.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Identical values were handled above, so a remaining null or primitive
  // can no longer match anything.
  if (expected === null || actual === null) return false;
  if (typeof expected !== "object") return false;

  const expectedIsArray = Array.isArray(expected);
  // Without this guard an array/object pair would fall through to the key
  // comparison below, where `[]` matches every object and `{}` every array.
  if (expectedIsArray !== Array.isArray(actual)) return false;

  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((val, i) => deepEqual(val, actual[i]));
  }
  return Object.keys(expected).every(
    (key) => key in actual && deepEqual(expected[key], actual[key])
  );
}
|
|
1432
1443
|
|
|
1433
1444
|
// src/scorers/schema-correctness.ts
|
|
@@ -1447,7 +1458,14 @@ var schemaCorrectnessScorer = ({ task, result }) => {
|
|
|
1447
1458
|
};
|
|
1448
1459
|
}
|
|
1449
1460
|
}
|
|
1450
|
-
|
|
1461
|
+
let parsed = task.schema.safeParse(data);
|
|
1462
|
+
if (!parsed.success && !Array.isArray(data) && typeof data === "object" && data !== null) {
|
|
1463
|
+
const arrayEntries = Object.entries(data).filter(([, v]) => Array.isArray(v));
|
|
1464
|
+
if (arrayEntries.length === 1) {
|
|
1465
|
+
const unwrapped = task.schema.safeParse(arrayEntries[0][1]);
|
|
1466
|
+
if (unwrapped.success) parsed = unwrapped;
|
|
1467
|
+
}
|
|
1468
|
+
}
|
|
1451
1469
|
return {
|
|
1452
1470
|
name: "schema-correctness",
|
|
1453
1471
|
value: parsed.success ? 1 : 0,
|
|
@@ -1493,18 +1511,36 @@ var import_openai2 = __toESM(require("openai"), 1);
|
|
|
1493
1511
|
|
|
1494
1512
|
// src/providers/openai.ts
|
|
1495
1513
|
var import_openai = __toESM(require("openai"), 1);
|
|
1496
|
-
var
|
|
1514
|
+
var import_zod_to_json_schema2 = require("zod-to-json-schema");
|
|
1497
1515
|
|
|
1498
1516
|
// src/providers/shared.ts
|
|
1499
|
-
var
|
|
1517
|
+
var import_zod_to_json_schema = require("zod-to-json-schema");
|
|
1518
|
+
/**
 * Builds the system prompt that instructs a model to answer in JSON.
 *
 * Without a schema a short generic instruction is returned. With a schema,
 * the schema is converted through `zodToJsonSchema` (target "openAi"),
 * pretty-printed, and embedded in a longer instruction block.
 *
 * @param {unknown} schema - Schema for the expected output, or a falsy value.
 * @returns {string} The system message text.
 */
function buildSchemaSystemMessage(schema) {
  if (!schema) {
    return "Respond with valid JSON.";
  }
  const { zodToJsonSchema } = import_zod_to_json_schema;
  const schemaText = JSON.stringify(zodToJsonSchema(schema, { target: "openAi" }), null, 2);
  const header = [
    "Respond with ONLY valid JSON data. No markdown, no code fences, no explanation.",
    "",
    "Your output must conform to this JSON Schema:"
  ];
  const footer = [
    "",
    "IMPORTANT: Output the actual data values, NOT the schema definition itself.",
    'Do NOT include keys like "type", "$schema", or "items" from the schema definition in your response.'
  ];
  return [...header, schemaText, ...footer].join("\n");
}
|
|
1500
1531
|
/**
 * Converts a provider's raw text into structured output when a schema applies.
 *
 * With no schema the raw text is returned as-is. Otherwise any surrounding
 * code fence is stripped and the remainder is parsed as JSON; if parsing
 * fails the original raw text is returned so scorers can still inspect it.
 *
 * @param {string} rawContent - Text returned by the model.
 * @param {boolean} hasSchema - Whether the task declared an output schema.
 * @returns {unknown} Parsed JSON value, or `rawContent` when not parseable.
 */
function parseSchemaOutput(rawContent, hasSchema) {
  if (hasSchema) {
    const jsonText = stripCodeFences(rawContent);
    try {
      return JSON.parse(jsonText);
    } catch {
      // Not valid JSON: fall through and hand back the untouched text.
    }
  }
  return rawContent;
}
|
|
1540
|
+
/**
 * Extracts the body of a Markdown code fence from a model reply.
 *
 * The fence may carry an optional `json` language tag in any letter case,
 * may be indented, and may be followed by trailing spaces, tabs or a
 * carriage return (CRLF replies). If no fence is found the content is
 * returned unchanged.
 *
 * @param {string} content - Raw model output.
 * @returns {string} The fenced payload, or `content` when there is no fence.
 */
function stripCodeFences(content) {
  // `m` lets the fence sit on its own lines inside surrounding prose;
  // `i` accepts ```JSON as well as ```json.
  const fence = /^[ \t]*```(?:json)?\s*\n?([\s\S]*?)\n?\s*```[ \t\r]*$/im;
  const match = content.match(fence);
  return match ? match[1] : content;
}
|
|
1508
1544
|
|
|
1509
1545
|
// src/providers/openai.ts
|
|
1510
1546
|
var REQUEST_TIMEOUT_MS = 6e4;
|
|
@@ -1553,7 +1589,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1553
1589
|
if (input.schema) {
|
|
1554
1590
|
params.response_format = { type: "json_object" };
|
|
1555
1591
|
params.messages = [
|
|
1556
|
-
{ role: "system", content:
|
|
1592
|
+
{ role: "system", content: buildSchemaSystemMessage(input.schema) },
|
|
1557
1593
|
...params.messages
|
|
1558
1594
|
];
|
|
1559
1595
|
}
|
|
@@ -1561,7 +1597,9 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1561
1597
|
params.tools = input.tools.map(toolDefToOpenAI);
|
|
1562
1598
|
params.tool_choice = "auto";
|
|
1563
1599
|
}
|
|
1564
|
-
const
|
|
1600
|
+
const reqOpts = { signal: input.signal };
|
|
1601
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
1602
|
+
const response = await client.chat.completions.create(params, reqOpts);
|
|
1565
1603
|
let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
|
|
1566
1604
|
let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
|
|
1567
1605
|
const choice = response.choices[0];
|
|
@@ -1595,7 +1633,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1595
1633
|
const followUp = await client.chat.completions.create({
|
|
1596
1634
|
model: requestModel,
|
|
1597
1635
|
messages: toolMessages
|
|
1598
|
-
},
|
|
1636
|
+
}, reqOpts);
|
|
1599
1637
|
totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
|
|
1600
1638
|
totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
|
|
1601
1639
|
finalResponse = followUp;
|
|
@@ -1640,7 +1678,7 @@ function toolDefToOpenAI(tool) {
|
|
|
1640
1678
|
function: {
|
|
1641
1679
|
name: tool.name,
|
|
1642
1680
|
description: tool.description,
|
|
1643
|
-
parameters: (0,
|
|
1681
|
+
parameters: (0, import_zod_to_json_schema2.zodToJsonSchema)(tool.parameters, { target: "openAi" })
|
|
1644
1682
|
}
|
|
1645
1683
|
};
|
|
1646
1684
|
}
|
|
@@ -1689,8 +1727,14 @@ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1689
1727
|
if (!apiKey) return void 0;
|
|
1690
1728
|
return { client: new import_openai2.default({ apiKey, timeout: timeoutMs }), model };
|
|
1691
1729
|
}
|
|
1730
|
+
/**
 * Heuristically detects an API error caused by sending `temperature` to a
 * model that rejects it.
 *
 * The error text (the `message` of an Error, otherwise the stringified value)
 * must mention "temperature" together with one of the known rejection phrases.
 *
 * @param {unknown} err - Value thrown by the API client.
 * @returns {boolean} True when the error looks temperature-related.
 */
function isTemperatureError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  if (!text.includes("temperature")) {
    return false;
  }
  const rejectionPhrases = ["not supported", "is not allowed", "unsupported", "invalid"];
  return rejectionPhrases.some((phrase) => text.includes(phrase));
}
|
|
1692
1735
|
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1693
1736
|
let cached = void 0;
|
|
1737
|
+
let useTemperature = true;
|
|
1694
1738
|
return async ({ task, result }) => {
|
|
1695
1739
|
if (task.expected === void 0) {
|
|
1696
1740
|
return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
|
|
@@ -1707,35 +1751,24 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1707
1751
|
}
|
|
1708
1752
|
const { client, model } = cached;
|
|
1709
1753
|
const prompt = JUDGE_PROMPT.replace("{task}", task.prompt).replace("{expected}", JSON.stringify(task.expected)).replace("{actual}", JSON.stringify(result.output));
|
|
1754
|
+
const messages = [{ role: "user", content: prompt }];
|
|
1710
1755
|
try {
|
|
1711
|
-
const response = await client
|
|
1712
|
-
|
|
1713
|
-
messages: [{ role: "user", content: prompt }],
|
|
1714
|
-
max_completion_tokens: 2048
|
|
1715
|
-
});
|
|
1716
|
-
const content = response.choices[0]?.message?.content?.trim() ?? "";
|
|
1717
|
-
const parsed = {};
|
|
1718
|
-
for (const line of content.split("\n")) {
|
|
1719
|
-
const match = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
|
|
1720
|
-
if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
|
|
1721
|
-
}
|
|
1722
|
-
const accuracy = parsed.accuracy;
|
|
1723
|
-
const completeness = parsed.completeness;
|
|
1724
|
-
const conciseness = parsed.conciseness;
|
|
1725
|
-
if (accuracy == null || completeness == null || conciseness == null || [accuracy, completeness, conciseness].some((s) => isNaN(s) || s < 0 || s > 1)) {
|
|
1726
|
-
return {
|
|
1727
|
-
name: "llm-judge-correctness",
|
|
1728
|
-
value: -1,
|
|
1729
|
-
details: { reason: `judge returned unparseable scores: "${content}"`, model }
|
|
1730
|
-
};
|
|
1731
|
-
}
|
|
1732
|
-
const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
|
|
1733
|
-
return {
|
|
1734
|
-
name: "llm-judge-correctness",
|
|
1735
|
-
value: composite,
|
|
1736
|
-
details: { model, accuracy, completeness, conciseness }
|
|
1737
|
-
};
|
|
1756
|
+
const response = await callJudge(client, model, messages, useTemperature);
|
|
1757
|
+
return parseJudgeResponse(response, model);
|
|
1738
1758
|
} catch (err) {
|
|
1759
|
+
if (useTemperature && isTemperatureError(err)) {
|
|
1760
|
+
useTemperature = false;
|
|
1761
|
+
try {
|
|
1762
|
+
const response = await callJudge(client, model, messages, false);
|
|
1763
|
+
return parseJudgeResponse(response, model);
|
|
1764
|
+
} catch (retryErr) {
|
|
1765
|
+
return {
|
|
1766
|
+
name: "llm-judge-correctness",
|
|
1767
|
+
value: -1,
|
|
1768
|
+
details: { reason: `judge call failed: ${retryErr instanceof Error ? retryErr.message : String(retryErr)}` }
|
|
1769
|
+
};
|
|
1770
|
+
}
|
|
1771
|
+
}
|
|
1739
1772
|
return {
|
|
1740
1773
|
name: "llm-judge-correctness",
|
|
1741
1774
|
value: -1,
|
|
@@ -1744,6 +1777,38 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1744
1777
|
}
|
|
1745
1778
|
};
|
|
1746
1779
|
}
|
|
1780
|
+
/**
 * Sends one chat-completion request to the judge model.
 *
 * @param {object} client - Client exposing `chat.completions.create`.
 * @param {string} model - Judge model identifier.
 * @param {Array<object>} messages - Chat messages to send.
 * @param {boolean} withTemperature - When truthy, `temperature: 0` is added
 *   to the request; otherwise the field is omitted entirely.
 * @returns {Promise<unknown>} Whatever the client's `create` call resolves to.
 */
async function callJudge(client, model, messages, withTemperature) {
  const request = { model, messages, max_completion_tokens: 2048 };
  if (withTemperature) {
    request.temperature = 0;
  }
  return client.chat.completions.create(request);
}
|
|
1788
|
+
/**
 * Converts a judge completion into an "llm-judge-correctness" score.
 *
 * The first choice's text is scanned line by line for `accuracy`,
 * `completeness` and `conciseness` followed by a colon and a number. Lines
 * may be indented, bulleted (`-` / `*`) or Markdown-bolded. All three
 * scores must be present and lie in [0, 1]; otherwise the result carries
 * `value: -1` and the raw text in `details.reason`.
 *
 * @param {object} response - Chat-completion response from the judge.
 * @param {string} model - Judge model identifier, echoed into `details`.
 * @returns {{name: string, value: number, details: object}} Scorer result;
 *   `value` is the mean of the three scores rounded to two decimals.
 */
function parseJudgeResponse(response, model) {
  const content = response.choices?.[0]?.message?.content?.trim() ?? "";
  // Judges often decorate their answer ("- **Accuracy**: 0.9"); accept
  // leading whitespace, a list bullet and bold markers around the label.
  const scoreLine = /^\s*(?:[-*]\s+)?(?:\*\*|__)?(accuracy|completeness|conciseness)(?:\*\*|__)?\s*:\s*(?:\*\*|__)?\s*([\d.]+)/i;
  const parsed = {};
  for (const line of content.split("\n")) {
    const match = line.match(scoreLine);
    if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
  }
  const { accuracy, completeness, conciseness } = parsed;
  const scores = [accuracy, completeness, conciseness];
  const invalid = scores.some((s) => s == null || Number.isNaN(s) || s < 0 || s > 1);
  if (invalid) {
    return {
      name: "llm-judge-correctness",
      value: -1,
      details: { reason: `judge returned unparseable scores: "${content}"`, model }
    };
  }
  const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
  return {
    name: "llm-judge-correctness",
    value: composite,
    details: { model, accuracy, completeness, conciseness }
  };
}
|
|
1747
1812
|
|
|
1748
1813
|
// src/scorers/tool-usage.ts
|
|
1749
1814
|
var toolUsageScorer = ({ task, result }) => {
|
|
@@ -1816,7 +1881,8 @@ async function runBenchmarks(options) {
|
|
|
1816
1881
|
prompt: task.prompt,
|
|
1817
1882
|
schema: task.schema,
|
|
1818
1883
|
tools: task.tools,
|
|
1819
|
-
signal
|
|
1884
|
+
signal,
|
|
1885
|
+
timeout
|
|
1820
1886
|
}), timeout);
|
|
1821
1887
|
const scores = await Promise.all(
|
|
1822
1888
|
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
@@ -2025,37 +2091,76 @@ function computeColumnStats(providerData, scorerNames) {
|
|
|
2025
2091
|
}
|
|
2026
2092
|
return stats;
|
|
2027
2093
|
}
|
|
2094
|
+
// Scorer columns that measure answer quality; every other column is treated
// as an efficiency metric by the medal computation.
var QUALITY_SCORERS = /* @__PURE__ */ new Set([
  "correctness",
  "schema-correctness",
  "fuzzy-similarity",
  "llm-judge-correctness",
  "tool-usage"
]);

/**
 * Decides whether a provider is eligible for medals.
 *
 * A provider passes when it has a value greater than zero in at least one
 * quality column. If the stats contain no quality column at all, every
 * provider passes.
 *
 * @param {string} providerId - Provider to check.
 * @param {Map<string, {values: Map<string, number|undefined>}>} columnStats -
 *   Per-column statistics keyed by scorer name.
 * @returns {boolean} True when the provider clears the gate.
 */
function passesQualityGate(providerId, columnStats) {
  let sawQualityColumn = false;
  for (const [column, stats] of columnStats) {
    if (!QUALITY_SCORERS.has(column)) continue;
    sawQualityColumn = true;
    const score = stats?.values.get(providerId);
    if (score !== void 0 && score > 0) {
      return true;
    }
  }
  return !sawQualityColumn;
}
|
|
2028
2109
|
/**
 * Assigns "gold" / "silver" / "bronze" / "none" to each provider.
 *
 * - With fewer than two providers, or when no column has a unique best
 *   value, everyone gets "none".
 * - A column counts as a win only when exactly one provider holds its best
 *   value; wins are tallied separately for quality and efficiency columns.
 * - Only providers that pass the quality gate are ranked. They are ordered
 *   by quality wins, then efficiency wins, then id; providers tied on both
 *   counts share a medal.
 * - Providers that fail the gate get "none".
 *
 * @param {Map<string, {best: number|undefined, values: Map<string, number|undefined>}>} columnStats
 *   Per-column statistics keyed by scorer name.
 * @param {string[]} providerIds - Providers taking part in the comparison.
 * @returns {Map<string, string>} Medal per provider id.
 */
function computeMedals(columnStats, providerIds) {
  const medals = /* @__PURE__ */ new Map();
  const awardNoneToAll = () => {
    for (const id of providerIds) medals.set(id, "none");
    return medals;
  };
  if (providerIds.length < 2) {
    return awardNoneToAll();
  }

  const eligible = new Set(providerIds.filter((id) => passesQualityGate(id, columnStats)));
  const qualityWins = new Map(providerIds.map((id) => [id, 0]));
  const efficiencyWins = new Map(providerIds.map((id) => [id, 0]));

  let totalWins = 0;
  for (const [column, stats] of columnStats) {
    if (stats.best === void 0) continue;
    const leaders = [];
    for (const [id, value] of stats.values) {
      if (value !== void 0 && value === stats.best) leaders.push(id);
    }
    // A shared best value is not a win for anybody.
    if (leaders.length !== 1) continue;
    const tally = QUALITY_SCORERS.has(column) ? qualityWins : efficiencyWins;
    tally.set(leaders[0], (tally.get(leaders[0]) ?? 0) + 1);
    totalWins += 1;
  }
  if (totalWins === 0) {
    return awardNoneToAll();
  }

  const winsOf = (tally, id) => tally.get(id) ?? 0;
  const ranked = providerIds
    .filter((id) => eligible.has(id))
    .sort(
      (a, b) =>
        winsOf(qualityWins, b) - winsOf(qualityWins, a) ||
        winsOf(efficiencyWins, b) - winsOf(efficiencyWins, a) ||
        a.localeCompare(b)
    );

  const medalList = ["gold", "silver", "bronze"];
  let rank = 0;
  ranked.forEach((id, position) => {
    if (position > 0) {
      const previous = ranked[position - 1];
      const qualityNow = winsOf(qualityWins, id);
      const qualityBefore = winsOf(qualityWins, previous);
      const fellBehind =
        qualityNow < qualityBefore ||
        (qualityNow === qualityBefore &&
          winsOf(efficiencyWins, id) < winsOf(efficiencyWins, previous));
      // Ties keep the previous rank; a drop jumps to the positional rank.
      if (fellBehind) rank = position;
    }
    medals.set(id, rank < medalList.length ? medalList[rank] : "none");
  });

  for (const id of providerIds) {
    if (!eligible.has(id)) medals.set(id, "none");
  }
  return medals;
}
|
|
@@ -2452,24 +2557,10 @@ function printSummary(results, providers, byProvider) {
|
|
|
2452
2557
|
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2453
2558
|
}
|
|
2454
2559
|
}
|
|
2455
|
-
if (!single) {
|
|
2456
|
-
|
|
2457
|
-
|
|
2458
|
-
|
|
2459
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2460
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2461
|
-
const maxWins = Math.max(...wins.values());
|
|
2462
|
-
if (maxWins > 0) {
|
|
2463
|
-
const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2464
|
-
console.log("");
|
|
2465
|
-
if (topProviders.length === 1) {
|
|
2466
|
-
const [winnerId, winCount] = topProviders[0];
|
|
2467
|
-
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
|
|
2468
|
-
} else {
|
|
2469
|
-
const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
|
|
2470
|
-
console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
|
|
2471
|
-
}
|
|
2472
|
-
}
|
|
2560
|
+
if (!single && byCorrectness && byCorrectness.avg > 0) {
|
|
2561
|
+
console.log("");
|
|
2562
|
+
const pct = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
2563
|
+
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${byCorrectness.id}${reset} ${dim(providerLabel(byCorrectness.id))} ${dim(`(${pct} avg correctness)`)}`);
|
|
2473
2564
|
}
|
|
2474
2565
|
console.log("");
|
|
2475
2566
|
}
|
|
@@ -2503,15 +2594,15 @@ function defineArena(config) {
|
|
|
2503
2594
|
if (config.providers.length === 0) {
|
|
2504
2595
|
throw new Error("At least one provider is required");
|
|
2505
2596
|
}
|
|
2506
|
-
if (config.tasks.length === 0) {
|
|
2507
|
-
throw new Error("At least one task is required");
|
|
2508
|
-
}
|
|
2509
2597
|
const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
|
|
2510
2598
|
const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
|
|
2511
2599
|
const runs = config.runs ?? 1;
|
|
2512
2600
|
return {
|
|
2513
2601
|
config,
|
|
2514
2602
|
async run(options) {
|
|
2603
|
+
if (config.tasks.length === 0) {
|
|
2604
|
+
throw new Error("At least one task is required");
|
|
2605
|
+
}
|
|
2515
2606
|
return runBenchmarks({
|
|
2516
2607
|
providers: config.providers,
|
|
2517
2608
|
tasks: config.tasks,
|
|
@@ -2537,13 +2628,15 @@ function anthropic(model, options) {
|
|
|
2537
2628
|
model,
|
|
2538
2629
|
async run(input) {
|
|
2539
2630
|
const start = Date.now();
|
|
2540
|
-
const systemMessage = input.schema ?
|
|
2631
|
+
const systemMessage = input.schema ? buildSchemaSystemMessage(input.schema) : void 0;
|
|
2632
|
+
const reqOpts = { signal: input.signal };
|
|
2633
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
2541
2634
|
const response = await client.messages.create({
|
|
2542
2635
|
model,
|
|
2543
2636
|
max_tokens: maxTokens,
|
|
2544
2637
|
system: systemMessage,
|
|
2545
2638
|
messages: [{ role: "user", content: input.prompt }]
|
|
2546
|
-
},
|
|
2639
|
+
}, reqOpts);
|
|
2547
2640
|
const latencyMs = Date.now() - start;
|
|
2548
2641
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2549
2642
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
@@ -2688,17 +2781,8 @@ function htmlReporter(results) {
|
|
|
2688
2781
|
return { id, avg };
|
|
2689
2782
|
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
2690
2783
|
let overallWinner;
|
|
2691
|
-
if (multi) {
|
|
2692
|
-
|
|
2693
|
-
for (const id of providers) wins.set(id, 0);
|
|
2694
|
-
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
2695
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2696
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2697
|
-
const maxWins = Math.max(...wins.values());
|
|
2698
|
-
if (maxWins > 0) {
|
|
2699
|
-
const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2700
|
-
if (tops.length === 1) overallWinner = tops[0][0];
|
|
2701
|
-
}
|
|
2784
|
+
if (multi && byCorrectness && byCorrectness.avg > 0) {
|
|
2785
|
+
overallWinner = byCorrectness.id;
|
|
2702
2786
|
}
|
|
2703
2787
|
const errorResults = results.filter((r) => r.error);
|
|
2704
2788
|
const deduped = dedupeErrors(errorResults);
|
|
@@ -3219,7 +3303,7 @@ function renderErrors(errors) {
|
|
|
3219
3303
|
</div>`;
|
|
3220
3304
|
}).join("\n");
|
|
3221
3305
|
return `<section class="errors-section">
|
|
3222
|
-
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'
|
|
3306
|
+
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
|
|
3223
3307
|
<div class="errors-list">
|
|
3224
3308
|
${items}
|
|
3225
3309
|
</div>
|
|
@@ -3283,6 +3367,149 @@ function renderScript(taskCount) {
|
|
|
3283
3367
|
</script>`;
|
|
3284
3368
|
}
|
|
3285
3369
|
|
|
3370
|
+
// src/packs/structured-output.ts
|
|
3371
|
+
var import_zod = require("zod");
|
|
3372
|
+
// Built-in task pack that stress-tests schema-constrained output: a flat
// object, nested objects, arrays of objects, empty arrays, enum
// classification, and an input laced with JSON-looking noise. Each task
// pairs a prompt with the expected value and the Zod schema it must satisfy.
var structuredOutputPack = (() => {
  const { z } = import_zod;

  // Flat record with string and number fields.
  const flatEntity = {
    name: "so:flat-entity",
    prompt: "Extract the person's details from this text: 'Maria Garcia, age 34, works as a software architect in Barcelona, Spain. Her employee ID is EMP-2847.' Return as JSON.",
    expected: {
      name: "Maria Garcia",
      age: 34,
      role: "software architect",
      city: "Barcelona",
      country: "Spain",
      employeeId: "EMP-2847"
    },
    schema: z.object({
      name: z.string(),
      age: z.number(),
      role: z.string(),
      city: z.string(),
      country: z.string(),
      employeeId: z.string()
    })
  };

  // Three nested objects, one of them carrying an enum field.
  const recipientSchema = z.object({ company: z.string(), contact: z.string(), floor: z.string() });
  const addressSchema = z.object({
    street: z.string(),
    city: z.string(),
    state: z.string(),
    zip: z.string(),
    country: z.string()
  });
  const orderSchema = z.object({
    id: z.string(),
    itemCount: z.number(),
    weightKg: z.number(),
    shippingMethod: z.enum(["standard", "express", "overnight"])
  });
  const nestedObject = {
    name: "so:nested-object",
    prompt: "Parse this shipping label into structured JSON: 'Ship to: Acme Corp, Attn: John Lee, 4th Floor, 742 Evergreen Terrace, Springfield, IL 62704, USA. Order #ORD-9912, 3 items, 2.4kg, express shipping.' Use shippingMethod values: standard, express, or overnight. Return as JSON.",
    expected: {
      recipient: { company: "Acme Corp", contact: "John Lee", floor: "4th Floor" },
      address: { street: "742 Evergreen Terrace", city: "Springfield", state: "IL", zip: "62704", country: "USA" },
      order: { id: "ORD-9912", itemCount: 3, weightKg: 2.4, shippingMethod: "express" }
    },
    schema: z.object({
      recipient: recipientSchema,
      address: addressSchema,
      order: orderSchema
    })
  };

  // Top-level array of homogeneous objects.
  const productSchema = z.object({ name: z.string(), price: z.number(), category: z.string() });
  const arrayOfObjects = {
    name: "so:array-of-objects",
    prompt: "Extract all mentioned products with their prices and categories from this text: 'Our summer sale includes the UltraWidget Pro ($49.99, Electronics), ComfortMax Chair ($199.00, Furniture), and AquaPure Filter ($24.50, Home & Kitchen). The SmartLamp Mini is also available at $34.99 in the Electronics category.' Return as a JSON array.",
    expected: [
      { name: "UltraWidget Pro", price: 49.99, category: "Electronics" },
      { name: "ComfortMax Chair", price: 199, category: "Furniture" },
      { name: "AquaPure Filter", price: 24.5, category: "Home & Kitchen" },
      { name: "SmartLamp Mini", price: 34.99, category: "Electronics" }
    ],
    schema: z.array(productSchema)
  };

  // Arrays that are expected to stay empty.
  const emptyArrays = {
    name: "so:empty-arrays",
    prompt: "Extract all error codes and their severity levels from this log message: 'System health check completed at 14:32 UTC. All services operational. No warnings or errors detected. Uptime: 99.97%.' Classify status as one of: healthy, degraded, or down. Return as JSON.",
    expected: { errors: [], warnings: [], status: "healthy", uptimePercent: 99.97 },
    schema: z.object({
      errors: z.array(z.object({ code: z.string(), severity: z.string() })),
      warnings: z.array(z.string()),
      status: z.enum(["healthy", "degraded", "down"]),
      uptimePercent: z.number()
    })
  };

  // Array of objects whose fields are restricted to enum values.
  const ticketSchema = z.object({
    id: z.string(),
    priority: z.enum(["low", "medium", "high", "critical"]),
    category: z.enum(["billing", "technical", "account", "general"])
  });
  const enumClassification = {
    name: "so:enum-classification",
    prompt: "Classify each of these support tickets by priority (low/medium/high/critical) and category (billing/technical/account/general). Use just the letter (A, B, C, D) as the id.\nTicket A: 'My account was charged twice for the same subscription.'\nTicket B: 'The API returns 500 errors intermittently.'\nTicket C: 'How do I update my display name?'\nTicket D: 'Production database is completely unresponsive, all services down.'\nReturn as a JSON array.",
    expected: [
      { id: "A", priority: "high", category: "billing" },
      { id: "B", priority: "high", category: "technical" },
      { id: "C", priority: "low", category: "account" },
      { id: "D", priority: "critical", category: "technical" }
    ],
    schema: z.array(ticketSchema)
  };

  // Input containing JSON-like fragments that must be ignored. The template
  // literal's continuation lines start at column 0 so the prompt text keeps
  // its exact line breaks without extra indentation.
  const adversarialInput = {
    name: "so:adversarial-input",
    prompt: `Extract the actual product review data from this messy input. Ignore any JSON-like noise in the text.

User said: 'I bought the {product: "fake"} headphones for $59.99 and they're great! Rating: 5/5. The "noise-cancelling" feature works well even in {"noisy": true} environments. Would recommend to friend=true. Purchased on 01/15/2026.'
Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
    expected: {
      product: "headphones",
      price: 59.99,
      rating: 5,
      maxRating: 5,
      features: ["noise-cancelling"],
      recommended: true,
      purchaseDate: "2026-01-15"
    },
    schema: z.object({
      product: z.string(),
      price: z.number(),
      rating: z.number(),
      maxRating: z.number(),
      features: z.array(z.string()),
      recommended: z.boolean(),
      purchaseDate: z.string()
    })
  };

  return {
    name: "structured-output",
    label: "Structured Output",
    description: "Zod schema stress test \u2014 flat objects, nesting, arrays, enums, empty arrays, and adversarial input",
    tasks: [
      flatEntity,
      nestedObject,
      arrayOfObjects,
      emptyArrays,
      enumClassification,
      adversarialInput
    ],
    scorers: ["correctness", "schema-correctness", "latency", "cost"]
  };
})();
|
|
3489
|
+
|
|
3490
|
+
// src/packs/index.ts
|
|
3491
|
+
// Lookup table of task packs, keyed by pack name.
var registry = /* @__PURE__ */ new Map();

/**
 * Adds a pack to the registry under its `name`, replacing any pack that was
 * previously stored under the same name.
 *
 * @param {{name: string}} pack - Pack definition to store.
 * @returns {void}
 */
function register(pack) {
  const { name } = pack;
  registry.set(name, pack);
}
|
|
3495
|
+
register(structuredOutputPack);
|
|
3496
|
+
/**
 * Looks up a registered pack by name.
 *
 * @param {string} name - Pack name.
 * @returns {object} The registered pack.
 * @throws {Error} When no pack is registered under `name`; the message lists
 *   the names that are available.
 */
function loadPack(name) {
  const found = registry.get(name);
  if (found) {
    return found;
  }
  const known = Array.from(registry.keys()).join(", ");
  throw new Error(`Unknown pack "${name}". Available packs: ${known}`);
}
|
|
3504
|
+
/**
 * Summarises every registered pack, in registration order.
 *
 * @returns {Array<{name: string, label: string, description: string, taskCount: number}>}
 *   One entry per pack; `taskCount` is the length of the pack's `tasks`.
 */
function listPacks() {
  const summaries = [];
  for (const { name, label, description, tasks } of registry.values()) {
    summaries.push({ name, label, description, taskCount: tasks.length });
  }
  return summaries;
}
|
|
3512
|
+
|
|
3286
3513
|
// src/ci.ts
|
|
3287
3514
|
var import_node_fs = require("fs");
|
|
3288
3515
|
var import_node_path = require("path");
|
|
@@ -3574,7 +3801,9 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
3574
3801
|
gemini,
|
|
3575
3802
|
htmlReporter,
|
|
3576
3803
|
jsonReporter,
|
|
3804
|
+
listPacks,
|
|
3577
3805
|
loadBaseline,
|
|
3806
|
+
loadPack,
|
|
3578
3807
|
markdownReporter,
|
|
3579
3808
|
openai,
|
|
3580
3809
|
openaiCompatible,
|