agent-duelist 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +246 -142
- package/dist/cli.js +2004 -62
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +334 -105
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +28 -3
- package/dist/index.d.ts +28 -3
- package/dist/index.js +332 -105
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1349,33 +1349,42 @@ var correctnessScorer = ({ task, result }) => {
|
|
|
1349
1349
|
if (task.expected === void 0) {
|
|
1350
1350
|
return { name: "correctness", value: 0.5, details: { reason: "no expected value" } };
|
|
1351
1351
|
}
|
|
1352
|
-
const
|
|
1352
|
+
const actual = normalizeOutput(task.expected, result.output);
|
|
1353
|
+
const match = deepEqual(task.expected, actual);
|
|
1353
1354
|
return {
|
|
1354
1355
|
name: "correctness",
|
|
1355
1356
|
value: match ? 1 : 0,
|
|
1356
1357
|
details: { expected: task.expected, actual: result.output }
|
|
1357
1358
|
};
|
|
1358
1359
|
};
|
|
1359
|
-
function
|
|
1360
|
-
if (
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
if (
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1360
|
+
/**
 * Unwraps a response that nests the expected array inside a wrapper object.
 *
 * When `expected` is an array but `actual` is a non-null, non-array object with
 * exactly one array-valued property (e.g. `{ "products": [...] }`), that inner
 * array is returned. In every other case `actual` is returned untouched.
 *
 * @param {unknown} expected - The task's expected value.
 * @param {unknown} actual - The provider output to normalize.
 * @returns {unknown} The single nested array, or `actual` unchanged.
 */
function normalizeOutput(expected, actual) {
  const looksWrapped =
    Array.isArray(expected) &&
    !Array.isArray(actual) &&
    typeof actual === "object" &&
    actual !== null;
  if (!looksWrapped) {
    return actual;
  }
  // Only unwrap when the choice is unambiguous: exactly one array-valued property.
  const arrayValues = Object.values(actual).filter((value) => Array.isArray(value));
  return arrayValues.length === 1 ? arrayValues[0] : actual;
}
|
|
1370
|
+
/**
 * Lenient structural comparison used by the correctness scorer.
 *
 * Rules:
 * - Strings are compared after trimming and lower-casing.
 * - Arrays must have the same length and pairwise-equal elements.
 * - Plain objects are compared as a subset: every key of `expected` must be
 *   present in `actual` with an equal value; extra keys in `actual` are ignored.
 * - An array never equals a non-array object (and vice versa).
 * - Everything else falls back to strict equality.
 *
 * @param {unknown} expected - The reference value.
 * @param {unknown} actual - The value produced by the provider.
 * @returns {boolean} `true` when `actual` satisfies `expected` under the rules above.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Identical nulls were handled by the strict-equality check above.
  if (expected === null || actual === null) return false;

  // Without this guard an expected `[]` would "match" any object, because the
  // subset comparison below iterates over zero keys and succeeds vacuously.
  const expectedIsArray = Array.isArray(expected);
  if (expectedIsArray !== Array.isArray(actual)) return false;

  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((val, i) => deepEqual(val, actual[i]));
  }
  if (typeof expected === "object") {
    return Object.keys(expected).every(
      (key) => key in actual && deepEqual(expected[key], actual[key])
    );
  }
  return expected === actual;
}
|
|
1380
1389
|
|
|
1381
1390
|
// src/scorers/schema-correctness.ts
|
|
@@ -1395,7 +1404,14 @@ var schemaCorrectnessScorer = ({ task, result }) => {
|
|
|
1395
1404
|
};
|
|
1396
1405
|
}
|
|
1397
1406
|
}
|
|
1398
|
-
|
|
1407
|
+
let parsed = task.schema.safeParse(data);
|
|
1408
|
+
if (!parsed.success && !Array.isArray(data) && typeof data === "object" && data !== null) {
|
|
1409
|
+
const arrayEntries = Object.entries(data).filter(([, v]) => Array.isArray(v));
|
|
1410
|
+
if (arrayEntries.length === 1) {
|
|
1411
|
+
const unwrapped = task.schema.safeParse(arrayEntries[0][1]);
|
|
1412
|
+
if (unwrapped.success) parsed = unwrapped;
|
|
1413
|
+
}
|
|
1414
|
+
}
|
|
1399
1415
|
return {
|
|
1400
1416
|
name: "schema-correctness",
|
|
1401
1417
|
value: parsed.success ? 1 : 0,
|
|
@@ -1441,18 +1457,36 @@ import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
|
|
|
1441
1457
|
|
|
1442
1458
|
// src/providers/openai.ts
|
|
1443
1459
|
import OpenAI, { AzureOpenAI } from "openai";
|
|
1444
|
-
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
1460
|
+
import { zodToJsonSchema as zodToJsonSchema2 } from "zod-to-json-schema";
|
|
1445
1461
|
|
|
1446
1462
|
// src/providers/shared.ts
|
|
1447
|
-
|
|
1463
|
+
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
1464
|
+
/**
 * Builds the system prompt that instructs a model to answer with JSON.
 *
 * Without a schema a short generic instruction is returned. With a schema, the
 * schema is converted via `zodToJsonSchema` (target "openAi"), pretty-printed,
 * and embedded between instructions telling the model to emit data rather than
 * the schema definition itself.
 *
 * @param {unknown} schema - Schema handed to `zodToJsonSchema`; falsy means "no schema".
 * @returns {string} The system message text.
 */
function buildSchemaSystemMessage(schema) {
  if (!schema) {
    return "Respond with valid JSON.";
  }
  const schemaText = JSON.stringify(zodToJsonSchema(schema, { target: "openAi" }), null, 2);
  const preamble = "Respond with ONLY valid JSON data. No markdown, no code fences, no explanation.";
  const reminders = [
    "IMPORTANT: Output the actual data values, NOT the schema definition itself.",
    'Do NOT include keys like "type", "$schema", or "items" from the schema definition in your response.'
  ];
  const lines = [
    preamble,
    "",
    "Your output must conform to this JSON Schema:",
    schemaText,
    "",
    ...reminders
  ];
  return lines.join("\n");
}
|
|
1448
1477
|
function parseSchemaOutput(rawContent, hasSchema) {
|
|
1449
1478
|
if (!hasSchema) return rawContent;
|
|
1479
|
+
const cleaned = stripCodeFences(rawContent);
|
|
1450
1480
|
try {
|
|
1451
|
-
return JSON.parse(
|
|
1481
|
+
return JSON.parse(cleaned);
|
|
1452
1482
|
} catch {
|
|
1453
1483
|
return rawContent;
|
|
1454
1484
|
}
|
|
1455
1485
|
}
|
|
1486
|
+
/**
 * Removes a surrounding Markdown code fence from model output.
 *
 * Finds the first fenced block whose opening fence starts a line and whose
 * closing fence ends a line, and returns the text between the fences. An
 * optional `json` language tag is accepted in any letter case (`json`, `JSON`,
 * `Json`), so an upper-case tag no longer leaks into the returned body.
 * If no fence is found the content is returned unchanged.
 *
 * @param {string} content - Raw text returned by the model.
 * @returns {string} The fenced body, or `content` when there is no fence.
 */
function stripCodeFences(content) {
  // `m`: ^/$ match at line boundaries, so prose around the fence is tolerated.
  // `i`: the language tag is matched case-insensitively.
  const match = content.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?\s*```$/im);
  return match ? match[1] : content;
}
|
|
1456
1490
|
|
|
1457
1491
|
// src/providers/openai.ts
|
|
1458
1492
|
var REQUEST_TIMEOUT_MS = 6e4;
|
|
@@ -1501,7 +1535,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1501
1535
|
if (input.schema) {
|
|
1502
1536
|
params.response_format = { type: "json_object" };
|
|
1503
1537
|
params.messages = [
|
|
1504
|
-
{ role: "system", content:
|
|
1538
|
+
{ role: "system", content: buildSchemaSystemMessage(input.schema) },
|
|
1505
1539
|
...params.messages
|
|
1506
1540
|
];
|
|
1507
1541
|
}
|
|
@@ -1509,7 +1543,9 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1509
1543
|
params.tools = input.tools.map(toolDefToOpenAI);
|
|
1510
1544
|
params.tool_choice = "auto";
|
|
1511
1545
|
}
|
|
1512
|
-
const
|
|
1546
|
+
const reqOpts = { signal: input.signal };
|
|
1547
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
1548
|
+
const response = await client.chat.completions.create(params, reqOpts);
|
|
1513
1549
|
let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
|
|
1514
1550
|
let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
|
|
1515
1551
|
const choice = response.choices[0];
|
|
@@ -1543,7 +1579,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1543
1579
|
const followUp = await client.chat.completions.create({
|
|
1544
1580
|
model: requestModel,
|
|
1545
1581
|
messages: toolMessages
|
|
1546
|
-
},
|
|
1582
|
+
}, reqOpts);
|
|
1547
1583
|
totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
|
|
1548
1584
|
totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
|
|
1549
1585
|
finalResponse = followUp;
|
|
@@ -1588,7 +1624,7 @@ function toolDefToOpenAI(tool) {
|
|
|
1588
1624
|
function: {
|
|
1589
1625
|
name: tool.name,
|
|
1590
1626
|
description: tool.description,
|
|
1591
|
-
parameters:
|
|
1627
|
+
parameters: zodToJsonSchema2(tool.parameters, { target: "openAi" })
|
|
1592
1628
|
}
|
|
1593
1629
|
};
|
|
1594
1630
|
}
|
|
@@ -1637,8 +1673,14 @@ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1637
1673
|
if (!apiKey) return void 0;
|
|
1638
1674
|
return { client: new OpenAI2({ apiKey, timeout: timeoutMs }), model };
|
|
1639
1675
|
}
|
|
1676
|
+
/**
 * Heuristically decides whether an error means the model rejected the
 * `temperature` parameter.
 *
 * The error text (the `message` of an `Error`, otherwise `String(err)`) is
 * lower-cased and must mention "temperature" together with at least one of
 * the rejection phrases listed below.
 *
 * @param {unknown} err - The caught error or thrown value.
 * @returns {boolean} `true` when the text looks like a temperature rejection.
 */
function isTemperatureError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  if (!text.includes("temperature")) {
    return false;
  }
  const rejectionPhrases = ["not supported", "is not allowed", "unsupported", "invalid"];
  return rejectionPhrases.some((phrase) => text.includes(phrase));
}
|
|
1640
1681
|
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1641
1682
|
let cached = void 0;
|
|
1683
|
+
let useTemperature = true;
|
|
1642
1684
|
return async ({ task, result }) => {
|
|
1643
1685
|
if (task.expected === void 0) {
|
|
1644
1686
|
return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
|
|
@@ -1655,35 +1697,24 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1655
1697
|
}
|
|
1656
1698
|
const { client, model } = cached;
|
|
1657
1699
|
const prompt = JUDGE_PROMPT.replace("{task}", task.prompt).replace("{expected}", JSON.stringify(task.expected)).replace("{actual}", JSON.stringify(result.output));
|
|
1700
|
+
const messages = [{ role: "user", content: prompt }];
|
|
1658
1701
|
try {
|
|
1659
|
-
const response = await client
|
|
1660
|
-
|
|
1661
|
-
messages: [{ role: "user", content: prompt }],
|
|
1662
|
-
max_completion_tokens: 2048
|
|
1663
|
-
});
|
|
1664
|
-
const content = response.choices[0]?.message?.content?.trim() ?? "";
|
|
1665
|
-
const parsed = {};
|
|
1666
|
-
for (const line of content.split("\n")) {
|
|
1667
|
-
const match = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
|
|
1668
|
-
if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
|
|
1669
|
-
}
|
|
1670
|
-
const accuracy = parsed.accuracy;
|
|
1671
|
-
const completeness = parsed.completeness;
|
|
1672
|
-
const conciseness = parsed.conciseness;
|
|
1673
|
-
if (accuracy == null || completeness == null || conciseness == null || [accuracy, completeness, conciseness].some((s) => isNaN(s) || s < 0 || s > 1)) {
|
|
1674
|
-
return {
|
|
1675
|
-
name: "llm-judge-correctness",
|
|
1676
|
-
value: -1,
|
|
1677
|
-
details: { reason: `judge returned unparseable scores: "${content}"`, model }
|
|
1678
|
-
};
|
|
1679
|
-
}
|
|
1680
|
-
const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
|
|
1681
|
-
return {
|
|
1682
|
-
name: "llm-judge-correctness",
|
|
1683
|
-
value: composite,
|
|
1684
|
-
details: { model, accuracy, completeness, conciseness }
|
|
1685
|
-
};
|
|
1702
|
+
const response = await callJudge(client, model, messages, useTemperature);
|
|
1703
|
+
return parseJudgeResponse(response, model);
|
|
1686
1704
|
} catch (err) {
|
|
1705
|
+
if (useTemperature && isTemperatureError(err)) {
|
|
1706
|
+
useTemperature = false;
|
|
1707
|
+
try {
|
|
1708
|
+
const response = await callJudge(client, model, messages, false);
|
|
1709
|
+
return parseJudgeResponse(response, model);
|
|
1710
|
+
} catch (retryErr) {
|
|
1711
|
+
return {
|
|
1712
|
+
name: "llm-judge-correctness",
|
|
1713
|
+
value: -1,
|
|
1714
|
+
details: { reason: `judge call failed: ${retryErr instanceof Error ? retryErr.message : String(retryErr)}` }
|
|
1715
|
+
};
|
|
1716
|
+
}
|
|
1717
|
+
}
|
|
1687
1718
|
return {
|
|
1688
1719
|
name: "llm-judge-correctness",
|
|
1689
1720
|
value: -1,
|
|
@@ -1692,6 +1723,38 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1692
1723
|
}
|
|
1693
1724
|
};
|
|
1694
1725
|
}
|
|
1726
|
+
/**
 * Sends the judge prompt to the chat-completions endpoint of `client`.
 *
 * The request always carries `model`, `messages` and a 2048-token completion
 * cap; `temperature: 0` is added only when `withTemperature` is truthy.
 *
 * @param {object} client - Object exposing `chat.completions.create(params)`.
 * @param {string} model - Judge model identifier.
 * @param {Array<object>} messages - Chat messages to send.
 * @param {boolean} withTemperature - Whether to pin the temperature to 0.
 * @returns {Promise<unknown>} Whatever `chat.completions.create` resolves to.
 */
async function callJudge(client, model, messages, withTemperature) {
  const request = { model, messages, max_completion_tokens: 2048 };
  if (withTemperature) {
    request.temperature = 0;
  }
  return client.chat.completions.create(request);
}
|
|
1734
|
+
/**
 * Turns a judge chat completion into an "llm-judge-correctness" score.
 *
 * The first choice's message text is scanned line by line for
 * `accuracy: <n>`, `completeness: <n>` and `conciseness: <n>` (label matched
 * case-insensitively at the start of a line). If any of the three is missing,
 * not a number, or outside [0, 1], the result has value -1 and a reason that
 * quotes the judge text. Otherwise the value is the mean of the three scores
 * rounded to two decimals.
 *
 * @param {object} response - Completion object with a `choices` array.
 * @param {string} model - Judge model name, echoed in `details`.
 * @returns {{name: string, value: number, details: object}} The score record.
 */
function parseJudgeResponse(response, model) {
  const content = response.choices[0]?.message?.content?.trim() ?? "";
  const scoreLine = /^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i;
  const scores = {};
  content.split("\n").forEach((line) => {
    const hit = line.match(scoreLine);
    if (hit) {
      scores[hit[1].toLowerCase()] = parseFloat(hit[2]);
    }
  });
  const { accuracy, completeness, conciseness } = scores;
  const unusable = [accuracy, completeness, conciseness].some(
    (s) => s == null || isNaN(s) || s < 0 || s > 1
  );
  if (unusable) {
    return {
      name: "llm-judge-correctness",
      value: -1,
      details: { reason: `judge returned unparseable scores: "${content}"`, model }
    };
  }
  const mean = (accuracy + completeness + conciseness) / 3;
  return {
    name: "llm-judge-correctness",
    value: Math.round(mean * 100) / 100,
    details: { model, accuracy, completeness, conciseness }
  };
}
|
|
1695
1758
|
|
|
1696
1759
|
// src/scorers/tool-usage.ts
|
|
1697
1760
|
var toolUsageScorer = ({ task, result }) => {
|
|
@@ -1764,7 +1827,8 @@ async function runBenchmarks(options) {
|
|
|
1764
1827
|
prompt: task.prompt,
|
|
1765
1828
|
schema: task.schema,
|
|
1766
1829
|
tools: task.tools,
|
|
1767
|
-
signal
|
|
1830
|
+
signal,
|
|
1831
|
+
timeout
|
|
1768
1832
|
}), timeout);
|
|
1769
1833
|
const scores = await Promise.all(
|
|
1770
1834
|
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
@@ -1973,37 +2037,76 @@ function computeColumnStats(providerData, scorerNames) {
|
|
|
1973
2037
|
}
|
|
1974
2038
|
return stats;
|
|
1975
2039
|
}
|
|
2040
|
+
// Names of the scorers that measure output quality (as opposed to efficiency
// metrics such as latency or cost).
var QUALITY_SCORERS = /* @__PURE__ */ new Set([
  "correctness",
  "schema-correctness",
  "fuzzy-similarity",
  "llm-judge-correctness",
  "tool-usage"
]);

/**
 * Checks whether a provider achieved a positive score on at least one quality
 * column.
 *
 * If `columnStats` contains no quality column at all, the gate passes for
 * everyone.
 *
 * @param {string} providerId - Provider to check.
 * @param {Map<string, {values: Map<string, number|undefined>}>} columnStats -
 *   Per-column statistics keyed by scorer name.
 * @returns {boolean} `true` when the provider passes the gate.
 */
function passesQualityGate(providerId, columnStats) {
  let sawQualityColumn = false;
  for (const [column, stats] of columnStats) {
    if (!QUALITY_SCORERS.has(column)) {
      continue;
    }
    sawQualityColumn = true;
    const score = stats?.values.get(providerId);
    if (score !== void 0 && score > 0) {
      return true;
    }
  }
  // No quality columns means nothing to gate on.
  return !sawQualityColumn;
}
|
|
1976
2055
|
function computeMedals(columnStats, providerIds) {
|
|
1977
2056
|
const medals = /* @__PURE__ */ new Map();
|
|
1978
2057
|
if (providerIds.length < 2) {
|
|
1979
2058
|
for (const id of providerIds) medals.set(id, "none");
|
|
1980
2059
|
return medals;
|
|
1981
2060
|
}
|
|
1982
|
-
const
|
|
1983
|
-
|
|
1984
|
-
|
|
2061
|
+
const eligible = new Set(providerIds.filter((id) => passesQualityGate(id, columnStats)));
|
|
2062
|
+
const qualityWins = /* @__PURE__ */ new Map();
|
|
2063
|
+
const efficiencyWins = /* @__PURE__ */ new Map();
|
|
2064
|
+
for (const id of providerIds) {
|
|
2065
|
+
qualityWins.set(id, 0);
|
|
2066
|
+
efficiencyWins.set(id, 0);
|
|
2067
|
+
}
|
|
2068
|
+
for (const [colName, colStats] of columnStats) {
|
|
1985
2069
|
if (colStats.best === void 0) continue;
|
|
1986
2070
|
const bestProviders = [...colStats.values.entries()].filter(([, v]) => v !== void 0 && v === colStats.best);
|
|
1987
2071
|
if (bestProviders.length === 1) {
|
|
1988
|
-
|
|
2072
|
+
const winnerId = bestProviders[0][0];
|
|
2073
|
+
if (QUALITY_SCORERS.has(colName)) {
|
|
2074
|
+
qualityWins.set(winnerId, (qualityWins.get(winnerId) ?? 0) + 1);
|
|
2075
|
+
} else {
|
|
2076
|
+
efficiencyWins.set(winnerId, (efficiencyWins.get(winnerId) ?? 0) + 1);
|
|
2077
|
+
}
|
|
1989
2078
|
}
|
|
1990
2079
|
}
|
|
1991
|
-
const totalWins = [...
|
|
2080
|
+
const totalWins = [...qualityWins.values()].reduce((a, b) => a + b, 0) + [...efficiencyWins.values()].reduce((a, b) => a + b, 0);
|
|
1992
2081
|
if (totalWins === 0) {
|
|
1993
2082
|
for (const id of providerIds) medals.set(id, "none");
|
|
1994
2083
|
return medals;
|
|
1995
2084
|
}
|
|
1996
|
-
const
|
|
1997
|
-
(
|
|
1998
|
-
|
|
2085
|
+
const eligibleSorted = providerIds.filter((id) => eligible.has(id)).sort((a, b) => {
|
|
2086
|
+
const qDiff = (qualityWins.get(b) ?? 0) - (qualityWins.get(a) ?? 0);
|
|
2087
|
+
if (qDiff !== 0) return qDiff;
|
|
2088
|
+
const eDiff = (efficiencyWins.get(b) ?? 0) - (efficiencyWins.get(a) ?? 0);
|
|
2089
|
+
if (eDiff !== 0) return eDiff;
|
|
2090
|
+
return a.localeCompare(b);
|
|
2091
|
+
});
|
|
1999
2092
|
const medalList = ["gold", "silver", "bronze"];
|
|
2000
2093
|
let rank = 0;
|
|
2001
|
-
for (let i = 0; i <
|
|
2002
|
-
if (i > 0
|
|
2003
|
-
|
|
2094
|
+
for (let i = 0; i < eligibleSorted.length; i++) {
|
|
2095
|
+
if (i > 0) {
|
|
2096
|
+
const prevQ = qualityWins.get(eligibleSorted[i - 1]) ?? 0;
|
|
2097
|
+
const currQ = qualityWins.get(eligibleSorted[i]) ?? 0;
|
|
2098
|
+
if (currQ < prevQ) {
|
|
2099
|
+
rank = i;
|
|
2100
|
+
} else if (currQ === prevQ) {
|
|
2101
|
+
const prevE = efficiencyWins.get(eligibleSorted[i - 1]) ?? 0;
|
|
2102
|
+
const currE = efficiencyWins.get(eligibleSorted[i]) ?? 0;
|
|
2103
|
+
if (currE < prevE) rank = i;
|
|
2104
|
+
}
|
|
2004
2105
|
}
|
|
2005
|
-
|
|
2006
|
-
|
|
2106
|
+
medals.set(eligibleSorted[i], rank < medalList.length ? medalList[rank] : "none");
|
|
2107
|
+
}
|
|
2108
|
+
for (const id of providerIds) {
|
|
2109
|
+
if (!eligible.has(id)) medals.set(id, "none");
|
|
2007
2110
|
}
|
|
2008
2111
|
return medals;
|
|
2009
2112
|
}
|
|
@@ -2400,24 +2503,10 @@ function printSummary(results, providers, byProvider) {
|
|
|
2400
2503
|
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2401
2504
|
}
|
|
2402
2505
|
}
|
|
2403
|
-
if (!single) {
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2408
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2409
|
-
const maxWins = Math.max(...wins.values());
|
|
2410
|
-
if (maxWins > 0) {
|
|
2411
|
-
const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2412
|
-
console.log("");
|
|
2413
|
-
if (topProviders.length === 1) {
|
|
2414
|
-
const [winnerId, winCount] = topProviders[0];
|
|
2415
|
-
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
|
|
2416
|
-
} else {
|
|
2417
|
-
const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
|
|
2418
|
-
console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
|
|
2419
|
-
}
|
|
2420
|
-
}
|
|
2506
|
+
if (!single && byCorrectness && byCorrectness.avg > 0) {
|
|
2507
|
+
console.log("");
|
|
2508
|
+
const pct = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
2509
|
+
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${byCorrectness.id}${reset} ${dim(providerLabel(byCorrectness.id))} ${dim(`(${pct} avg correctness)`)}`);
|
|
2421
2510
|
}
|
|
2422
2511
|
console.log("");
|
|
2423
2512
|
}
|
|
@@ -2451,15 +2540,15 @@ function defineArena(config) {
|
|
|
2451
2540
|
if (config.providers.length === 0) {
|
|
2452
2541
|
throw new Error("At least one provider is required");
|
|
2453
2542
|
}
|
|
2454
|
-
if (config.tasks.length === 0) {
|
|
2455
|
-
throw new Error("At least one task is required");
|
|
2456
|
-
}
|
|
2457
2543
|
const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
|
|
2458
2544
|
const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
|
|
2459
2545
|
const runs = config.runs ?? 1;
|
|
2460
2546
|
return {
|
|
2461
2547
|
config,
|
|
2462
2548
|
async run(options) {
|
|
2549
|
+
if (config.tasks.length === 0) {
|
|
2550
|
+
throw new Error("At least one task is required");
|
|
2551
|
+
}
|
|
2463
2552
|
return runBenchmarks({
|
|
2464
2553
|
providers: config.providers,
|
|
2465
2554
|
tasks: config.tasks,
|
|
@@ -2485,13 +2574,15 @@ function anthropic(model, options) {
|
|
|
2485
2574
|
model,
|
|
2486
2575
|
async run(input) {
|
|
2487
2576
|
const start = Date.now();
|
|
2488
|
-
const systemMessage = input.schema ?
|
|
2577
|
+
const systemMessage = input.schema ? buildSchemaSystemMessage(input.schema) : void 0;
|
|
2578
|
+
const reqOpts = { signal: input.signal };
|
|
2579
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
2489
2580
|
const response = await client.messages.create({
|
|
2490
2581
|
model,
|
|
2491
2582
|
max_tokens: maxTokens,
|
|
2492
2583
|
system: systemMessage,
|
|
2493
2584
|
messages: [{ role: "user", content: input.prompt }]
|
|
2494
|
-
},
|
|
2585
|
+
}, reqOpts);
|
|
2495
2586
|
const latencyMs = Date.now() - start;
|
|
2496
2587
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2497
2588
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
@@ -2636,17 +2727,8 @@ function htmlReporter(results) {
|
|
|
2636
2727
|
return { id, avg };
|
|
2637
2728
|
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
2638
2729
|
let overallWinner;
|
|
2639
|
-
if (multi) {
|
|
2640
|
-
|
|
2641
|
-
for (const id of providers) wins.set(id, 0);
|
|
2642
|
-
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
2643
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2644
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2645
|
-
const maxWins = Math.max(...wins.values());
|
|
2646
|
-
if (maxWins > 0) {
|
|
2647
|
-
const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2648
|
-
if (tops.length === 1) overallWinner = tops[0][0];
|
|
2649
|
-
}
|
|
2730
|
+
if (multi && byCorrectness && byCorrectness.avg > 0) {
|
|
2731
|
+
overallWinner = byCorrectness.id;
|
|
2650
2732
|
}
|
|
2651
2733
|
const errorResults = results.filter((r) => r.error);
|
|
2652
2734
|
const deduped = dedupeErrors(errorResults);
|
|
@@ -3167,7 +3249,7 @@ function renderErrors(errors) {
|
|
|
3167
3249
|
</div>`;
|
|
3168
3250
|
}).join("\n");
|
|
3169
3251
|
return `<section class="errors-section">
|
|
3170
|
-
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'
|
|
3252
|
+
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
|
|
3171
3253
|
<div class="errors-list">
|
|
3172
3254
|
${items}
|
|
3173
3255
|
</div>
|
|
@@ -3231,6 +3313,149 @@ function renderScript(taskCount) {
|
|
|
3231
3313
|
</script>`;
|
|
3232
3314
|
}
|
|
3233
3315
|
|
|
3316
|
+
// src/packs/structured-output.ts
|
|
3317
|
+
import { z } from "zod";
|
|
3318
|
+
// Built-in "structured-output" task pack: six extraction/classification tasks,
// each pairing a prompt with an expected value and a zod schema, plus the
// default scorers to run against them.
var structuredOutputPack = (() => {
  // Flat object with string and number fields.
  const flatEntity = {
    name: "so:flat-entity",
    prompt: "Extract the person's details from this text: 'Maria Garcia, age 34, works as a software architect in Barcelona, Spain. Her employee ID is EMP-2847.' Return as JSON.",
    expected: {
      name: "Maria Garcia",
      age: 34,
      role: "software architect",
      city: "Barcelona",
      country: "Spain",
      employeeId: "EMP-2847"
    },
    schema: z.object({
      name: z.string(),
      age: z.number(),
      role: z.string(),
      city: z.string(),
      country: z.string(),
      employeeId: z.string()
    })
  };

  // Three nested objects, one of them with an enum field.
  const nestedObject = {
    name: "so:nested-object",
    prompt: "Parse this shipping label into structured JSON: 'Ship to: Acme Corp, Attn: John Lee, 4th Floor, 742 Evergreen Terrace, Springfield, IL 62704, USA. Order #ORD-9912, 3 items, 2.4kg, express shipping.' Use shippingMethod values: standard, express, or overnight. Return as JSON.",
    expected: {
      recipient: { company: "Acme Corp", contact: "John Lee", floor: "4th Floor" },
      address: { street: "742 Evergreen Terrace", city: "Springfield", state: "IL", zip: "62704", country: "USA" },
      order: { id: "ORD-9912", itemCount: 3, weightKg: 2.4, shippingMethod: "express" }
    },
    schema: z.object({
      recipient: z.object({ company: z.string(), contact: z.string(), floor: z.string() }),
      address: z.object({
        street: z.string(),
        city: z.string(),
        state: z.string(),
        zip: z.string(),
        country: z.string()
      }),
      order: z.object({
        id: z.string(),
        itemCount: z.number(),
        weightKg: z.number(),
        shippingMethod: z.enum(["standard", "express", "overnight"])
      })
    })
  };

  // Top-level array of homogeneous objects.
  const arrayOfObjects = {
    name: "so:array-of-objects",
    prompt: "Extract all mentioned products with their prices and categories from this text: 'Our summer sale includes the UltraWidget Pro ($49.99, Electronics), ComfortMax Chair ($199.00, Furniture), and AquaPure Filter ($24.50, Home & Kitchen). The SmartLamp Mini is also available at $34.99 in the Electronics category.' Return as a JSON array.",
    expected: [
      { name: "UltraWidget Pro", price: 49.99, category: "Electronics" },
      { name: "ComfortMax Chair", price: 199, category: "Furniture" },
      { name: "AquaPure Filter", price: 24.5, category: "Home & Kitchen" },
      { name: "SmartLamp Mini", price: 34.99, category: "Electronics" }
    ],
    schema: z.array(z.object({ name: z.string(), price: z.number(), category: z.string() }))
  };

  // Input with nothing to extract: the expected arrays are empty.
  const emptyArrays = {
    name: "so:empty-arrays",
    prompt: "Extract all error codes and their severity levels from this log message: 'System health check completed at 14:32 UTC. All services operational. No warnings or errors detected. Uptime: 99.97%.' Classify status as one of: healthy, degraded, or down. Return as JSON.",
    expected: { errors: [], warnings: [], status: "healthy", uptimePercent: 99.97 },
    schema: z.object({
      errors: z.array(z.object({ code: z.string(), severity: z.string() })),
      warnings: z.array(z.string()),
      status: z.enum(["healthy", "degraded", "down"]),
      uptimePercent: z.number()
    })
  };

  // Array of objects whose fields are constrained to enums.
  const enumClassification = {
    name: "so:enum-classification",
    prompt: "Classify each of these support tickets by priority (low/medium/high/critical) and category (billing/technical/account/general). Use just the letter (A, B, C, D) as the id.\nTicket A: 'My account was charged twice for the same subscription.'\nTicket B: 'The API returns 500 errors intermittently.'\nTicket C: 'How do I update my display name?'\nTicket D: 'Production database is completely unresponsive, all services down.'\nReturn as a JSON array.",
    expected: [
      { id: "A", priority: "high", category: "billing" },
      { id: "B", priority: "high", category: "technical" },
      { id: "C", priority: "low", category: "account" },
      { id: "D", priority: "critical", category: "technical" }
    ],
    schema: z.array(
      z.object({
        id: z.string(),
        priority: z.enum(["low", "medium", "high", "critical"]),
        category: z.enum(["billing", "technical", "account", "general"])
      })
    )
  };

  // Prompt text deliberately sprinkled with JSON-like fragments.
  // NOTE: the template literal's continuation lines must stay at column 0 so
  // the prompt string keeps its exact content.
  const adversarialInput = {
    name: "so:adversarial-input",
    prompt: `Extract the actual product review data from this messy input. Ignore any JSON-like noise in the text.

User said: 'I bought the {product: "fake"} headphones for $59.99 and they're great! Rating: 5/5. The "noise-cancelling" feature works well even in {"noisy": true} environments. Would recommend to friend=true. Purchased on 01/15/2026.'
Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
    expected: {
      product: "headphones",
      price: 59.99,
      rating: 5,
      maxRating: 5,
      features: ["noise-cancelling"],
      recommended: true,
      purchaseDate: "2026-01-15"
    },
    schema: z.object({
      product: z.string(),
      price: z.number(),
      rating: z.number(),
      maxRating: z.number(),
      features: z.array(z.string()),
      recommended: z.boolean(),
      purchaseDate: z.string()
    })
  };

  return {
    name: "structured-output",
    label: "Structured Output",
    description: "Zod schema stress test \u2014 flat objects, nesting, arrays, enums, empty arrays, and adversarial input",
    tasks: [
      flatEntity,
      nestedObject,
      arrayOfObjects,
      emptyArrays,
      enumClassification,
      adversarialInput
    ],
    scorers: ["correctness", "schema-correctness", "latency", "cost"]
  };
})();
|
|
3435
|
+
|
|
3436
|
+
// src/packs/index.ts
|
|
3437
|
+
var registry = /* @__PURE__ */ new Map();
|
|
3438
|
+
/**
 * Adds a task pack to the module-level registry, keyed by its `name`.
 * A pack registered later under the same name replaces the earlier one.
 *
 * @param {{name: string}} pack - The pack definition to store.
 * @returns {void}
 */
function register(pack) {
  const { name } = pack;
  registry.set(name, pack);
}
|
|
3441
|
+
register(structuredOutputPack);
|
|
3442
|
+
/**
 * Looks up a registered task pack by name.
 *
 * @param {string} name - Pack name to look up.
 * @returns {object} The registered pack.
 * @throws {Error} When nothing is registered under `name`; the message lists
 *   the names that are available.
 */
function loadPack(name) {
  const found = registry.get(name);
  if (found) {
    return found;
  }
  const known = Array.from(registry.keys()).join(", ");
  throw new Error(`Unknown pack "${name}". Available packs: ${known}`);
}
|
|
3450
|
+
/**
 * Summarizes every registered pack, in registration order.
 *
 * @returns {Array<{name: string, label: string, description: string, taskCount: number}>}
 *   One summary per pack; `taskCount` is the length of the pack's `tasks` array.
 */
function listPacks() {
  return Array.from(registry.values(), ({ name, label, description, tasks }) => ({
    name,
    label,
    description,
    taskCount: tasks.length
  }));
}
|
|
3458
|
+
|
|
3234
3459
|
// src/ci.ts
|
|
3235
3460
|
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
3236
3461
|
import { dirname } from "path";
|
|
@@ -3521,7 +3746,9 @@ export {
|
|
|
3521
3746
|
gemini,
|
|
3522
3747
|
htmlReporter,
|
|
3523
3748
|
jsonReporter,
|
|
3749
|
+
listPacks,
|
|
3524
3750
|
loadBaseline,
|
|
3751
|
+
loadPack,
|
|
3525
3752
|
markdownReporter,
|
|
3526
3753
|
openai,
|
|
3527
3754
|
openaiCompatible,
|