agent-duelist 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +248 -142
- package/dist/cli.js +2284 -62
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +614 -109
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +28 -3
- package/dist/index.d.ts +28 -3
- package/dist/index.js +612 -109
- package/dist/index.js.map +1 -1
- package/package.json +9 -3
package/dist/index.cjs
CHANGED
|
@@ -40,7 +40,9 @@ __export(index_exports, {
|
|
|
40
40
|
gemini: () => gemini,
|
|
41
41
|
htmlReporter: () => htmlReporter,
|
|
42
42
|
jsonReporter: () => jsonReporter,
|
|
43
|
+
listPacks: () => listPacks,
|
|
43
44
|
loadBaseline: () => loadBaseline,
|
|
45
|
+
loadPack: () => loadPack,
|
|
44
46
|
markdownReporter: () => markdownReporter,
|
|
45
47
|
openai: () => openai,
|
|
46
48
|
openaiCompatible: () => openaiCompatible,
|
|
@@ -1396,38 +1398,49 @@ var costScorer = ({ result }, providerId) => {
|
|
|
1396
1398
|
};
|
|
1397
1399
|
};
|
|
1398
1400
|
|
|
1401
|
+
// src/utils/deep-equal.ts
|
|
1402
|
+
/**
 * Lenient structural comparison of an expected value against an actual value.
 *
 * Rules, in order:
 * - Identical values (`===`) are equal.
 * - Two strings are equal when they match after trimming and lower-casing.
 * - Values of different `typeof` are never equal; `null` only equals `null`.
 * - An array only ever equals another array of the same length whose
 *   elements are pairwise equal under these same rules.
 * - Two non-array objects are compared as a subset: every own enumerable key
 *   of `expected` must be present on `actual` with an equal value. Extra keys
 *   on `actual` are ignored.
 *
 * @param {unknown} expected - The reference value.
 * @param {unknown} actual - The value being checked against `expected`.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Identity was handled above, so a null on either side is a mismatch here.
  if (expected === null || actual === null) return false;

  const expectedIsArray = Array.isArray(expected);
  // An array must never be matched against a plain object (or vice versa):
  // without this guard `[]` vs `{}` would pass the key-subset check below.
  if (expectedIsArray !== Array.isArray(actual)) return false;

  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((val, i) => deepEqual(val, actual[i]));
  }
  if (typeof expected === "object") {
    // Subset match: only the keys named by `expected` are checked.
    return Object.keys(expected).every(
      (key) => key in actual && deepEqual(expected[key], actual[key])
    );
  }
  // Same-typed primitives that were not identical (includes NaN vs NaN).
  return false;
}
|
|
1421
|
+
|
|
1399
1422
|
// src/scorers/correctness.ts
|
|
1400
1423
|
/**
 * Scores a result as 1 when its output matches the task's expected value
 * and 0 otherwise. Tasks without an expected value get a neutral 0.5.
 *
 * The output is passed through `normalizeOutput` before being compared with
 * `deepEqual`; the reported `details.actual` is the raw, un-normalized output.
 */
var correctnessScorer = ({ task, result }) => {
  const { expected } = task;
  if (expected === void 0) {
    return { name: "correctness", value: 0.5, details: { reason: "no expected value" } };
  }
  const isMatch = deepEqual(expected, normalizeOutput(expected, result.output));
  const value = isMatch ? 1 : 0;
  return {
    name: "correctness",
    value,
    details: { expected, actual: result.output }
  };
};
|
|
1411
|
-
function
|
|
1412
|
-
if (
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
return a.every((val, i) => deepEqual(val, b[i]));
|
|
1421
|
-
}
|
|
1422
|
-
if (typeof a === "object" && typeof b === "object") {
|
|
1423
|
-
const objA = a;
|
|
1424
|
-
const objB = b;
|
|
1425
|
-
const keysA = Object.keys(objA);
|
|
1426
|
-
const keysB = Object.keys(objB);
|
|
1427
|
-
if (keysA.length !== keysB.length) return false;
|
|
1428
|
-
return keysA.every((key) => key in objB && deepEqual(objA[key], objB[key]));
|
|
1429
|
-
}
|
|
1430
|
-
return a === b;
|
|
1435
|
+
/**
 * Unwraps an array that the model returned inside a wrapper object.
 *
 * When `expected` is an array but `actual` is a non-null, non-array object
 * with exactly one array-valued property, that array is returned. In every
 * other case `actual` is returned untouched.
 *
 * @param {unknown} expected - The task's expected value.
 * @param {unknown} actual - The raw output to normalize.
 * @returns {unknown} The unwrapped array, or `actual` as given.
 */
function normalizeOutput(expected, actual) {
  const actualIsWrapperCandidate =
    typeof actual === "object" && actual !== null && !Array.isArray(actual);
  if (!Array.isArray(expected) || !actualIsWrapperCandidate) {
    return actual;
  }
  const arrayValues = Object.values(actual).filter((value) => Array.isArray(value));
  // Only unwrap when the choice is unambiguous.
  return arrayValues.length === 1 ? arrayValues[0] : actual;
}
|
|
1432
1445
|
|
|
1433
1446
|
// src/scorers/schema-correctness.ts
|
|
@@ -1447,7 +1460,14 @@ var schemaCorrectnessScorer = ({ task, result }) => {
|
|
|
1447
1460
|
};
|
|
1448
1461
|
}
|
|
1449
1462
|
}
|
|
1450
|
-
|
|
1463
|
+
let parsed = task.schema.safeParse(data);
|
|
1464
|
+
if (!parsed.success && !Array.isArray(data) && typeof data === "object" && data !== null) {
|
|
1465
|
+
const arrayEntries = Object.entries(data).filter(([, v]) => Array.isArray(v));
|
|
1466
|
+
if (arrayEntries.length === 1) {
|
|
1467
|
+
const unwrapped = task.schema.safeParse(arrayEntries[0][1]);
|
|
1468
|
+
if (unwrapped.success) parsed = unwrapped;
|
|
1469
|
+
}
|
|
1470
|
+
}
|
|
1451
1471
|
return {
|
|
1452
1472
|
name: "schema-correctness",
|
|
1453
1473
|
value: parsed.success ? 1 : 0,
|
|
@@ -1493,18 +1513,36 @@ var import_openai2 = __toESM(require("openai"), 1);
|
|
|
1493
1513
|
|
|
1494
1514
|
// src/providers/openai.ts
|
|
1495
1515
|
var import_openai = __toESM(require("openai"), 1);
|
|
1496
|
-
var
|
|
1516
|
+
var import_zod_to_json_schema2 = require("zod-to-json-schema");
|
|
1497
1517
|
|
|
1498
1518
|
// src/providers/shared.ts
|
|
1499
|
-
var
|
|
1519
|
+
var import_zod_to_json_schema = require("zod-to-json-schema");
|
|
1520
|
+
/**
 * Builds the system prompt that tells a model to answer with JSON.
 *
 * Without a schema the prompt is a single generic instruction. With a schema
 * the prompt embeds the schema converted to JSON Schema (OpenAI target,
 * pretty-printed) and adds a warning not to echo the schema itself.
 *
 * @param {unknown} schema - Schema handed to `zodToJsonSchema`, or a falsy value.
 * @returns {string} The system message text.
 */
function buildSchemaSystemMessage(schema) {
  if (!schema) return "Respond with valid JSON.";
  const { zodToJsonSchema } = import_zod_to_json_schema;
  const schemaText = JSON.stringify(zodToJsonSchema(schema, { target: "openAi" }), null, 2);
  const header = "Respond with ONLY valid JSON data. No markdown, no code fences, no explanation.";
  const schemaIntro = "Your output must conform to this JSON Schema:";
  const dataNotSchema = "IMPORTANT: Output the actual data values, NOT the schema definition itself.";
  const forbiddenKeys = 'Do NOT include keys like "type", "$schema", or "items" from the schema definition in your response.';
  // Blank lines separate the instruction, the schema, and the warnings.
  return `${header}\n\n${schemaIntro}\n${schemaText}\n\n${dataNotSchema}\n${forbiddenKeys}`;
}
|
|
1500
1533
|
/**
 * Converts raw model text into parsed JSON when a schema was requested.
 *
 * @param {string} rawContent - Text returned by the model.
 * @param {boolean} hasSchema - Whether the request carried a schema.
 * @returns {unknown} The parsed JSON value, or `rawContent` unchanged when no
 *   schema was requested or the (fence-stripped) text is not valid JSON.
 */
function parseSchemaOutput(rawContent, hasSchema) {
  if (hasSchema) {
    const candidate = stripCodeFences(rawContent);
    try {
      return JSON.parse(candidate);
    } catch {
      // Best effort: fall through and hand back the untouched text.
    }
  }
  return rawContent;
}
|
|
1542
|
+
/**
 * Extracts the payload of the first Markdown code fence found in `content`.
 *
 * The fence may be tagged `json` in any letter case, may be indented with
 * spaces or tabs, and the closing fence may be followed by trailing
 * spaces, tabs, or a carriage return (CRLF output). Whitespace directly
 * inside the fence is not part of the returned payload.
 *
 * @param {string} content - Raw model output.
 * @returns {string} The fenced payload, or `content` unchanged when no
 *   fence is found.
 */
function stripCodeFences(content) {
  // `m` lets a fence start on any line (models often add a preamble);
  // `i` accepts ```JSON as well as ```json.
  const fencePattern = /^[ \t]*```(?:json)?\s*\n?([\s\S]*?)\n?\s*```[ \t\r]*$/im;
  const match = content.match(fencePattern);
  return match ? match[1] : content;
}
|
|
1508
1546
|
|
|
1509
1547
|
// src/providers/openai.ts
|
|
1510
1548
|
var REQUEST_TIMEOUT_MS = 6e4;
|
|
@@ -1553,7 +1591,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1553
1591
|
if (input.schema) {
|
|
1554
1592
|
params.response_format = { type: "json_object" };
|
|
1555
1593
|
params.messages = [
|
|
1556
|
-
{ role: "system", content:
|
|
1594
|
+
{ role: "system", content: buildSchemaSystemMessage(input.schema) },
|
|
1557
1595
|
...params.messages
|
|
1558
1596
|
];
|
|
1559
1597
|
}
|
|
@@ -1561,7 +1599,9 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1561
1599
|
params.tools = input.tools.map(toolDefToOpenAI);
|
|
1562
1600
|
params.tool_choice = "auto";
|
|
1563
1601
|
}
|
|
1564
|
-
const
|
|
1602
|
+
const reqOpts = { signal: input.signal };
|
|
1603
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
1604
|
+
const response = await client.chat.completions.create(params, reqOpts);
|
|
1565
1605
|
let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
|
|
1566
1606
|
let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
|
|
1567
1607
|
const choice = response.choices[0];
|
|
@@ -1595,7 +1635,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1595
1635
|
const followUp = await client.chat.completions.create({
|
|
1596
1636
|
model: requestModel,
|
|
1597
1637
|
messages: toolMessages
|
|
1598
|
-
},
|
|
1638
|
+
}, reqOpts);
|
|
1599
1639
|
totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
|
|
1600
1640
|
totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
|
|
1601
1641
|
finalResponse = followUp;
|
|
@@ -1640,7 +1680,7 @@ function toolDefToOpenAI(tool) {
|
|
|
1640
1680
|
function: {
|
|
1641
1681
|
name: tool.name,
|
|
1642
1682
|
description: tool.description,
|
|
1643
|
-
parameters: (0,
|
|
1683
|
+
parameters: (0, import_zod_to_json_schema2.zodToJsonSchema)(tool.parameters, { target: "openAi" })
|
|
1644
1684
|
}
|
|
1645
1685
|
};
|
|
1646
1686
|
}
|
|
@@ -1689,8 +1729,14 @@ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1689
1729
|
if (!apiKey) return void 0;
|
|
1690
1730
|
return { client: new import_openai2.default({ apiKey, timeout: timeoutMs }), model };
|
|
1691
1731
|
}
|
|
1732
|
+
/**
 * Heuristically detects an error caused by sending a `temperature` parameter
 * that was rejected.
 *
 * The error text (the message of an `Error`, otherwise the stringified value)
 * must mention "temperature" together with one of the rejection phrases
 * listed below; matching is case-insensitive.
 *
 * @param {unknown} err - The thrown value.
 * @returns {boolean} Whether the error looks temperature-related.
 */
function isTemperatureError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  if (!text.includes("temperature")) {
    return false;
  }
  const rejectionPhrases = ["not supported", "is not allowed", "unsupported", "invalid"];
  return rejectionPhrases.some((phrase) => text.includes(phrase));
}
|
|
1692
1737
|
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1693
1738
|
let cached = void 0;
|
|
1739
|
+
let useTemperature = true;
|
|
1694
1740
|
return async ({ task, result }) => {
|
|
1695
1741
|
if (task.expected === void 0) {
|
|
1696
1742
|
return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
|
|
@@ -1707,35 +1753,24 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1707
1753
|
}
|
|
1708
1754
|
const { client, model } = cached;
|
|
1709
1755
|
const prompt = JUDGE_PROMPT.replace("{task}", task.prompt).replace("{expected}", JSON.stringify(task.expected)).replace("{actual}", JSON.stringify(result.output));
|
|
1756
|
+
const messages = [{ role: "user", content: prompt }];
|
|
1710
1757
|
try {
|
|
1711
|
-
const response = await client
|
|
1712
|
-
|
|
1713
|
-
messages: [{ role: "user", content: prompt }],
|
|
1714
|
-
max_completion_tokens: 2048
|
|
1715
|
-
});
|
|
1716
|
-
const content = response.choices[0]?.message?.content?.trim() ?? "";
|
|
1717
|
-
const parsed = {};
|
|
1718
|
-
for (const line of content.split("\n")) {
|
|
1719
|
-
const match = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
|
|
1720
|
-
if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
|
|
1721
|
-
}
|
|
1722
|
-
const accuracy = parsed.accuracy;
|
|
1723
|
-
const completeness = parsed.completeness;
|
|
1724
|
-
const conciseness = parsed.conciseness;
|
|
1725
|
-
if (accuracy == null || completeness == null || conciseness == null || [accuracy, completeness, conciseness].some((s) => isNaN(s) || s < 0 || s > 1)) {
|
|
1726
|
-
return {
|
|
1727
|
-
name: "llm-judge-correctness",
|
|
1728
|
-
value: -1,
|
|
1729
|
-
details: { reason: `judge returned unparseable scores: "${content}"`, model }
|
|
1730
|
-
};
|
|
1731
|
-
}
|
|
1732
|
-
const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
|
|
1733
|
-
return {
|
|
1734
|
-
name: "llm-judge-correctness",
|
|
1735
|
-
value: composite,
|
|
1736
|
-
details: { model, accuracy, completeness, conciseness }
|
|
1737
|
-
};
|
|
1758
|
+
const response = await callJudge(client, model, messages, useTemperature);
|
|
1759
|
+
return parseJudgeResponse(response, model);
|
|
1738
1760
|
} catch (err) {
|
|
1761
|
+
if (useTemperature && isTemperatureError(err)) {
|
|
1762
|
+
useTemperature = false;
|
|
1763
|
+
try {
|
|
1764
|
+
const response = await callJudge(client, model, messages, false);
|
|
1765
|
+
return parseJudgeResponse(response, model);
|
|
1766
|
+
} catch (retryErr) {
|
|
1767
|
+
return {
|
|
1768
|
+
name: "llm-judge-correctness",
|
|
1769
|
+
value: -1,
|
|
1770
|
+
details: { reason: `judge call failed: ${retryErr instanceof Error ? retryErr.message : String(retryErr)}` }
|
|
1771
|
+
};
|
|
1772
|
+
}
|
|
1773
|
+
}
|
|
1739
1774
|
return {
|
|
1740
1775
|
name: "llm-judge-correctness",
|
|
1741
1776
|
value: -1,
|
|
@@ -1744,18 +1779,89 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1744
1779
|
}
|
|
1745
1780
|
};
|
|
1746
1781
|
}
|
|
1782
|
+
/**
 * Sends the judge prompt to the chat-completions endpoint of `client`.
 *
 * @param {object} client - Object exposing `chat.completions.create`.
 * @param {string} model - Model identifier to request.
 * @param {Array<object>} messages - Chat messages to send.
 * @param {boolean} withTemperature - When truthy, `temperature: 0` is added
 *   to the request; otherwise no temperature is sent.
 * @returns {Promise<unknown>} Whatever `create` resolves to.
 */
async function callJudge(client, model, messages, withTemperature) {
  const request = { model, messages, max_completion_tokens: 2048 };
  if (withTemperature) {
    request.temperature = 0;
  }
  return client.chat.completions.create(request);
}
|
|
1790
|
+
/**
 * Turns a judge completion into an `llm-judge-correctness` score.
 *
 * The first choice's text is scanned line by line for
 * `accuracy: <n>`, `completeness: <n>` and `conciseness: <n>` (label at the
 * start of the line, case-insensitive). All three must be present and lie in
 * [0, 1]; the score is their mean rounded to two decimals. Otherwise the
 * score is -1 and the raw text is reported in `details.reason`.
 *
 * @param {object} response - Chat-completion response with a `choices` array.
 * @param {string} model - Judge model name, echoed back in `details`.
 * @returns {{name: string, value: number, details: object}} The score record.
 */
function parseJudgeResponse(response, model) {
  const content = response.choices[0]?.message?.content?.trim() ?? "";
  const scores = {};
  content.split("\n").forEach((line) => {
    const found = /^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i.exec(line);
    if (found) {
      scores[found[1].toLowerCase()] = parseFloat(found[2]);
    }
  });

  const { accuracy, completeness, conciseness } = scores;
  const isUsable = (score) => score != null && !Number.isNaN(score) && score >= 0 && score <= 1;
  if (![accuracy, completeness, conciseness].every(isUsable)) {
    return {
      name: "llm-judge-correctness",
      value: -1,
      details: { reason: `judge returned unparseable scores: "${content}"`, model }
    };
  }

  const mean = (accuracy + completeness + conciseness) / 3;
  return {
    name: "llm-judge-correctness",
    value: Math.round(mean * 100) / 100,
    details: { model, accuracy, completeness, conciseness }
  };
}
|
|
1747
1814
|
|
|
1748
1815
|
// src/scorers/tool-usage.ts
|
|
1749
1816
|
/**
 * Scores how well a result's tool calls match what the task asked for.
 *
 * - No tools configured on the task: -1 (not applicable).
 * - `task.expected` is a non-array object: only calls whose name belongs to
 *   one of the task's tools are considered.
 *     - 1   when such a call's arguments satisfy `deepEqual(expected, args)`;
 *     - 0.5 when such a call shares at least one argument key with `expected`;
 *     - 0   otherwise.
 * - Any other `task.expected` (including none): 1 when some call names the
 *   task's first tool, else 0.
 *
 * `details.toolCalls` always lists every call made, including calls to tools
 * that are not configured on the task.
 */
var toolUsageScorer = ({ task, result }) => {
  if (!task.tools?.length) {
    return { name: "tool-usage", value: -1, details: { reason: "no tools configured on task" } };
  }
  const calls = result.toolCalls ?? [];
  const expected = task.expected;
  const expectedIsObject =
    expected !== void 0 && typeof expected === "object" && expected !== null && !Array.isArray(expected);

  if (expectedIsObject) {
    // Both the exact and the partial match must target a tool the task
    // actually offers; a call to an unknown tool is never "the correct tool".
    const knownToolNames = new Set(task.tools.map((t) => t.name));
    const knownCalls = calls.filter((c) => knownToolNames.has(c.name));

    const matchingCall = knownCalls.find((c) => deepEqual(expected, c.arguments));
    if (matchingCall) {
      return {
        name: "tool-usage",
        value: 1,
        details: { matchedTool: matchingCall.name, arguments: matchingCall.arguments, toolCalls: calls }
      };
    }

    const expectedKeys = Object.keys(expected);
    const partialMatch = knownCalls.find((c) => {
      if (typeof c.arguments !== "object" || c.arguments === null) return false;
      const argKeys = Object.keys(c.arguments);
      return expectedKeys.some((k) => argKeys.includes(k));
    });
    if (partialMatch) {
      return {
        name: "tool-usage",
        value: 0.5,
        details: {
          reason: "correct tool but wrong arguments",
          expected,
          actual: partialMatch.arguments,
          toolCalls: calls
        }
      };
    }
    return {
      name: "tool-usage",
      value: 0,
      details: { reason: "no matching tool call", expected, toolCalls: calls }
    };
  }

  // Without an object-shaped expectation only the tool name can be checked.
  const expectedToolName = task.tools[0].name;
  const usedTool = calls.some((c) => c.name === expectedToolName);
  return {
    name: "tool-usage",
    value: usedTool ? 1 : 0,
    details: { expectedToolName, usedTool, toolCalls: calls }
  };
};
|
|
1761
1867
|
|
|
@@ -1816,7 +1922,8 @@ async function runBenchmarks(options) {
|
|
|
1816
1922
|
prompt: task.prompt,
|
|
1817
1923
|
schema: task.schema,
|
|
1818
1924
|
tools: task.tools,
|
|
1819
|
-
signal
|
|
1925
|
+
signal,
|
|
1926
|
+
timeout
|
|
1820
1927
|
}), timeout);
|
|
1821
1928
|
const scores = await Promise.all(
|
|
1822
1929
|
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
@@ -2025,37 +2132,76 @@ function computeColumnStats(providerData, scorerNames) {
|
|
|
2025
2132
|
}
|
|
2026
2133
|
return stats;
|
|
2027
2134
|
}
|
|
2135
|
+
// Scorer names that measure answer quality; every other column is treated
// as an efficiency metric by the code that consults this set.
var QUALITY_SCORERS = /* @__PURE__ */ new Set([
  "correctness",
  "schema-correctness",
  "fuzzy-similarity",
  "llm-judge-correctness",
  "tool-usage"
]);

/**
 * Decides whether a provider cleared the quality gate.
 *
 * A provider passes when no quality column is present in `columnStats`, or
 * when at least one quality column holds a value greater than 0 for it.
 *
 * @param {string} providerId - Provider to look up.
 * @param {Map<string, {values: Map<string, number|undefined>}>} columnStats -
 *   Per-scorer statistics keyed by scorer name.
 * @returns {boolean} Whether the provider passes.
 */
function passesQualityGate(providerId, columnStats) {
  let sawQualityColumn = false;
  for (const column of columnStats.keys()) {
    if (!QUALITY_SCORERS.has(column)) continue;
    sawQualityColumn = true;
    const score = columnStats.get(column)?.values.get(providerId);
    if (score !== void 0 && score > 0) {
      return true;
    }
  }
  // With no quality columns at all there is nothing to gate on.
  return !sawQualityColumn;
}
|
|
2028
2150
|
/**
 * Assigns "gold" / "silver" / "bronze" / "none" to each provider.
 *
 * - Fewer than two providers, or no column with a unique winner: everyone
 *   gets "none".
 * - A column counts as a win only when exactly one provider holds its `best`
 *   value. Wins are tallied separately for quality columns
 *   (`QUALITY_SCORERS`) and all other (efficiency) columns.
 * - Only providers that pass `passesQualityGate` are ranked: by quality wins,
 *   then efficiency wins, then id. Providers tied on both win counts share a
 *   medal. Ranks past bronze, and all ineligible providers, get "none".
 *
 * @param {Map<string, {best: number|undefined, values: Map<string, number|undefined>}>} columnStats
 * @param {string[]} providerIds
 * @returns {Map<string, string>} Medal per provider id.
 */
function computeMedals(columnStats, providerIds) {
  const everyoneNone = () => {
    const none = /* @__PURE__ */ new Map();
    providerIds.forEach((id) => none.set(id, "none"));
    return none;
  };
  if (providerIds.length < 2) return everyoneNone();

  const eligible = new Set(providerIds.filter((id) => passesQualityGate(id, columnStats)));
  const qualityWins = /* @__PURE__ */ new Map();
  const efficiencyWins = /* @__PURE__ */ new Map();
  providerIds.forEach((id) => {
    qualityWins.set(id, 0);
    efficiencyWins.set(id, 0);
  });

  for (const [colName, colStats] of columnStats) {
    const { best } = colStats;
    if (best === void 0) continue;
    const leaders = [...colStats.values.entries()].filter(([, v]) => v !== void 0 && v === best);
    // Shared first place awards nobody a win.
    if (leaders.length !== 1) continue;
    const winnerId = leaders[0][0];
    const bucket = QUALITY_SCORERS.has(colName) ? qualityWins : efficiencyWins;
    bucket.set(winnerId, (bucket.get(winnerId) ?? 0) + 1);
  }

  let totalWins = 0;
  for (const count of qualityWins.values()) totalWins += count;
  for (const count of efficiencyWins.values()) totalWins += count;
  if (totalWins === 0) return everyoneNone();

  const quality = (id) => qualityWins.get(id) ?? 0;
  const efficiency = (id) => efficiencyWins.get(id) ?? 0;
  const ranked = providerIds
    .filter((id) => eligible.has(id))
    .sort((a, b) => quality(b) - quality(a) || efficiency(b) - efficiency(a) || a.localeCompare(b));

  const medalList = ["gold", "silver", "bronze"];
  const medals = /* @__PURE__ */ new Map();
  let rank = 0;
  ranked.forEach((id, i) => {
    if (i > 0) {
      const prev = ranked[i - 1];
      // The rank only advances when this provider is strictly behind the
      // previous one; otherwise the two share a medal.
      const fellBehind =
        quality(id) < quality(prev) ||
        (quality(id) === quality(prev) && efficiency(id) < efficiency(prev));
      if (fellBehind) rank = i;
    }
    medals.set(id, rank < medalList.length ? medalList[rank] : "none");
  });
  for (const id of providerIds) {
    if (!eligible.has(id)) medals.set(id, "none");
  }
  return medals;
}
|
|
@@ -2452,24 +2598,10 @@ function printSummary(results, providers, byProvider) {
|
|
|
2452
2598
|
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2453
2599
|
}
|
|
2454
2600
|
}
|
|
2455
|
-
if (!single) {
|
|
2456
|
-
|
|
2457
|
-
|
|
2458
|
-
|
|
2459
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2460
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2461
|
-
const maxWins = Math.max(...wins.values());
|
|
2462
|
-
if (maxWins > 0) {
|
|
2463
|
-
const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2464
|
-
console.log("");
|
|
2465
|
-
if (topProviders.length === 1) {
|
|
2466
|
-
const [winnerId, winCount] = topProviders[0];
|
|
2467
|
-
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
|
|
2468
|
-
} else {
|
|
2469
|
-
const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
|
|
2470
|
-
console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
|
|
2471
|
-
}
|
|
2472
|
-
}
|
|
2601
|
+
if (!single && byCorrectness && byCorrectness.avg > 0) {
|
|
2602
|
+
console.log("");
|
|
2603
|
+
const pct = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
2604
|
+
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${byCorrectness.id}${reset} ${dim(providerLabel(byCorrectness.id))} ${dim(`(${pct} avg correctness)`)}`);
|
|
2473
2605
|
}
|
|
2474
2606
|
console.log("");
|
|
2475
2607
|
}
|
|
@@ -2503,15 +2635,15 @@ function defineArena(config) {
|
|
|
2503
2635
|
if (config.providers.length === 0) {
|
|
2504
2636
|
throw new Error("At least one provider is required");
|
|
2505
2637
|
}
|
|
2506
|
-
if (config.tasks.length === 0) {
|
|
2507
|
-
throw new Error("At least one task is required");
|
|
2508
|
-
}
|
|
2509
2638
|
const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
|
|
2510
2639
|
const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
|
|
2511
2640
|
const runs = config.runs ?? 1;
|
|
2512
2641
|
return {
|
|
2513
2642
|
config,
|
|
2514
2643
|
async run(options) {
|
|
2644
|
+
if (config.tasks.length === 0) {
|
|
2645
|
+
throw new Error("At least one task is required");
|
|
2646
|
+
}
|
|
2515
2647
|
return runBenchmarks({
|
|
2516
2648
|
providers: config.providers,
|
|
2517
2649
|
tasks: config.tasks,
|
|
@@ -2537,13 +2669,15 @@ function anthropic(model, options) {
|
|
|
2537
2669
|
model,
|
|
2538
2670
|
async run(input) {
|
|
2539
2671
|
const start = Date.now();
|
|
2540
|
-
const systemMessage = input.schema ?
|
|
2672
|
+
const systemMessage = input.schema ? buildSchemaSystemMessage(input.schema) : void 0;
|
|
2673
|
+
const reqOpts = { signal: input.signal };
|
|
2674
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
2541
2675
|
const response = await client.messages.create({
|
|
2542
2676
|
model,
|
|
2543
2677
|
max_tokens: maxTokens,
|
|
2544
2678
|
system: systemMessage,
|
|
2545
2679
|
messages: [{ role: "user", content: input.prompt }]
|
|
2546
|
-
},
|
|
2680
|
+
}, reqOpts);
|
|
2547
2681
|
const latencyMs = Date.now() - start;
|
|
2548
2682
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2549
2683
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
@@ -2688,17 +2822,8 @@ function htmlReporter(results) {
|
|
|
2688
2822
|
return { id, avg };
|
|
2689
2823
|
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
2690
2824
|
let overallWinner;
|
|
2691
|
-
if (multi) {
|
|
2692
|
-
|
|
2693
|
-
for (const id of providers) wins.set(id, 0);
|
|
2694
|
-
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
2695
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2696
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2697
|
-
const maxWins = Math.max(...wins.values());
|
|
2698
|
-
if (maxWins > 0) {
|
|
2699
|
-
const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2700
|
-
if (tops.length === 1) overallWinner = tops[0][0];
|
|
2701
|
-
}
|
|
2825
|
+
if (multi && byCorrectness && byCorrectness.avg > 0) {
|
|
2826
|
+
overallWinner = byCorrectness.id;
|
|
2702
2827
|
}
|
|
2703
2828
|
const errorResults = results.filter((r) => r.error);
|
|
2704
2829
|
const deduped = dedupeErrors(errorResults);
|
|
@@ -3219,7 +3344,7 @@ function renderErrors(errors) {
|
|
|
3219
3344
|
</div>`;
|
|
3220
3345
|
}).join("\n");
|
|
3221
3346
|
return `<section class="errors-section">
|
|
3222
|
-
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'
|
|
3347
|
+
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
|
|
3223
3348
|
<div class="errors-list">
|
|
3224
3349
|
${items}
|
|
3225
3350
|
</div>
|
|
@@ -3283,6 +3408,384 @@ function renderScript(taskCount) {
|
|
|
3283
3408
|
</script>`;
|
|
3284
3409
|
}
|
|
3285
3410
|
|
|
3411
|
+
// src/packs/structured-output.ts
|
|
3412
|
+
var import_zod = require("zod");
|
|
3413
|
+
// Built-in task pack "structured-output".
// Each task pairs a prompt with an `expected` value and a zod `schema`; the
// pack lists the scorers it is meant to run with. Task names carry an "so:"
// prefix.
var structuredOutputPack = {
  name: "structured-output",
  label: "Structured Output",
  description: "Zod schema stress test \u2014 flat objects, nesting, arrays, enums, empty arrays, and adversarial input",
  tasks: [
    // Flat object with string and number fields.
    {
      name: "so:flat-entity",
      prompt: "Extract the person's details from this text: 'Maria Garcia, age 34, works as a software architect in Barcelona, Spain. Her employee ID is EMP-2847.' Return as JSON.",
      expected: {
        name: "Maria Garcia",
        age: 34,
        role: "software architect",
        city: "Barcelona",
        country: "Spain",
        employeeId: "EMP-2847"
      },
      schema: import_zod.z.object({
        name: import_zod.z.string(),
        age: import_zod.z.number(),
        role: import_zod.z.string(),
        city: import_zod.z.string(),
        country: import_zod.z.string(),
        employeeId: import_zod.z.string()
      })
    },
    // Three nested objects; `shippingMethod` is constrained to an enum.
    {
      name: "so:nested-object",
      prompt: "Parse this shipping label into structured JSON: 'Ship to: Acme Corp, Attn: John Lee, 4th Floor, 742 Evergreen Terrace, Springfield, IL 62704, USA. Order #ORD-9912, 3 items, 2.4kg, express shipping.' Use shippingMethod values: standard, express, or overnight. Return as JSON.",
      expected: {
        recipient: { company: "Acme Corp", contact: "John Lee", floor: "4th Floor" },
        address: { street: "742 Evergreen Terrace", city: "Springfield", state: "IL", zip: "62704", country: "USA" },
        order: { id: "ORD-9912", itemCount: 3, weightKg: 2.4, shippingMethod: "express" }
      },
      schema: import_zod.z.object({
        recipient: import_zod.z.object({ company: import_zod.z.string(), contact: import_zod.z.string(), floor: import_zod.z.string() }),
        address: import_zod.z.object({
          street: import_zod.z.string(),
          city: import_zod.z.string(),
          state: import_zod.z.string(),
          zip: import_zod.z.string(),
          country: import_zod.z.string()
        }),
        order: import_zod.z.object({
          id: import_zod.z.string(),
          itemCount: import_zod.z.number(),
          weightKg: import_zod.z.number(),
          shippingMethod: import_zod.z.enum(["standard", "express", "overnight"])
        })
      })
    },
    // The expected value and the schema are a top-level array.
    {
      name: "so:array-of-objects",
      prompt: "Extract all mentioned products with their prices and categories from this text: 'Our summer sale includes the UltraWidget Pro ($49.99, Electronics), ComfortMax Chair ($199.00, Furniture), and AquaPure Filter ($24.50, Home & Kitchen). The SmartLamp Mini is also available at $34.99 in the Electronics category.' Return as a JSON array.",
      expected: [
        { name: "UltraWidget Pro", price: 49.99, category: "Electronics" },
        { name: "ComfortMax Chair", price: 199, category: "Furniture" },
        { name: "AquaPure Filter", price: 24.5, category: "Home & Kitchen" },
        { name: "SmartLamp Mini", price: 34.99, category: "Electronics" }
      ],
      schema: import_zod.z.array(import_zod.z.object({ name: import_zod.z.string(), price: import_zod.z.number(), category: import_zod.z.string() }))
    },
    // The input reports no errors or warnings, so both arrays are expected
    // to be present but empty.
    {
      name: "so:empty-arrays",
      prompt: "Extract all error codes and their severity levels from this log message: 'System health check completed at 14:32 UTC. All services operational. No warnings or errors detected. Uptime: 99.97%.' Classify status as one of: healthy, degraded, or down. Return as JSON.",
      expected: { errors: [], warnings: [], status: "healthy", uptimePercent: 99.97 },
      schema: import_zod.z.object({
        errors: import_zod.z.array(import_zod.z.object({ code: import_zod.z.string(), severity: import_zod.z.string() })),
        warnings: import_zod.z.array(import_zod.z.string()),
        status: import_zod.z.enum(["healthy", "degraded", "down"]),
        uptimePercent: import_zod.z.number()
      })
    },
    // Array of objects whose `priority` and `category` are enum-constrained.
    {
      name: "so:enum-classification",
      prompt: "Classify each of these support tickets by priority (low/medium/high/critical) and category (billing/technical/account/general). Use just the letter (A, B, C, D) as the id.\nTicket A: 'My account was charged twice for the same subscription.'\nTicket B: 'The API returns 500 errors intermittently.'\nTicket C: 'How do I update my display name?'\nTicket D: 'Production database is completely unresponsive, all services down.'\nReturn as a JSON array.",
      expected: [
        { id: "A", priority: "high", category: "billing" },
        { id: "B", priority: "high", category: "technical" },
        { id: "C", priority: "low", category: "account" },
        { id: "D", priority: "critical", category: "technical" }
      ],
      schema: import_zod.z.array(
        import_zod.z.object({
          id: import_zod.z.string(),
          priority: import_zod.z.enum(["low", "medium", "high", "critical"]),
          category: import_zod.z.enum(["billing", "technical", "account", "general"])
        })
      )
    },
    // The prompt embeds JSON-like fragments that the model is told to ignore.
    {
      name: "so:adversarial-input",
      prompt: `Extract the actual product review data from this messy input. Ignore any JSON-like noise in the text.

User said: 'I bought the {product: "fake"} headphones for $59.99 and they're great! Rating: 5/5. The "noise-cancelling" feature works well even in {"noisy": true} environments. Would recommend to friend=true. Purchased on 01/15/2026.'
Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
      expected: {
        product: "headphones",
        price: 59.99,
        rating: 5,
        maxRating: 5,
        features: ["noise-cancelling"],
        recommended: true,
        purchaseDate: "2026-01-15"
      },
      schema: import_zod.z.object({
        product: import_zod.z.string(),
        price: import_zod.z.number(),
        rating: import_zod.z.number(),
        maxRating: import_zod.z.number(),
        features: import_zod.z.array(import_zod.z.string()),
        recommended: import_zod.z.boolean(),
        purchaseDate: import_zod.z.string()
      })
    }
  ],
  // Scorer names this pack is meant to be run with.
  scorers: ["correctness", "schema-correctness", "latency", "cost"]
};
|
|
3530
|
+
|
|
3531
|
+
// src/packs/tool-calling.ts
|
|
3532
|
+
var import_zod2 = require("zod");
|
|
3533
|
+
/**
 * Built-in "tool-calling" benchmark pack.
 *
 * Contains four tasks, each pairing a prompt with one or more mock tools:
 *   - tc:simple-single-tool  one tool, one required argument
 *   - tc:complex-params      one tool, five typed arguments
 *   - tc:select-from-many    four tools offered, only convertCurrency applies
 *   - tc:parallel-calls      the same tool is needed for two cities
 *
 * Every tool handler is a canned stub: it performs no I/O and returns fixed
 * data so runs are repeatable. The pack is scored with the "tool-usage",
 * "latency" and "cost" scorers.
 *
 * NOTE(review): the description string also advertises "relevance detection",
 * but no task in this list exercises it. Confirm whether a task is missing
 * or the description should be trimmed.
 */
var toolCallingPack = {
  name: "tool-calling",
  label: "Tool Calling",
  description: "Function invocation accuracy \u2014 single calls, complex params, tool selection, parallel calls, and relevance detection",
  tasks: [
    {
      name: "tc:simple-single-tool",
      prompt: "What's the current weather in Tokyo?",
      tools: [{
        name: "getWeather",
        description: "Get current weather for a city",
        parameters: import_zod2.z.object({
          city: import_zod2.z.string(),
          units: import_zod2.z.enum(["celsius", "fahrenheit"]).optional()
        }),
        // Echoes the requested city; the reading itself is a fixed value
        // (always reported under `tempC`, whatever `units` was requested).
        handler: async ({ city, units }) => ({
          city,
          tempC: 8,
          condition: "cloudy",
          units: units ?? "celsius"
        })
      }],
      expected: { city: "Tokyo" }
    },
    {
      name: "tc:complex-params",
      prompt: "Search for Italian restaurants within 2 miles of downtown Portland that are open now and have at least a 4-star rating.",
      tools: [{
        name: "searchRestaurants",
        description: "Search for restaurants matching criteria",
        parameters: import_zod2.z.object({
          cuisine: import_zod2.z.string(),
          location: import_zod2.z.string(),
          radiusMiles: import_zod2.z.number(),
          minRating: import_zod2.z.number(),
          openNow: import_zod2.z.boolean()
        }),
        // Arguments are ignored: the stub always returns the same single hit.
        handler: async (_args) => ({
          results: [{ name: "Trattoria Roma", rating: 4.5, distance: 1.2 }]
        })
      }],
      expected: {
        cuisine: "Italian",
        location: "downtown Portland",
        radiusMiles: 2,
        minRating: 4,
        openNow: true
      }
    },
    {
      name: "tc:select-from-many",
      prompt: "Convert 150 USD to Euros.",
      // Three distractor tools surround the one that matches the prompt.
      tools: [
        {
          name: "getWeather",
          description: "Get current weather for a city",
          parameters: import_zod2.z.object({ city: import_zod2.z.string() }),
          handler: async () => ({ tempC: 20 })
        },
        {
          name: "convertCurrency",
          description: "Convert an amount between currencies",
          parameters: import_zod2.z.object({
            amount: import_zod2.z.number(),
            from: import_zod2.z.string(),
            to: import_zod2.z.string()
          }),
          // Echoes the inputs with a fixed result/rate (150 * 0.925 = 138.75).
          handler: async ({ amount, from, to }) => ({
            amount,
            from,
            to,
            result: 138.75,
            rate: 0.925
          })
        },
        {
          name: "translateText",
          description: "Translate text between languages",
          parameters: import_zod2.z.object({ text: import_zod2.z.string(), targetLang: import_zod2.z.string() }),
          handler: async () => ({ translated: "" })
        },
        {
          name: "calculateTip",
          description: "Calculate tip amount for a bill",
          parameters: import_zod2.z.object({ billAmount: import_zod2.z.number(), tipPercent: import_zod2.z.number() }),
          handler: async () => ({ tip: 0 })
        }
      ],
      expected: { amount: 150, from: "USD", to: "EUR" }
    },
    {
      name: "tc:parallel-calls",
      prompt: "I'm planning a trip. What's the weather like in both Paris and London right now?",
      tools: [{
        name: "getWeather",
        description: "Get current weather for a city",
        parameters: import_zod2.z.object({ city: import_zod2.z.string() }),
        handler: async ({ city }) => {
          // A Map is used instead of a plain-object index so that only the
          // two canned cities can match. Indexing an object literal with a
          // model-supplied string would also resolve inherited keys such as
          // "constructor" or "toString" and skip the fallback below.
          const readings = new Map([
            ["Paris", { tempC: 12, condition: "partly cloudy" }],
            ["London", { tempC: 9, condition: "rainy" }]
          ]);
          return readings.get(city) ?? { tempC: 15, condition: "unknown" };
        }
      }],
      expected: "weather data for Paris and London"
    }
  ],
  scorers: ["tool-usage", "latency", "cost"]
};
|
|
3643
|
+
|
|
3644
|
+
// src/packs/reasoning.ts
|
|
3645
|
+
var import_zod3 = require("zod");
|
|
3646
|
+
/**
 * Built-in "reasoning" benchmark pack.
 *
 * Five tasks, each with a prompt, the expected answer and a zod schema for
 * the structured reply. Scored with "correctness", "latency" and "cost".
 */
var reasoningPack = (() => {
  const { z } = import_zod3;
  // Multi-line prompts are assembled from individual lines joined by "\n".
  const joinLines = (...rows) => rows.join("\n");

  return {
    name: "reasoning",
    label: "Reasoning",
    description: "Logic, math, and multi-step thinking \u2014 arithmetic, deduction, data interpretation, critical path, and business rules",
    tasks: [
      {
        name: "rs:saas-mrr-calc",
        prompt: joinLines(
          "A SaaS company charges $49/month for the basic plan and $149/month for pro.",
          "In Q1 they had 200 basic subscribers and 85 pro subscribers.",
          "In Q2, 15% of basic users upgraded to pro and they gained 40 new basic subscribers.",
          "No one churned. What is the Q2 monthly recurring revenue (MRR)?",
          "Return as JSON with your reasoning and the final MRR number."
        ),
        // 30 upgrades -> 210 basic, 115 pro; 210 * 49 + 115 * 149 = 27425.
        expected: { mrr: 27425 },
        schema: z.object({
          reasoning: z.string().optional(),
          mrr: z.number()
        })
      },
      {
        name: "rs:logical-deduction",
        prompt: joinLines(
          "Five developers \u2014 Alice, Bob, Carol, Dave, and Eve \u2014 each use a different",
          "primary language: Rust, TypeScript, Python, Go, and Java. Given:",
          "1. Alice does not use Python, Java, or Go.",
          "2. Bob uses TypeScript.",
          "3. Carol uses neither Rust nor Go.",
          "4. Dave does not use Java.",
          "5. Eve uses neither Rust, Go, nor Java.",
          "What language does each developer use? Return as JSON."
        ),
        // Bob takes TypeScript, which leaves Rust for Alice and Python for
        // Eve; Carol is then Java and Dave gets the remaining Go.
        expected: {
          Alice: "Rust",
          Bob: "TypeScript",
          Carol: "Java",
          Dave: "Go",
          Eve: "Python"
        },
        schema: z.object({
          Alice: z.string(),
          Bob: z.string(),
          Carol: z.string(),
          Dave: z.string(),
          Eve: z.string()
        })
      },
      {
        name: "rs:data-interpretation",
        prompt: joinLines(
          "Given this quarterly revenue data:",
          "| Quarter | Revenue | Growth |",
          "|---------|---------|--------|",
          "| Q1 2025 | $2.1M | - |",
          "| Q2 2025 | $2.4M | 14.3% |",
          "| Q3 2025 | $2.2M | -8.3% |",
          "| Q4 2025 | $2.8M | 27.3% |",
          "",
          "Which quarter had the highest absolute revenue increase compared to the previous",
          "quarter? What was the full-year total revenue in millions? Return as JSON."
        ),
        // Quarter-over-quarter deltas are +0.3, -0.2 and +0.6 (Q4);
        // 2.1 + 2.4 + 2.2 + 2.8 = 9.5.
        expected: {
          highestGrowthQuarter: "Q4 2025",
          absoluteIncrease: 0.6,
          fullYearRevenue: 9.5
        },
        schema: z.object({
          highestGrowthQuarter: z.string(),
          absoluteIncrease: z.number(),
          fullYearRevenue: z.number()
        })
      },
      {
        name: "rs:critical-path",
        prompt: joinLines(
          "A deployment pipeline has these stages with dependencies:",
          "- Build (3 min, no dependency)",
          "- Unit tests (5 min, depends on Build)",
          "- Integration tests (8 min, depends on Build)",
          "- Security scan (4 min, depends on Build)",
          "- Staging deploy (2 min, depends on Unit tests AND Integration tests AND Security scan)",
          "- Smoke tests (3 min, depends on Staging deploy)",
          "",
          "Assuming stages run in parallel where possible, what is the total pipeline",
          "duration in minutes? Which stages are on the critical path? Return as JSON."
        ),
        // Longest chain: Build 3 + Integration tests 8 + Staging deploy 2
        // + Smoke tests 3 = 16 minutes.
        expected: {
          totalMinutes: 16,
          criticalPath: ["Build", "Integration tests", "Staging deploy", "Smoke tests"]
        },
        schema: z.object({
          totalMinutes: z.number(),
          criticalPath: z.array(z.string())
        })
      },
      {
        name: "rs:pricing-rules",
        prompt: joinLines(
          "Apply these pricing rules to each customer and return the final price:",
          "Rules:",
          "- Base price: $100",
          "- Enterprise customers (>100 seats): 30% discount",
          "- Annual billing: additional 15% off the discounted price",
          "- Non-profit organizations: flat $50 regardless of other rules",
          "",
          "Customers:",
          "A: 50 seats, monthly billing, for-profit",
          "B: 200 seats, annual billing, for-profit",
          "C: 75 seats, annual billing, non-profit",
          "D: 150 seats, monthly billing, for-profit",
          "",
          "Return as a JSON array with customer id and finalPrice."
        ),
        // A: no rule applies. B: 100 * 0.7 * 0.85 = 59.5.
        // C: non-profit flat rate. D: 100 * 0.7 = 70.
        expected: [
          { id: "A", finalPrice: 100 },
          { id: "B", finalPrice: 59.5 },
          { id: "C", finalPrice: 50 },
          { id: "D", finalPrice: 70 }
        ],
        schema: z.array(z.object({
          id: z.string(),
          finalPrice: z.number()
        }))
      }
    ],
    scorers: ["correctness", "latency", "cost"]
  };
})();
|
|
3763
|
+
|
|
3764
|
+
// src/packs/index.ts
|
|
3765
|
+
// Module-private table of built-in packs, keyed by each pack's `name`.
var registry = /* @__PURE__ */ new Map();

/**
 * Adds a pack to the registry under its own `name`.
 * Registering a second pack with the same name replaces the earlier entry.
 *
 * @param {{ name: string }} pack - Pack definition to store.
 */
function register(pack) {
  const { name: key } = pack;
  registry.set(key, pack);
}
|
|
3769
|
+
// Register the built-in packs. The order here is the order in which they
// are inserted into the registry Map.
for (const builtinPack of [structuredOutputPack, toolCallingPack, reasoningPack]) {
  register(builtinPack);
}
|
|
3772
|
+
/**
 * Looks up a registered pack by name.
 *
 * @param {string} name - Registry key of the pack.
 * @returns {object} The pack stored under `name`.
 * @throws {Error} If nothing is registered under `name`; the message lists
 *   every registered pack name.
 */
function loadPack(name) {
  const found = registry.get(name);
  if (found) {
    return found;
  }
  const knownNames = Array.from(registry.keys()).join(", ");
  throw new Error(`Unknown pack "${name}". Available packs: ${knownNames}`);
}
|
|
3780
|
+
/**
 * Summarises every registered pack, in registry insertion order.
 *
 * @returns {Array<{name: string, label: string, description: string, taskCount: number}>}
 *   One entry per pack; `taskCount` is the length of the pack's `tasks` array.
 */
function listPacks() {
  const summaries = [];
  for (const { name, label, description, tasks } of registry.values()) {
    summaries.push({ name, label, description, taskCount: tasks.length });
  }
  return summaries;
}
|
|
3788
|
+
|
|
3286
3789
|
// src/ci.ts
|
|
3287
3790
|
var import_node_fs = require("fs");
|
|
3288
3791
|
var import_node_path = require("path");
|
|
@@ -3574,7 +4077,9 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
3574
4077
|
gemini,
|
|
3575
4078
|
htmlReporter,
|
|
3576
4079
|
jsonReporter,
|
|
4080
|
+
listPacks,
|
|
3577
4081
|
loadBaseline,
|
|
4082
|
+
loadPack,
|
|
3578
4083
|
markdownReporter,
|
|
3579
4084
|
openai,
|
|
3580
4085
|
openaiCompatible,
|