agent-duelist 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +248 -142
- package/dist/cli.js +2284 -62
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +614 -109
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +28 -3
- package/dist/index.d.ts +28 -3
- package/dist/index.js +612 -109
- package/dist/index.js.map +1 -1
- package/package.json +9 -3
package/dist/index.js
CHANGED
|
@@ -1344,38 +1344,49 @@ var costScorer = ({ result }, providerId) => {
|
|
|
1344
1344
|
};
|
|
1345
1345
|
};
|
|
1346
1346
|
|
|
1347
|
+
// src/utils/deep-equal.ts
|
|
1348
|
+
/**
 * Lenient structural comparison used to score an actual value against an
 * expected one. The comparison is intentionally asymmetric.
 *
 * Rules, applied in order:
 *  - strictly identical values match;
 *  - two strings match when equal after trimming and lower-casing;
 *  - values with a different `typeof`, or exactly one `null`, never match;
 *  - an array never matches a non-array object (in either direction);
 *  - arrays match when they have the same length and match element-wise;
 *  - objects match when every key of `expected` is an own property of
 *    `actual` with a matching value; extra keys on `actual` are ignored.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value being checked.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Both being null was handled by the identity check above.
  if (expected === null || actual === null) return false;

  // Arrays and plain objects are both typeof "object"; without this guard an
  // expected [] would match any object (and {} would match any array).
  const expectedIsArray = Array.isArray(expected);
  if (expectedIsArray !== Array.isArray(actual)) return false;

  if (expectedIsArray) {
    return expected.length === actual.length && expected.every((val, i) => deepEqual(val, actual[i]));
  }
  if (typeof expected === "object") {
    // Own-property check so inherited members (e.g. "toString") never count as present.
    return Object.keys(expected).every(
      (key) => Object.prototype.hasOwnProperty.call(actual, key) && deepEqual(expected[key], actual[key])
    );
  }
  // Same-typed primitives (or functions) that are not identical.
  return false;
}
|
|
1367
|
+
|
|
1347
1368
|
// src/scorers/correctness.ts
|
|
1348
1369
|
/**
 * Scores a result by comparing its output with the task's expected value.
 *
 * Returns 0.5 when the task declares no expected value; otherwise 1 when the
 * normalized output matches `task.expected` under `deepEqual`, else 0.
 * `details.actual` reports the raw output, not the normalized one.
 */
var correctnessScorer = ({ task, result }) => {
  const { expected } = task;
  if (expected === void 0) {
    return { name: "correctness", value: 0.5, details: { reason: "no expected value" } };
  }
  const isMatch = deepEqual(expected, normalizeOutput(expected, result.output));
  const value = isMatch ? 1 : 0;
  return {
    name: "correctness",
    value,
    details: { expected, actual: result.output }
  };
};
|
|
1359
|
-
function
|
|
1360
|
-
if (
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
return a.every((val, i) => deepEqual(val, b[i]));
|
|
1369
|
-
}
|
|
1370
|
-
if (typeof a === "object" && typeof b === "object") {
|
|
1371
|
-
const objA = a;
|
|
1372
|
-
const objB = b;
|
|
1373
|
-
const keysA = Object.keys(objA);
|
|
1374
|
-
const keysB = Object.keys(objB);
|
|
1375
|
-
if (keysA.length !== keysB.length) return false;
|
|
1376
|
-
return keysA.every((key) => key in objB && deepEqual(objA[key], objB[key]));
|
|
1377
|
-
}
|
|
1378
|
-
return a === b;
|
|
1381
|
+
/**
 * Unwraps an array that a model returned inside a wrapper object.
 *
 * When `expected` is an array but `actual` is a non-null, non-array object
 * holding exactly one array-valued property, that array is returned. In every
 * other case `actual` is returned unchanged.
 *
 * @param {unknown} expected - The task's expected value.
 * @param {unknown} actual - The raw model output.
 * @returns {unknown} The unwrapped array, or `actual` itself.
 */
function normalizeOutput(expected, actual) {
  const isWrapperCandidate =
    Array.isArray(expected) && actual !== null && typeof actual === "object" && !Array.isArray(actual);
  if (!isWrapperCandidate) {
    return actual;
  }
  const arrayValues = Object.values(actual).filter((value) => Array.isArray(value));
  return arrayValues.length === 1 ? arrayValues[0] : actual;
}
|
|
1380
1391
|
|
|
1381
1392
|
// src/scorers/schema-correctness.ts
|
|
@@ -1395,7 +1406,14 @@ var schemaCorrectnessScorer = ({ task, result }) => {
|
|
|
1395
1406
|
};
|
|
1396
1407
|
}
|
|
1397
1408
|
}
|
|
1398
|
-
|
|
1409
|
+
let parsed = task.schema.safeParse(data);
|
|
1410
|
+
if (!parsed.success && !Array.isArray(data) && typeof data === "object" && data !== null) {
|
|
1411
|
+
const arrayEntries = Object.entries(data).filter(([, v]) => Array.isArray(v));
|
|
1412
|
+
if (arrayEntries.length === 1) {
|
|
1413
|
+
const unwrapped = task.schema.safeParse(arrayEntries[0][1]);
|
|
1414
|
+
if (unwrapped.success) parsed = unwrapped;
|
|
1415
|
+
}
|
|
1416
|
+
}
|
|
1399
1417
|
return {
|
|
1400
1418
|
name: "schema-correctness",
|
|
1401
1419
|
value: parsed.success ? 1 : 0,
|
|
@@ -1441,18 +1459,36 @@ import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
|
|
|
1441
1459
|
|
|
1442
1460
|
// src/providers/openai.ts
|
|
1443
1461
|
import OpenAI, { AzureOpenAI } from "openai";
|
|
1444
|
-
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
1462
|
+
import { zodToJsonSchema as zodToJsonSchema2 } from "zod-to-json-schema";
|
|
1445
1463
|
|
|
1446
1464
|
// src/providers/shared.ts
|
|
1447
|
-
|
|
1465
|
+
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
1466
|
+
/**
 * Builds the system prompt that asks a model for JSON output.
 *
 * Without a schema a short generic instruction is returned. With a schema, the
 * schema is converted via `zodToJsonSchema` (target "openAi"), pretty-printed,
 * and embedded between instructions to emit data rather than the schema itself.
 *
 * @param {unknown} schema - Schema passed to `zodToJsonSchema`, or a falsy value.
 * @returns {string} The system message text.
 */
function buildSchemaSystemMessage(schema) {
  if (!schema) {
    return "Respond with valid JSON.";
  }
  const schemaText = JSON.stringify(zodToJsonSchema(schema, { target: "openAi" }), null, 2);
  const header = "Respond with ONLY valid JSON data. No markdown, no code fences, no explanation.";
  const intro = "Your output must conform to this JSON Schema:";
  const dataNotSchema = "IMPORTANT: Output the actual data values, NOT the schema definition itself.";
  const forbiddenKeys = 'Do NOT include keys like "type", "$schema", or "items" from the schema definition in your response.';
  return `${header}\n\n${intro}\n${schemaText}\n\n${dataNotSchema}\n${forbiddenKeys}`;
}
|
|
1448
1479
|
/**
 * Converts a raw model reply into structured data when a schema was requested.
 *
 * @param {string} rawContent - Text returned by the model.
 * @param {boolean} hasSchema - Whether the request carried a schema.
 * @returns {unknown} The parsed JSON value, or `rawContent` unchanged when no
 *   schema was requested or the (fence-stripped) text is not valid JSON.
 */
function parseSchemaOutput(rawContent, hasSchema) {
  if (hasSchema) {
    const candidate = stripCodeFences(rawContent);
    try {
      return JSON.parse(candidate);
    } catch {
      // Best effort: not valid JSON, so hand back the untouched text below.
    }
  }
  return rawContent;
}
|
|
1488
|
+
/**
 * Extracts the payload of the first markdown code fence in `content`.
 *
 * The opening fence may be indented and may carry any language tag
 * (`json`, `JSON`, `js`, `jsonc`, ...), followed by whitespace. A bare `json`
 * tag glued to the payload (```` ```json{...}``` ````) is also accepted. The
 * words `true`, `false` and `null` are never treated as a tag, so a fenced
 * JSON literal keeps its value. Trailing spaces after the closing fence are
 * tolerated.
 *
 * @param {string} content - Raw model reply.
 * @returns {string} The fenced payload, or `content` unchanged when no
 *   complete fence is found.
 */
function stripCodeFences(content) {
  const fence = /^[ \t]*```(?:(?!(?:true|false|null)\b)[a-z][\w+-]*(?=\s)|json)?\s*\n?([\s\S]*?)\n?\s*```[ \t]*$/im;
  const match = content.match(fence);
  return match ? match[1] : content;
}
|
|
1456
1492
|
|
|
1457
1493
|
// src/providers/openai.ts
|
|
1458
1494
|
var REQUEST_TIMEOUT_MS = 6e4;
|
|
@@ -1501,7 +1537,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1501
1537
|
if (input.schema) {
|
|
1502
1538
|
params.response_format = { type: "json_object" };
|
|
1503
1539
|
params.messages = [
|
|
1504
|
-
{ role: "system", content:
|
|
1540
|
+
{ role: "system", content: buildSchemaSystemMessage(input.schema) },
|
|
1505
1541
|
...params.messages
|
|
1506
1542
|
];
|
|
1507
1543
|
}
|
|
@@ -1509,7 +1545,9 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1509
1545
|
params.tools = input.tools.map(toolDefToOpenAI);
|
|
1510
1546
|
params.tool_choice = "auto";
|
|
1511
1547
|
}
|
|
1512
|
-
const
|
|
1548
|
+
const reqOpts = { signal: input.signal };
|
|
1549
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
1550
|
+
const response = await client.chat.completions.create(params, reqOpts);
|
|
1513
1551
|
let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
|
|
1514
1552
|
let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
|
|
1515
1553
|
const choice = response.choices[0];
|
|
@@ -1543,7 +1581,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1543
1581
|
const followUp = await client.chat.completions.create({
|
|
1544
1582
|
model: requestModel,
|
|
1545
1583
|
messages: toolMessages
|
|
1546
|
-
},
|
|
1584
|
+
}, reqOpts);
|
|
1547
1585
|
totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
|
|
1548
1586
|
totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
|
|
1549
1587
|
finalResponse = followUp;
|
|
@@ -1588,7 +1626,7 @@ function toolDefToOpenAI(tool) {
|
|
|
1588
1626
|
function: {
|
|
1589
1627
|
name: tool.name,
|
|
1590
1628
|
description: tool.description,
|
|
1591
|
-
parameters:
|
|
1629
|
+
parameters: zodToJsonSchema2(tool.parameters, { target: "openAi" })
|
|
1592
1630
|
}
|
|
1593
1631
|
};
|
|
1594
1632
|
}
|
|
@@ -1637,8 +1675,14 @@ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1637
1675
|
if (!apiKey) return void 0;
|
|
1638
1676
|
return { client: new OpenAI2({ apiKey, timeout: timeoutMs }), model };
|
|
1639
1677
|
}
|
|
1678
|
+
/**
 * Heuristically decides whether an error reports that the `temperature`
 * parameter was rejected.
 *
 * The message (or the stringified value for non-Error inputs) is lower-cased;
 * it must mention "temperature" together with one of the rejection phrases.
 *
 * @param {unknown} err - Caught error value.
 * @returns {boolean}
 */
function isTemperatureError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  if (!text.includes("temperature")) {
    return false;
  }
  const rejectionPhrases = ["not supported", "is not allowed", "unsupported", "invalid"];
  return rejectionPhrases.some((phrase) => text.includes(phrase));
}
|
|
1640
1683
|
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1641
1684
|
let cached = void 0;
|
|
1685
|
+
let useTemperature = true;
|
|
1642
1686
|
return async ({ task, result }) => {
|
|
1643
1687
|
if (task.expected === void 0) {
|
|
1644
1688
|
return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
|
|
@@ -1655,35 +1699,24 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1655
1699
|
}
|
|
1656
1700
|
const { client, model } = cached;
|
|
1657
1701
|
const prompt = JUDGE_PROMPT.replace("{task}", task.prompt).replace("{expected}", JSON.stringify(task.expected)).replace("{actual}", JSON.stringify(result.output));
|
|
1702
|
+
const messages = [{ role: "user", content: prompt }];
|
|
1658
1703
|
try {
|
|
1659
|
-
const response = await client
|
|
1660
|
-
|
|
1661
|
-
messages: [{ role: "user", content: prompt }],
|
|
1662
|
-
max_completion_tokens: 2048
|
|
1663
|
-
});
|
|
1664
|
-
const content = response.choices[0]?.message?.content?.trim() ?? "";
|
|
1665
|
-
const parsed = {};
|
|
1666
|
-
for (const line of content.split("\n")) {
|
|
1667
|
-
const match = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
|
|
1668
|
-
if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
|
|
1669
|
-
}
|
|
1670
|
-
const accuracy = parsed.accuracy;
|
|
1671
|
-
const completeness = parsed.completeness;
|
|
1672
|
-
const conciseness = parsed.conciseness;
|
|
1673
|
-
if (accuracy == null || completeness == null || conciseness == null || [accuracy, completeness, conciseness].some((s) => isNaN(s) || s < 0 || s > 1)) {
|
|
1674
|
-
return {
|
|
1675
|
-
name: "llm-judge-correctness",
|
|
1676
|
-
value: -1,
|
|
1677
|
-
details: { reason: `judge returned unparseable scores: "${content}"`, model }
|
|
1678
|
-
};
|
|
1679
|
-
}
|
|
1680
|
-
const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
|
|
1681
|
-
return {
|
|
1682
|
-
name: "llm-judge-correctness",
|
|
1683
|
-
value: composite,
|
|
1684
|
-
details: { model, accuracy, completeness, conciseness }
|
|
1685
|
-
};
|
|
1704
|
+
const response = await callJudge(client, model, messages, useTemperature);
|
|
1705
|
+
return parseJudgeResponse(response, model);
|
|
1686
1706
|
} catch (err) {
|
|
1707
|
+
if (useTemperature && isTemperatureError(err)) {
|
|
1708
|
+
useTemperature = false;
|
|
1709
|
+
try {
|
|
1710
|
+
const response = await callJudge(client, model, messages, false);
|
|
1711
|
+
return parseJudgeResponse(response, model);
|
|
1712
|
+
} catch (retryErr) {
|
|
1713
|
+
return {
|
|
1714
|
+
name: "llm-judge-correctness",
|
|
1715
|
+
value: -1,
|
|
1716
|
+
details: { reason: `judge call failed: ${retryErr instanceof Error ? retryErr.message : String(retryErr)}` }
|
|
1717
|
+
};
|
|
1718
|
+
}
|
|
1719
|
+
}
|
|
1687
1720
|
return {
|
|
1688
1721
|
name: "llm-judge-correctness",
|
|
1689
1722
|
value: -1,
|
|
@@ -1692,18 +1725,89 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1692
1725
|
}
|
|
1693
1726
|
};
|
|
1694
1727
|
}
|
|
1728
|
+
/**
 * Sends the judge prompt through `client.chat.completions.create`.
 *
 * @param {object} client - Client exposing `chat.completions.create`.
 * @param {string} model - Judge model name.
 * @param {Array<object>} messages - Chat messages to send.
 * @param {boolean} withTemperature - When truthy, `temperature: 0` is added to
 *   the request; otherwise the parameter is omitted entirely.
 * @returns {Promise<unknown>} Whatever the client's `create` call resolves to.
 */
async function callJudge(client, model, messages, withTemperature) {
  const request = { model, messages, max_completion_tokens: 2048 };
  if (withTemperature) {
    request.temperature = 0;
  }
  return client.chat.completions.create(request);
}
|
|
1736
|
+
/**
 * Turns the judge model's reply into an "llm-judge-correctness" score.
 *
 * The reply is scanned line by line for `accuracy`, `completeness` and
 * `conciseness` values. Labels may be indented, bulleted, quoted or wrapped in
 * markdown emphasis (e.g. `- **Accuracy**: 0.9`), so formatting does not
 * discard an otherwise valid verdict. When a label appears more than once the
 * last occurrence wins.
 *
 * @param {object} response - Chat completion; the text is read from
 *   `choices[0].message.content`.
 * @param {string} model - Judge model name, echoed in `details`.
 * @returns {{name: string, value: number, details: object}} The mean of the
 *   three scores rounded to two decimals, or `value: -1` when any score is
 *   missing, not a number, or outside [0, 1].
 */
function parseJudgeResponse(response, model) {
  const content = response.choices[0]?.message?.content?.trim() ?? "";
  // Leading whitespace / list / quote / heading / emphasis markers are skipped,
  // and emphasis markers are allowed on either side of the colon.
  const scoreLine = /^[\s>*_#-]*(accuracy|completeness|conciseness)[\s*_]*:[\s*_]*([\d.]+)/i;
  const parsed = {};
  for (const line of content.split("\n")) {
    const match = scoreLine.exec(line);
    if (match) parsed[match[1].toLowerCase()] = Number.parseFloat(match[2]);
  }
  const { accuracy, completeness, conciseness } = parsed;
  const scores = [accuracy, completeness, conciseness];
  const unusable = scores.some((s) => s == null || Number.isNaN(s) || s < 0 || s > 1);
  if (unusable) {
    return {
      name: "llm-judge-correctness",
      value: -1,
      details: { reason: `judge returned unparseable scores: "${content}"`, model }
    };
  }
  const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
  return {
    name: "llm-judge-correctness",
    value: composite,
    details: { model, accuracy, completeness, conciseness }
  };
}
|
|
1695
1760
|
|
|
1696
1761
|
// src/scorers/tool-usage.ts
|
|
1697
1762
|
/**
 * Scores whether the model invoked the task's tools as intended.
 *
 * - No tools configured on the task: value -1.
 * - `task.expected` is a plain (non-array) object: it is compared with the
 *   arguments of each call made to a configured tool.
 *     1   - some such call's arguments satisfy `deepEqual(expected, args)`;
 *     0.5 - some such call shares at least one argument name with `expected`
 *           but the values do not all match;
 *     0   - otherwise.
 *   Calls to tools that are not configured on the task are ignored in both
 *   checks, so a made-up tool name never earns "correct tool" credit.
 * - Otherwise: 1 when the first configured tool was called at all, else 0.
 */
var toolUsageScorer = ({ task, result }) => {
  if (!task.tools?.length) {
    return { name: "tool-usage", value: -1, details: { reason: "no tools configured on task" } };
  }
  const calls = result.toolCalls ?? [];
  const configuredTools = new Set(task.tools.map((t) => t.name));
  const expectedIsObject = typeof task.expected === "object" && task.expected !== null && !Array.isArray(task.expected);
  if (expectedIsObject) {
    const matchingCall = calls.find((c) => configuredTools.has(c.name) && deepEqual(task.expected, c.arguments));
    if (matchingCall) {
      return {
        name: "tool-usage",
        value: 1,
        details: { matchedTool: matchingCall.name, arguments: matchingCall.arguments, toolCalls: calls }
      };
    }
    const expectedKeys = Object.keys(task.expected);
    const partialMatch = calls.find((c) => {
      // "Correct tool" must mean a tool the task actually offers.
      if (!configuredTools.has(c.name)) return false;
      if (typeof c.arguments !== "object" || c.arguments === null) return false;
      const argKeys = Object.keys(c.arguments);
      return expectedKeys.some((k) => argKeys.includes(k));
    });
    if (partialMatch) {
      return {
        name: "tool-usage",
        value: 0.5,
        details: {
          reason: "correct tool but wrong arguments",
          expected: task.expected,
          actual: partialMatch.arguments,
          toolCalls: calls
        }
      };
    }
    return {
      name: "tool-usage",
      value: 0,
      details: { reason: "no matching tool call", expected: task.expected, toolCalls: calls }
    };
  }
  const expectedToolName = task.tools[0].name;
  const usedTool = calls.some((c) => c.name === expectedToolName);
  return {
    name: "tool-usage",
    value: usedTool ? 1 : 0,
    details: { expectedToolName, usedTool, toolCalls: calls }
  };
};
|
|
1709
1813
|
|
|
@@ -1764,7 +1868,8 @@ async function runBenchmarks(options) {
|
|
|
1764
1868
|
prompt: task.prompt,
|
|
1765
1869
|
schema: task.schema,
|
|
1766
1870
|
tools: task.tools,
|
|
1767
|
-
signal
|
|
1871
|
+
signal,
|
|
1872
|
+
timeout
|
|
1768
1873
|
}), timeout);
|
|
1769
1874
|
const scores = await Promise.all(
|
|
1770
1875
|
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
@@ -1973,37 +2078,76 @@ function computeColumnStats(providerData, scorerNames) {
|
|
|
1973
2078
|
}
|
|
1974
2079
|
return stats;
|
|
1975
2080
|
}
|
|
2081
|
+
// Scorer names that measure output quality; every other column is counted as
// an efficiency metric when medals are tallied.
var QUALITY_SCORERS = /* @__PURE__ */ new Set([
  "correctness",
  "schema-correctness",
  "fuzzy-similarity",
  "llm-judge-correctness",
  "tool-usage"
]);

/**
 * Decides whether a provider is eligible for ranking.
 *
 * @param {string} providerId - Provider to check.
 * @param {Map<string, {values: Map<string, number|undefined>}>} columnStats -
 *   Per-scorer statistics keyed by scorer name.
 * @returns {boolean} `true` when no quality column exists at all, or when the
 *   provider has a value greater than 0 in at least one quality column.
 */
function passesQualityGate(providerId, columnStats) {
  let sawQualityColumn = false;
  for (const [column, stats] of columnStats) {
    if (!QUALITY_SCORERS.has(column)) continue;
    sawQualityColumn = true;
    const score = stats?.values.get(providerId);
    if (score !== void 0 && score > 0) {
      return true;
    }
  }
  return !sawQualityColumn;
}
|
|
1976
2096
|
/**
 * Assigns "gold" / "silver" / "bronze" / "none" to each provider.
 *
 * A column counts as a win only when exactly one provider holds its best
 * value; wins are tallied separately for quality columns (`QUALITY_SCORERS`)
 * and all other columns. Providers that fail `passesQualityGate` always get
 * "none". The rest are ordered by quality wins, then efficiency wins, then id;
 * providers tied on both counts share a medal, and the next distinct provider
 * takes the medal for its position in the ordering.
 *
 * Everyone gets "none" when fewer than two providers are given or when no
 * column has a unique winner.
 *
 * @param {Map<string, {best: number|undefined, values: Map<string, number|undefined>}>} columnStats
 * @param {string[]} providerIds
 * @returns {Map<string, string>} Medal per provider id.
 */
function computeMedals(columnStats, providerIds) {
  const medals = /* @__PURE__ */ new Map();
  const noMedals = () => {
    for (const id of providerIds) medals.set(id, "none");
    return medals;
  };
  if (providerIds.length < 2) return noMedals();

  const eligible = new Set(providerIds.filter((id) => passesQualityGate(id, columnStats)));
  const qualityWins = new Map(providerIds.map((id) => [id, 0]));
  const efficiencyWins = new Map(providerIds.map((id) => [id, 0]));
  let totalWins = 0;
  for (const [colName, colStats] of columnStats) {
    if (colStats.best === void 0) continue;
    const leaders = [];
    for (const [id, v] of colStats.values) {
      if (v !== void 0 && v === colStats.best) leaders.push(id);
    }
    // Shared best values award nobody.
    if (leaders.length !== 1) continue;
    const tally = QUALITY_SCORERS.has(colName) ? qualityWins : efficiencyWins;
    tally.set(leaders[0], (tally.get(leaders[0]) ?? 0) + 1);
    totalWins += 1;
  }
  if (totalWins === 0) return noMedals();

  const quality = (id) => qualityWins.get(id) ?? 0;
  const efficiency = (id) => efficiencyWins.get(id) ?? 0;
  const ranked = providerIds
    .filter((id) => eligible.has(id))
    .sort((a, b) => quality(b) - quality(a) || efficiency(b) - efficiency(a) || a.localeCompare(b));

  const medalList = ["gold", "silver", "bronze"];
  let rank = 0;
  ranked.forEach((id, i) => {
    if (i > 0) {
      const prev = ranked[i - 1];
      const fellBehind =
        quality(id) < quality(prev) || (quality(id) === quality(prev) && efficiency(id) < efficiency(prev));
      if (fellBehind) rank = i;
    }
    medals.set(id, rank < medalList.length ? medalList[rank] : "none");
  });
  for (const id of providerIds) {
    if (!eligible.has(id)) medals.set(id, "none");
  }
  return medals;
}
|
|
@@ -2400,24 +2544,10 @@ function printSummary(results, providers, byProvider) {
|
|
|
2400
2544
|
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2401
2545
|
}
|
|
2402
2546
|
}
|
|
2403
|
-
if (!single) {
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2408
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2409
|
-
const maxWins = Math.max(...wins.values());
|
|
2410
|
-
if (maxWins > 0) {
|
|
2411
|
-
const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2412
|
-
console.log("");
|
|
2413
|
-
if (topProviders.length === 1) {
|
|
2414
|
-
const [winnerId, winCount] = topProviders[0];
|
|
2415
|
-
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
|
|
2416
|
-
} else {
|
|
2417
|
-
const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
|
|
2418
|
-
console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
|
|
2419
|
-
}
|
|
2420
|
-
}
|
|
2547
|
+
if (!single && byCorrectness && byCorrectness.avg > 0) {
|
|
2548
|
+
console.log("");
|
|
2549
|
+
const pct = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
2550
|
+
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${byCorrectness.id}${reset} ${dim(providerLabel(byCorrectness.id))} ${dim(`(${pct} avg correctness)`)}`);
|
|
2421
2551
|
}
|
|
2422
2552
|
console.log("");
|
|
2423
2553
|
}
|
|
@@ -2451,15 +2581,15 @@ function defineArena(config) {
|
|
|
2451
2581
|
if (config.providers.length === 0) {
|
|
2452
2582
|
throw new Error("At least one provider is required");
|
|
2453
2583
|
}
|
|
2454
|
-
if (config.tasks.length === 0) {
|
|
2455
|
-
throw new Error("At least one task is required");
|
|
2456
|
-
}
|
|
2457
2584
|
const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
|
|
2458
2585
|
const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
|
|
2459
2586
|
const runs = config.runs ?? 1;
|
|
2460
2587
|
return {
|
|
2461
2588
|
config,
|
|
2462
2589
|
async run(options) {
|
|
2590
|
+
if (config.tasks.length === 0) {
|
|
2591
|
+
throw new Error("At least one task is required");
|
|
2592
|
+
}
|
|
2463
2593
|
return runBenchmarks({
|
|
2464
2594
|
providers: config.providers,
|
|
2465
2595
|
tasks: config.tasks,
|
|
@@ -2485,13 +2615,15 @@ function anthropic(model, options) {
|
|
|
2485
2615
|
model,
|
|
2486
2616
|
async run(input) {
|
|
2487
2617
|
const start = Date.now();
|
|
2488
|
-
const systemMessage = input.schema ?
|
|
2618
|
+
const systemMessage = input.schema ? buildSchemaSystemMessage(input.schema) : void 0;
|
|
2619
|
+
const reqOpts = { signal: input.signal };
|
|
2620
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
2489
2621
|
const response = await client.messages.create({
|
|
2490
2622
|
model,
|
|
2491
2623
|
max_tokens: maxTokens,
|
|
2492
2624
|
system: systemMessage,
|
|
2493
2625
|
messages: [{ role: "user", content: input.prompt }]
|
|
2494
|
-
},
|
|
2626
|
+
}, reqOpts);
|
|
2495
2627
|
const latencyMs = Date.now() - start;
|
|
2496
2628
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2497
2629
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
@@ -2636,17 +2768,8 @@ function htmlReporter(results) {
|
|
|
2636
2768
|
return { id, avg };
|
|
2637
2769
|
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
2638
2770
|
let overallWinner;
|
|
2639
|
-
if (multi) {
|
|
2640
|
-
|
|
2641
|
-
for (const id of providers) wins.set(id, 0);
|
|
2642
|
-
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
2643
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2644
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2645
|
-
const maxWins = Math.max(...wins.values());
|
|
2646
|
-
if (maxWins > 0) {
|
|
2647
|
-
const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2648
|
-
if (tops.length === 1) overallWinner = tops[0][0];
|
|
2649
|
-
}
|
|
2771
|
+
if (multi && byCorrectness && byCorrectness.avg > 0) {
|
|
2772
|
+
overallWinner = byCorrectness.id;
|
|
2650
2773
|
}
|
|
2651
2774
|
const errorResults = results.filter((r) => r.error);
|
|
2652
2775
|
const deduped = dedupeErrors(errorResults);
|
|
@@ -3167,7 +3290,7 @@ function renderErrors(errors) {
|
|
|
3167
3290
|
</div>`;
|
|
3168
3291
|
}).join("\n");
|
|
3169
3292
|
return `<section class="errors-section">
|
|
3170
|
-
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'
|
|
3293
|
+
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
|
|
3171
3294
|
<div class="errors-list">
|
|
3172
3295
|
${items}
|
|
3173
3296
|
</div>
|
|
@@ -3231,6 +3354,384 @@ function renderScript(taskCount) {
|
|
|
3231
3354
|
</script>`;
|
|
3232
3355
|
}
|
|
3233
3356
|
|
|
3357
|
+
// src/packs/structured-output.ts
|
|
3358
|
+
import { z } from "zod";
|
|
3359
|
+
// Built-in task pack "structured-output": six extraction/classification tasks,
// each pairing a prompt with an `expected` value and a zod `schema`, plus the
// list of scorer names the pack asks for.
var structuredOutputPack = {
  name: "structured-output",
  label: "Structured Output",
  description: "Zod schema stress test \u2014 flat objects, nesting, arrays, enums, empty arrays, and adversarial input",
  tasks: [
    // Flat object with string and number fields.
    {
      name: "so:flat-entity",
      prompt: "Extract the person's details from this text: 'Maria Garcia, age 34, works as a software architect in Barcelona, Spain. Her employee ID is EMP-2847.' Return as JSON.",
      expected: {
        name: "Maria Garcia",
        age: 34,
        role: "software architect",
        city: "Barcelona",
        country: "Spain",
        employeeId: "EMP-2847"
      },
      schema: z.object({
        name: z.string(),
        age: z.number(),
        role: z.string(),
        city: z.string(),
        country: z.string(),
        employeeId: z.string()
      })
    },
    // Three nested objects; shippingMethod is constrained to an enum.
    {
      name: "so:nested-object",
      prompt: "Parse this shipping label into structured JSON: 'Ship to: Acme Corp, Attn: John Lee, 4th Floor, 742 Evergreen Terrace, Springfield, IL 62704, USA. Order #ORD-9912, 3 items, 2.4kg, express shipping.' Use shippingMethod values: standard, express, or overnight. Return as JSON.",
      expected: {
        recipient: { company: "Acme Corp", contact: "John Lee", floor: "4th Floor" },
        address: { street: "742 Evergreen Terrace", city: "Springfield", state: "IL", zip: "62704", country: "USA" },
        order: { id: "ORD-9912", itemCount: 3, weightKg: 2.4, shippingMethod: "express" }
      },
      schema: z.object({
        recipient: z.object({ company: z.string(), contact: z.string(), floor: z.string() }),
        address: z.object({
          street: z.string(),
          city: z.string(),
          state: z.string(),
          zip: z.string(),
          country: z.string()
        }),
        order: z.object({
          id: z.string(),
          itemCount: z.number(),
          weightKg: z.number(),
          shippingMethod: z.enum(["standard", "express", "overnight"])
        })
      })
    },
    // Top-level array of objects.
    {
      name: "so:array-of-objects",
      prompt: "Extract all mentioned products with their prices and categories from this text: 'Our summer sale includes the UltraWidget Pro ($49.99, Electronics), ComfortMax Chair ($199.00, Furniture), and AquaPure Filter ($24.50, Home & Kitchen). The SmartLamp Mini is also available at $34.99 in the Electronics category.' Return as a JSON array.",
      expected: [
        { name: "UltraWidget Pro", price: 49.99, category: "Electronics" },
        { name: "ComfortMax Chair", price: 199, category: "Furniture" },
        { name: "AquaPure Filter", price: 24.5, category: "Home & Kitchen" },
        { name: "SmartLamp Mini", price: 34.99, category: "Electronics" }
      ],
      schema: z.array(z.object({ name: z.string(), price: z.number(), category: z.string() }))
    },
    // The expected answer contains empty arrays: the log reports no errors or warnings.
    {
      name: "so:empty-arrays",
      prompt: "Extract all error codes and their severity levels from this log message: 'System health check completed at 14:32 UTC. All services operational. No warnings or errors detected. Uptime: 99.97%.' Classify status as one of: healthy, degraded, or down. Return as JSON.",
      expected: { errors: [], warnings: [], status: "healthy", uptimePercent: 99.97 },
      schema: z.object({
        errors: z.array(z.object({ code: z.string(), severity: z.string() })),
        warnings: z.array(z.string()),
        status: z.enum(["healthy", "degraded", "down"]),
        uptimePercent: z.number()
      })
    },
    // Array of objects whose priority and category are both enum-constrained.
    {
      name: "so:enum-classification",
      prompt: "Classify each of these support tickets by priority (low/medium/high/critical) and category (billing/technical/account/general). Use just the letter (A, B, C, D) as the id.\nTicket A: 'My account was charged twice for the same subscription.'\nTicket B: 'The API returns 500 errors intermittently.'\nTicket C: 'How do I update my display name?'\nTicket D: 'Production database is completely unresponsive, all services down.'\nReturn as a JSON array.",
      expected: [
        { id: "A", priority: "high", category: "billing" },
        { id: "B", priority: "high", category: "technical" },
        { id: "C", priority: "low", category: "account" },
        { id: "D", priority: "critical", category: "technical" }
      ],
      schema: z.array(
        z.object({
          id: z.string(),
          priority: z.enum(["low", "medium", "high", "critical"]),
          category: z.enum(["billing", "technical", "account", "general"])
        })
      )
    },
    // The prompt embeds JSON-like fragments that must not leak into the answer;
    // the date is expected in YYYY-MM-DD form.
    {
      name: "so:adversarial-input",
      prompt: `Extract the actual product review data from this messy input. Ignore any JSON-like noise in the text.

User said: 'I bought the {product: "fake"} headphones for $59.99 and they're great! Rating: 5/5. The "noise-cancelling" feature works well even in {"noisy": true} environments. Would recommend to friend=true. Purchased on 01/15/2026.'
Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
      expected: {
        product: "headphones",
        price: 59.99,
        rating: 5,
        maxRating: 5,
        features: ["noise-cancelling"],
        recommended: true,
        purchaseDate: "2026-01-15"
      },
      schema: z.object({
        product: z.string(),
        price: z.number(),
        rating: z.number(),
        maxRating: z.number(),
        features: z.array(z.string()),
        recommended: z.boolean(),
        purchaseDate: z.string()
      })
    }
  ],
  // Scorer names requested by this pack.
  scorers: ["correctness", "schema-correctness", "latency", "cost"]
};
|
|
3476
|
+
|
|
3477
|
+
// src/packs/tool-calling.ts
|
|
3478
|
+
import { z as z2 } from "zod";
|
|
3479
|
+
/**
 * Built-in "tool-calling" task pack.
 *
 * Each task pairs a natural-language prompt with one or more tool definitions
 * (name, description, zod parameter schema, async handler) and an `expected`
 * value. The pack is scored with the "tool-usage", "latency" and "cost"
 * scorers.
 *
 * Handlers return canned data; they never perform real I/O.
 *
 * NOTE(review): `description` advertises "relevance detection", but no task
 * in this pack exercises it — confirm whether a task is missing or the
 * description should be trimmed.
 */
var toolCallingPack = {
  name: "tool-calling",
  label: "Tool Calling",
  description: "Function invocation accuracy \u2014 single calls, complex params, tool selection, parallel calls, and relevance detection",
  tasks: [
    // One tool, one required argument.
    {
      name: "tc:simple-single-tool",
      prompt: "What's the current weather in Tokyo?",
      tools: [{
        name: "getWeather",
        description: "Get current weather for a city",
        parameters: z2.object({
          city: z2.string(),
          units: z2.enum(["celsius", "fahrenheit"]).optional()
        }),
        // Canned reading: `tempC` is fixed; `units` is echoed back and
        // defaults to "celsius" when the caller omits it.
        handler: async ({ city, units }) => ({
          city,
          tempC: 8,
          condition: "cloudy",
          units: units ?? "celsius"
        })
      }],
      expected: { city: "Tokyo" }
    },
    // One tool whose arguments must all be extracted from the prompt.
    {
      name: "tc:complex-params",
      prompt: "Search for Italian restaurants within 2 miles of downtown Portland that are open now and have at least a 4-star rating.",
      tools: [{
        name: "searchRestaurants",
        description: "Search for restaurants matching criteria",
        parameters: z2.object({
          cuisine: z2.string(),
          location: z2.string(),
          radiusMiles: z2.number(),
          minRating: z2.number(),
          openNow: z2.boolean()
        }),
        // Arguments are ignored; a single fixed result is returned.
        handler: async (_args) => ({
          results: [{ name: "Trattoria Roma", rating: 4.5, distance: 1.2 }]
        })
      }],
      expected: {
        cuisine: "Italian",
        location: "downtown Portland",
        radiusMiles: 2,
        minRating: 4,
        openNow: true
      }
    },
    // Four tools offered; only `convertCurrency` matches the prompt.
    {
      name: "tc:select-from-many",
      prompt: "Convert 150 USD to Euros.",
      tools: [
        {
          name: "getWeather",
          description: "Get current weather for a city",
          parameters: z2.object({ city: z2.string() }),
          handler: async () => ({ tempC: 20 })
        },
        {
          name: "convertCurrency",
          description: "Convert an amount between currencies",
          parameters: z2.object({
            amount: z2.number(),
            from: z2.string(),
            to: z2.string()
          }),
          // Echoes the arguments alongside a fixed result and rate.
          handler: async ({ amount, from, to }) => ({
            amount,
            from,
            to,
            result: 138.75,
            rate: 0.925
          })
        },
        {
          name: "translateText",
          description: "Translate text between languages",
          parameters: z2.object({ text: z2.string(), targetLang: z2.string() }),
          handler: async () => ({ translated: "" })
        },
        {
          name: "calculateTip",
          description: "Calculate tip amount for a bill",
          parameters: z2.object({ billAmount: z2.number(), tipPercent: z2.number() }),
          handler: async () => ({ tip: 0 })
        }
      ],
      expected: { amount: 150, from: "USD", to: "EUR" }
    },
    // The prompt asks about two cities, so the same tool is needed twice.
    {
      name: "tc:parallel-calls",
      prompt: "I'm planning a trip. What's the weather like in both Paris and London right now?",
      tools: [{
        name: "getWeather",
        description: "Get current weather for a city",
        parameters: z2.object({ city: z2.string() }),
        handler: async ({ city }) => {
          const data = {
            Paris: { tempC: 12, condition: "partly cloudy" },
            London: { tempC: 9, condition: "rainy" }
          };
          // Own-property check: `city` comes from the model, and a plain
          // `data[city]` would resolve names such as "constructor" or
          // "toString" through Object.prototype instead of falling back.
          if (Object.prototype.hasOwnProperty.call(data, city)) {
            return data[city];
          }
          return { tempC: 15, condition: "unknown" };
        }
      }],
      expected: "weather data for Paris and London"
    }
  ],
  scorers: ["tool-usage", "latency", "cost"]
};
|
|
3589
|
+
|
|
3590
|
+
// src/packs/reasoning.ts
|
|
3591
|
+
import { z as z3 } from "zod";
|
|
3592
|
+
/**
 * Built-in "reasoning" task pack.
 *
 * Every task carries a multi-line prompt, the `expected` answer, and a zod
 * schema for the JSON reply. The pack is scored with the "correctness",
 * "latency" and "cost" scorers.
 */
var reasoningPack = {
  name: "reasoning",
  label: "Reasoning",
  description: "Logic, math, and multi-step thinking \u2014 arithmetic, deduction, data interpretation, critical path, and business rules",
  tasks: [
    // Q2 basic = 200 - 30 + 40 = 210, pro = 85 + 30 = 115 -> 210*49 + 115*149.
    {
      name: "rs:saas-mrr-calc",
      prompt: [
        "A SaaS company charges $49/month for the basic plan and $149/month for pro.",
        "In Q1 they had 200 basic subscribers and 85 pro subscribers.",
        "In Q2, 15% of basic users upgraded to pro and they gained 40 new basic subscribers.",
        "No one churned. What is the Q2 monthly recurring revenue (MRR)?",
        "Return as JSON with your reasoning and the final MRR number."
      ].join("\n"),
      expected: { mrr: 27425 },
      schema: z3.object({
        reasoning: z3.string().optional(),
        mrr: z3.number()
      })
    },
    // Constraint puzzle with a single consistent assignment.
    {
      name: "rs:logical-deduction",
      prompt: [
        "Five developers \u2014 Alice, Bob, Carol, Dave, and Eve \u2014 each use a different",
        "primary language: Rust, TypeScript, Python, Go, and Java. Given:",
        "1. Alice does not use Python, Java, or Go.",
        "2. Bob uses TypeScript.",
        "3. Carol uses neither Rust nor Go.",
        "4. Dave does not use Java.",
        "5. Eve uses neither Rust, Go, nor Java.",
        "What language does each developer use? Return as JSON."
      ].join("\n"),
      expected: {
        Alice: "Rust",
        Bob: "TypeScript",
        Carol: "Java",
        Dave: "Go",
        Eve: "Python"
      },
      schema: z3.object({
        Alice: z3.string(),
        Bob: z3.string(),
        Carol: z3.string(),
        Dave: z3.string(),
        Eve: z3.string()
      })
    },
    // Largest quarter-over-quarter jump is Q3 -> Q4 (2.2 -> 2.8); the year sums to 9.5.
    {
      name: "rs:data-interpretation",
      prompt: [
        "Given this quarterly revenue data:",
        "| Quarter | Revenue | Growth |",
        "|---------|---------|--------|",
        "| Q1 2025 | $2.1M | - |",
        "| Q2 2025 | $2.4M | 14.3% |",
        "| Q3 2025 | $2.2M | -8.3% |",
        "| Q4 2025 | $2.8M | 27.3% |",
        "",
        "Which quarter had the highest absolute revenue increase compared to the previous",
        "quarter? What was the full-year total revenue in millions? Return as JSON."
      ].join("\n"),
      expected: {
        highestGrowthQuarter: "Q4 2025",
        absoluteIncrease: 0.6,
        fullYearRevenue: 9.5
      },
      schema: z3.object({
        highestGrowthQuarter: z3.string(),
        absoluteIncrease: z3.number(),
        fullYearRevenue: z3.number()
      })
    },
    // Longest chain: Build 3 + Integration tests 8 + Staging deploy 2 + Smoke tests 3.
    {
      name: "rs:critical-path",
      prompt: [
        "A deployment pipeline has these stages with dependencies:",
        "- Build (3 min, no dependency)",
        "- Unit tests (5 min, depends on Build)",
        "- Integration tests (8 min, depends on Build)",
        "- Security scan (4 min, depends on Build)",
        "- Staging deploy (2 min, depends on Unit tests AND Integration tests AND Security scan)",
        "- Smoke tests (3 min, depends on Staging deploy)",
        "",
        "Assuming stages run in parallel where possible, what is the total pipeline",
        "duration in minutes? Which stages are on the critical path? Return as JSON."
      ].join("\n"),
      expected: {
        totalMinutes: 16,
        criticalPath: ["Build", "Integration tests", "Staging deploy", "Smoke tests"]
      },
      schema: z3.object({
        totalMinutes: z3.number(),
        criticalPath: z3.array(z3.string())
      })
    },
    // Stacked discounts: B = 100 * 0.7 * 0.85, D = 100 * 0.7, C takes the flat rate.
    {
      name: "rs:pricing-rules",
      prompt: [
        "Apply these pricing rules to each customer and return the final price:",
        "Rules:",
        "- Base price: $100",
        "- Enterprise customers (>100 seats): 30% discount",
        "- Annual billing: additional 15% off the discounted price",
        "- Non-profit organizations: flat $50 regardless of other rules",
        "",
        "Customers:",
        "A: 50 seats, monthly billing, for-profit",
        "B: 200 seats, annual billing, for-profit",
        "C: 75 seats, annual billing, non-profit",
        "D: 150 seats, monthly billing, for-profit",
        "",
        "Return as a JSON array with customer id and finalPrice."
      ].join("\n"),
      expected: [
        { id: "A", finalPrice: 100 },
        { id: "B", finalPrice: 59.5 },
        { id: "C", finalPrice: 50 },
        { id: "D", finalPrice: 70 }
      ],
      schema: z3.array(z3.object({
        id: z3.string(),
        finalPrice: z3.number()
      }))
    }
  ],
  scorers: ["correctness", "latency", "cost"]
};
|
|
3709
|
+
|
|
3710
|
+
// src/packs/index.ts
|
|
3711
|
+
// Pack lookup table keyed by pack name: written by register(), read by
// loadPack() and listPacks().
var registry = /* @__PURE__ */ new Map();
|
|
3712
|
+
/**
 * Store a pack in the registry under its own `name`.
 * A pack registered later under the same name replaces the earlier entry.
 *
 * @param {object} pack - Pack definition; its `name` property is the key.
 */
function register(pack) {
  const { name: packName } = pack;
  registry.set(packName, pack);
}
|
|
3715
|
+
// Register the built-in packs in a fixed order: structured output, tool
// calling, reasoning.
for (const builtInPack of [structuredOutputPack, toolCallingPack, reasoningPack]) {
  register(builtInPack);
}
|
|
3718
|
+
/**
 * Look up a registered pack by name.
 *
 * @param {string} name - Pack name as stored in the registry.
 * @returns {object} The registered pack.
 * @throws {Error} When no pack is registered under `name`; the message lists
 *   every registered pack name, comma-separated.
 */
function loadPack(name) {
  const found = registry.get(name);
  if (found) {
    return found;
  }
  const knownNames = Array.from(registry.keys()).join(", ");
  throw new Error(`Unknown pack "${name}". Available packs: ${knownNames}`);
}
|
|
3726
|
+
/**
 * Summarize every registered pack.
 *
 * @returns {Array<{name: *, label: *, description: *, taskCount: number}>}
 *   One entry per registered pack, in registry iteration order, where
 *   `taskCount` is the length of the pack's `tasks` array.
 */
function listPacks() {
  const summaries = [];
  for (const entry of registry.values()) {
    summaries.push({
      name: entry.name,
      label: entry.label,
      description: entry.description,
      taskCount: entry.tasks.length
    });
  }
  return summaries;
}
|
|
3734
|
+
|
|
3234
3735
|
// src/ci.ts
|
|
3235
3736
|
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
3236
3737
|
import { dirname } from "path";
|
|
@@ -3521,7 +4022,9 @@ export {
|
|
|
3521
4022
|
gemini,
|
|
3522
4023
|
htmlReporter,
|
|
3523
4024
|
jsonReporter,
|
|
4025
|
+
listPacks,
|
|
3524
4026
|
loadBaseline,
|
|
4027
|
+
loadPack,
|
|
3525
4028
|
markdownReporter,
|
|
3526
4029
|
openai,
|
|
3527
4030
|
openaiCompatible,
|