agent-duelist 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -40,7 +40,9 @@ __export(index_exports, {
40
40
  gemini: () => gemini,
41
41
  htmlReporter: () => htmlReporter,
42
42
  jsonReporter: () => jsonReporter,
43
+ listPacks: () => listPacks,
43
44
  loadBaseline: () => loadBaseline,
45
+ loadPack: () => loadPack,
44
46
  markdownReporter: () => markdownReporter,
45
47
  openai: () => openai,
46
48
  openaiCompatible: () => openaiCompatible,
@@ -1396,38 +1398,49 @@ var costScorer = ({ result }, providerId) => {
1396
1398
  };
1397
1399
  };
1398
1400
 
1401
+ // src/utils/deep-equal.ts
1402
/**
 * Lenient structural comparison of an expected value against an actual one.
 *
 * Rules, in order:
 * - identical values (`===`) are equal;
 * - two strings are equal when they match after trimming and lowercasing;
 * - values of different `typeof`, or where exactly one side is `null`, differ;
 * - an array only ever equals another array of the same length whose
 *   elements are pairwise equal under these same rules;
 * - a plain object equals another object when every own key of `expected`
 *   is an own key of `actual` with an equal value. Extra keys on `actual`
 *   are ignored, so the comparison is deliberately asymmetric.
 *
 * @param {unknown} expected - Reference value; drives which keys are checked.
 * @param {unknown} actual - Value under test.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Both-null was already caught by the identity check above.
  if (expected === null || actual === null) return false;

  // `typeof` reports "object" for arrays too, so an array must never be
  // allowed to fall through to the key-subset comparison against an object.
  const expectedIsArray = Array.isArray(expected);
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((val, i) => deepEqual(val, actual[i]));
  }

  if (typeof expected === "object") {
    // Own-property check: `in` would also accept inherited members such as
    // `constructor` or `toString` that every object carries.
    return Object.keys(expected).every(
      (key) => Object.prototype.hasOwnProperty.call(actual, key) && deepEqual(expected[key], actual[key])
    );
  }
  // Same-typed primitives (or functions) that are not identical.
  return false;
}
1421
+
1399
1422
  // src/scorers/correctness.ts
1400
1423
/**
 * Scores a task result by comparing its output with `task.expected`.
 *
 * Returns 0.5 when the task declares no expected value. Otherwise the output
 * is passed through `normalizeOutput` and compared with `deepEqual`, giving
 * 1 on a match and 0 otherwise. `details.actual` reports the raw
 * `result.output`, not the normalized value.
 */
var correctnessScorer = ({ task, result }) => {
  const { expected } = task;
  if (expected === void 0) {
    return { name: "correctness", value: 0.5, details: { reason: "no expected value" } };
  }
  const isMatch = deepEqual(expected, normalizeOutput(expected, result.output));
  const value = isMatch ? 1 : 0;
  return {
    name: "correctness",
    value,
    details: { expected, actual: result.output }
  };
};
1411
- function deepEqual(a, b) {
1412
- if (a === b) return true;
1413
- if (typeof a === "string" && typeof b === "string") {
1414
- return a.trim().toLowerCase() === b.trim().toLowerCase();
1415
- }
1416
- if (typeof a !== typeof b) return false;
1417
- if (a === null || b === null) return a === b;
1418
- if (Array.isArray(a) && Array.isArray(b)) {
1419
- if (a.length !== b.length) return false;
1420
- return a.every((val, i) => deepEqual(val, b[i]));
1421
- }
1422
- if (typeof a === "object" && typeof b === "object") {
1423
- const objA = a;
1424
- const objB = b;
1425
- const keysA = Object.keys(objA);
1426
- const keysB = Object.keys(objB);
1427
- if (keysA.length !== keysB.length) return false;
1428
- return keysA.every((key) => key in objB && deepEqual(objA[key], objB[key]));
1429
- }
1430
- return a === b;
1435
/**
 * Unwraps an array that was returned inside a wrapper object.
 *
 * When `expected` is an array but `actual` is a non-null, non-array object
 * holding exactly one array-valued property, that array is returned in place
 * of the wrapper. In every other case `actual` is returned untouched.
 *
 * @param {unknown} expected - The task's expected value.
 * @param {unknown} actual - The provider's output.
 * @returns {unknown} The unwrapped array, or `actual` as given.
 */
function normalizeOutput(expected, actual) {
  const isWrapperCandidate =
    Array.isArray(expected) &&
    actual !== null &&
    typeof actual === "object" &&
    !Array.isArray(actual);
  if (!isWrapperCandidate) return actual;

  const arrayValues = Object.values(actual).filter((value) => Array.isArray(value));
  return arrayValues.length === 1 ? arrayValues[0] : actual;
}
1432
1445
 
1433
1446
  // src/scorers/schema-correctness.ts
@@ -1447,7 +1460,14 @@ var schemaCorrectnessScorer = ({ task, result }) => {
1447
1460
  };
1448
1461
  }
1449
1462
  }
1450
- const parsed = task.schema.safeParse(data);
1463
+ let parsed = task.schema.safeParse(data);
1464
+ if (!parsed.success && !Array.isArray(data) && typeof data === "object" && data !== null) {
1465
+ const arrayEntries = Object.entries(data).filter(([, v]) => Array.isArray(v));
1466
+ if (arrayEntries.length === 1) {
1467
+ const unwrapped = task.schema.safeParse(arrayEntries[0][1]);
1468
+ if (unwrapped.success) parsed = unwrapped;
1469
+ }
1470
+ }
1451
1471
  return {
1452
1472
  name: "schema-correctness",
1453
1473
  value: parsed.success ? 1 : 0,
@@ -1493,18 +1513,36 @@ var import_openai2 = __toESM(require("openai"), 1);
1493
1513
 
1494
1514
  // src/providers/openai.ts
1495
1515
  var import_openai = __toESM(require("openai"), 1);
1496
- var import_zod_to_json_schema = require("zod-to-json-schema");
1516
+ var import_zod_to_json_schema2 = require("zod-to-json-schema");
1497
1517
 
1498
1518
  // src/providers/shared.ts
1499
- var SCHEMA_SYSTEM_MESSAGE = "Respond with valid JSON matching the requested schema.";
1519
+ var import_zod_to_json_schema = require("zod-to-json-schema");
1520
/**
 * Builds the system prompt that asks a model for JSON output.
 *
 * Without a schema a short generic instruction is returned. With a schema,
 * it is converted via `zodToJsonSchema` (target "openAi"), pretty-printed,
 * and embedded between instructions telling the model to emit data values
 * rather than echo the schema definition.
 *
 * @param {unknown} schema - Schema accepted by `zodToJsonSchema`, or a falsy value.
 * @returns {string} Newline-joined system message.
 */
function buildSchemaSystemMessage(schema) {
  if (!schema) return "Respond with valid JSON.";

  const { zodToJsonSchema } = import_zod_to_json_schema;
  const schemaText = JSON.stringify(zodToJsonSchema(schema, { target: "openAi" }), null, 2);

  const preamble = "Respond with ONLY valid JSON data. No markdown, no code fences, no explanation.";
  const schemaIntro = "Your output must conform to this JSON Schema:";
  const dataNotSchema = "IMPORTANT: Output the actual data values, NOT the schema definition itself.";
  const noSchemaKeys = 'Do NOT include keys like "type", "$schema", or "items" from the schema definition in your response.';

  return [preamble, "", schemaIntro, schemaText, "", dataNotSchema, noSchemaKeys].join("\n");
}
1500
1533
/**
 * Turns raw model text into parsed JSON when a schema was requested.
 *
 * Without a schema the raw content is returned as-is. With one, code fences
 * are stripped via `stripCodeFences` and the remainder is parsed; if parsing
 * fails the original raw content is returned instead of throwing.
 *
 * @param {string} rawContent - Text returned by the provider.
 * @param {boolean} hasSchema - Whether the task requested structured output.
 * @returns {unknown} Parsed JSON value, or `rawContent` when not parsed.
 */
function parseSchemaOutput(rawContent, hasSchema) {
  if (hasSchema) {
    const jsonText = stripCodeFences(rawContent);
    try {
      return JSON.parse(jsonText);
    } catch {
      // Not valid JSON: fall through and hand back the raw text.
    }
  }
  return rawContent;
}
1542
/**
 * Extracts the payload of a Markdown code fence, if the content has one.
 *
 * The opening fence may carry a `json` tag in any letter case, so
 * "```JSON" is handled like "```json". The `m` flag lets the fence start
 * and end on any line, so text before or after the fenced block is
 * tolerated. Content without a fence is returned unchanged.
 *
 * Non-string input is returned as-is rather than throwing, because the
 * caller invokes this helper outside its own try/catch.
 *
 * @param {unknown} content - Raw model output.
 * @returns {unknown} The fenced payload, or `content` when nothing matched.
 */
function stripCodeFences(content) {
  if (typeof content !== "string") return content;
  const match = content.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?\s*```$/im);
  return match ? match[1] : content;
}
1508
1546
 
1509
1547
  // src/providers/openai.ts
1510
1548
  var REQUEST_TIMEOUT_MS = 6e4;
@@ -1553,7 +1591,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1553
1591
  if (input.schema) {
1554
1592
  params.response_format = { type: "json_object" };
1555
1593
  params.messages = [
1556
- { role: "system", content: SCHEMA_SYSTEM_MESSAGE },
1594
+ { role: "system", content: buildSchemaSystemMessage(input.schema) },
1557
1595
  ...params.messages
1558
1596
  ];
1559
1597
  }
@@ -1561,7 +1599,9 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1561
1599
  params.tools = input.tools.map(toolDefToOpenAI);
1562
1600
  params.tool_choice = "auto";
1563
1601
  }
1564
- const response = await client.chat.completions.create(params, { signal: input.signal });
1602
+ const reqOpts = { signal: input.signal };
1603
+ if (input.timeout) reqOpts.timeout = input.timeout;
1604
+ const response = await client.chat.completions.create(params, reqOpts);
1565
1605
  let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
1566
1606
  let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
1567
1607
  const choice = response.choices[0];
@@ -1595,7 +1635,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1595
1635
  const followUp = await client.chat.completions.create({
1596
1636
  model: requestModel,
1597
1637
  messages: toolMessages
1598
- }, { signal: input.signal });
1638
+ }, reqOpts);
1599
1639
  totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
1600
1640
  totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
1601
1641
  finalResponse = followUp;
@@ -1640,7 +1680,7 @@ function toolDefToOpenAI(tool) {
1640
1680
  function: {
1641
1681
  name: tool.name,
1642
1682
  description: tool.description,
1643
- parameters: (0, import_zod_to_json_schema.zodToJsonSchema)(tool.parameters, { target: "openAi" })
1683
+ parameters: (0, import_zod_to_json_schema2.zodToJsonSchema)(tool.parameters, { target: "openAi" })
1644
1684
  }
1645
1685
  };
1646
1686
  }
@@ -1689,8 +1729,14 @@ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1689
1729
  if (!apiKey) return void 0;
1690
1730
  return { client: new import_openai2.default({ apiKey, timeout: timeoutMs }), model };
1691
1731
  }
1732
/**
 * Heuristically detects an error caused by sending a `temperature` value.
 *
 * The message (or the stringified value for non-Error inputs) is lowercased
 * and must mention "temperature" together with one of the rejection phrases
 * listed below.
 *
 * @param {unknown} err - Value caught from a failed request.
 * @returns {boolean} True when the message looks like a temperature rejection.
 */
function isTemperatureError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  if (!text.includes("temperature")) return false;
  const rejectionPhrases = ["not supported", "is not allowed", "unsupported", "invalid"];
  return rejectionPhrases.some((phrase) => text.includes(phrase));
}
1692
1737
  function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1693
1738
  let cached = void 0;
1739
+ let useTemperature = true;
1694
1740
  return async ({ task, result }) => {
1695
1741
  if (task.expected === void 0) {
1696
1742
  return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
@@ -1707,35 +1753,24 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1707
1753
  }
1708
1754
  const { client, model } = cached;
1709
1755
  const prompt = JUDGE_PROMPT.replace("{task}", task.prompt).replace("{expected}", JSON.stringify(task.expected)).replace("{actual}", JSON.stringify(result.output));
1756
+ const messages = [{ role: "user", content: prompt }];
1710
1757
  try {
1711
- const response = await client.chat.completions.create({
1712
- model,
1713
- messages: [{ role: "user", content: prompt }],
1714
- max_completion_tokens: 2048
1715
- });
1716
- const content = response.choices[0]?.message?.content?.trim() ?? "";
1717
- const parsed = {};
1718
- for (const line of content.split("\n")) {
1719
- const match = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
1720
- if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
1721
- }
1722
- const accuracy = parsed.accuracy;
1723
- const completeness = parsed.completeness;
1724
- const conciseness = parsed.conciseness;
1725
- if (accuracy == null || completeness == null || conciseness == null || [accuracy, completeness, conciseness].some((s) => isNaN(s) || s < 0 || s > 1)) {
1726
- return {
1727
- name: "llm-judge-correctness",
1728
- value: -1,
1729
- details: { reason: `judge returned unparseable scores: "${content}"`, model }
1730
- };
1731
- }
1732
- const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
1733
- return {
1734
- name: "llm-judge-correctness",
1735
- value: composite,
1736
- details: { model, accuracy, completeness, conciseness }
1737
- };
1758
+ const response = await callJudge(client, model, messages, useTemperature);
1759
+ return parseJudgeResponse(response, model);
1738
1760
  } catch (err) {
1761
+ if (useTemperature && isTemperatureError(err)) {
1762
+ useTemperature = false;
1763
+ try {
1764
+ const response = await callJudge(client, model, messages, false);
1765
+ return parseJudgeResponse(response, model);
1766
+ } catch (retryErr) {
1767
+ return {
1768
+ name: "llm-judge-correctness",
1769
+ value: -1,
1770
+ details: { reason: `judge call failed: ${retryErr instanceof Error ? retryErr.message : String(retryErr)}` }
1771
+ };
1772
+ }
1773
+ }
1739
1774
  return {
1740
1775
  name: "llm-judge-correctness",
1741
1776
  value: -1,
@@ -1744,18 +1779,89 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1744
1779
  }
1745
1780
  };
1746
1781
  }
1782
/**
 * Sends the judge prompt through the client's chat-completions endpoint.
 *
 * The request always carries `model`, `messages` and a 2048-token completion
 * cap. `temperature: 0` is added only when `withTemperature` is truthy.
 *
 * @param {object} client - Object exposing `chat.completions.create`.
 * @param {string} model - Judge model identifier.
 * @param {Array<object>} messages - Chat messages to send.
 * @param {boolean} withTemperature - Whether to pin temperature to 0.
 * @returns {Promise<unknown>} Whatever `create` resolves to.
 */
async function callJudge(client, model, messages, withTemperature) {
  const request = { model, messages, max_completion_tokens: 2048 };
  if (withTemperature) {
    request.temperature = 0;
  }
  return client.chat.completions.create(request);
}
1790
/**
 * Converts a judge completion into an "llm-judge-correctness" score.
 *
 * The first choice's message text is scanned line by line for
 * `accuracy`, `completeness` and `conciseness` entries of the form
 * `<name>: <number>`. Leading indentation and common Markdown decoration
 * (bullets, bold/italic markers, headings, block quotes) around the name and
 * the colon are tolerated, so a line such as `- **Accuracy**: 0.9` counts.
 * If a name appears more than once, the last occurrence wins.
 *
 * All three scores must be present and within [0, 1]; otherwise the result
 * has value -1 and quotes the unparseable content. On success the value is
 * the mean of the three scores rounded to two decimals.
 *
 * @param {object} response - Chat completion exposing a `choices` array.
 * @param {string} model - Judge model name, echoed in `details`.
 * @returns {{name: string, value: number, details: object}} Score record.
 */
function parseJudgeResponse(response, model) {
  const content = response.choices[0]?.message?.content?.trim() ?? "";
  // Optional decoration, the metric name, a colon (possibly wrapped in
  // emphasis markers), then the numeric score.
  const scoreLine = /^[\s*\-_#>]*(accuracy|completeness|conciseness)[\s*_]*:[\s*_]*([\d.]+)/i;
  const parsed = {};
  for (const line of content.split("\n")) {
    const match = line.match(scoreLine);
    if (match) parsed[match[1].toLowerCase()] = Number.parseFloat(match[2]);
  }
  const { accuracy, completeness, conciseness } = parsed;
  const scores = [accuracy, completeness, conciseness];
  // A missing metric is `undefined`; a malformed number such as "." is NaN.
  const allValid = scores.every(
    (s) => typeof s === "number" && !Number.isNaN(s) && s >= 0 && s <= 1
  );
  if (!allValid) {
    return {
      name: "llm-judge-correctness",
      value: -1,
      details: { reason: `judge returned unparseable scores: "${content}"`, model }
    };
  }
  const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
  return {
    name: "llm-judge-correctness",
    value: composite,
    details: { model, accuracy, completeness, conciseness }
  };
}
1747
1814
 
1748
1815
  // src/scorers/tool-usage.ts
1749
1816
/**
 * Scores whether the provider used the task's tools as intended.
 *
 * - No tools on the task: value -1 (scorer not applicable).
 * - `task.expected` is a plain object: it is treated as the expected tool
 *   arguments. A call to a configured tool whose arguments satisfy
 *   `deepEqual(task.expected, call.arguments)` scores 1. Failing that, a call
 *   to a configured tool whose argument object shares at least one key with
 *   the expected object scores 0.5. Anything else scores 0.
 * - Otherwise: 1 when any call names the first configured tool, else 0.
 *
 * Calls naming a tool that is not configured on the task are ignored for
 * both the exact and the partial match, so the "correct tool but wrong
 * arguments" verdict is only given when the tool really is one of the
 * task's tools.
 */
var toolUsageScorer = ({ task, result }) => {
  if (!task.tools?.length) {
    return { name: "tool-usage", value: -1, details: { reason: "no tools configured on task" } };
  }
  const calls = result.toolCalls ?? [];
  const isConfiguredTool = (call) => task.tools.some((t) => t.name === call.name);
  const expectedIsObject = task.expected !== void 0 && typeof task.expected === "object" && task.expected !== null && !Array.isArray(task.expected);
  if (expectedIsObject) {
    const matchingCall = calls.find((c) => isConfiguredTool(c) && deepEqual(task.expected, c.arguments));
    if (matchingCall) {
      return {
        name: "tool-usage",
        value: 1,
        details: { matchedTool: matchingCall.name, arguments: matchingCall.arguments, toolCalls: calls }
      };
    }
    const expectedKeys = Object.keys(task.expected);
    const partialMatch = calls.find((c) => {
      // Partial credit is for the right tool with wrong arguments, so a call
      // to an unknown tool must not qualify just because a key overlaps.
      if (!isConfiguredTool(c)) return false;
      if (typeof c.arguments !== "object" || c.arguments === null) return false;
      const argKeys = Object.keys(c.arguments);
      return expectedKeys.some((k) => argKeys.includes(k));
    });
    if (partialMatch) {
      return {
        name: "tool-usage",
        value: 0.5,
        details: {
          reason: "correct tool but wrong arguments",
          expected: task.expected,
          actual: partialMatch.arguments,
          toolCalls: calls
        }
      };
    }
    return {
      name: "tool-usage",
      value: 0,
      details: { reason: "no matching tool call", expected: task.expected, toolCalls: calls }
    };
  }
  // No object-shaped expectation: only check that the first tool was called.
  const expectedToolName = task.tools[0].name;
  const usedTool = calls.some((c) => c.name === expectedToolName);
  return {
    name: "tool-usage",
    value: usedTool ? 1 : 0,
    details: { expectedToolName, usedTool, toolCalls: calls }
  };
};
1761
1867
 
@@ -1816,7 +1922,8 @@ async function runBenchmarks(options) {
1816
1922
  prompt: task.prompt,
1817
1923
  schema: task.schema,
1818
1924
  tools: task.tools,
1819
- signal
1925
+ signal,
1926
+ timeout
1820
1927
  }), timeout);
1821
1928
  const scores = await Promise.all(
1822
1929
  scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
@@ -2025,37 +2132,76 @@ function computeColumnStats(providerData, scorerNames) {
2025
2132
  }
2026
2133
  return stats;
2027
2134
  }
2135
// Scorer names that measure output quality; every other column counts as an
// efficiency metric when wins are tallied.
var QUALITY_SCORERS = /* @__PURE__ */ new Set([
  "correctness",
  "schema-correctness",
  "fuzzy-similarity",
  "llm-judge-correctness",
  "tool-usage"
]);

/**
 * Decides whether a provider is eligible for ranking.
 *
 * A provider passes when no quality column is present at all, or when it has
 * a defined value greater than zero in at least one quality column.
 *
 * @param {string} providerId - Provider to check.
 * @param {Map<string, {values: Map<string, number|undefined>}>} columnStats -
 *   Per-scorer statistics keyed by scorer name.
 * @returns {boolean} True when the provider clears the gate.
 */
function passesQualityGate(providerId, columnStats) {
  let sawQualityColumn = false;
  for (const [column, stats] of columnStats) {
    if (!QUALITY_SCORERS.has(column)) continue;
    sawQualityColumn = true;
    const score = stats?.values.get(providerId);
    if (score !== void 0 && score > 0) return true;
  }
  return !sawQualityColumn;
}
2028
2150
/**
 * Assigns "gold" / "silver" / "bronze" / "none" to each provider.
 *
 * With fewer than two providers, or when no column has a single outright
 * winner, everyone gets "none". Otherwise each column with exactly one best
 * provider counts as a win, filed under quality or efficiency depending on
 * whether the column name is in `QUALITY_SCORERS`. Providers that pass
 * `passesQualityGate` are ordered by quality wins, then efficiency wins, then
 * id; providers with identical win counts share a rank, and the next distinct
 * provider takes the rank of its position in the ordering. Providers that
 * fail the gate always receive "none".
 *
 * @param {Map<string, {best: number|undefined, values: Map<string, number|undefined>}>} columnStats
 *   Per-scorer statistics keyed by scorer name.
 * @param {string[]} providerIds - Providers to rank.
 * @returns {Map<string, string>} Medal per provider id.
 */
function computeMedals(columnStats, providerIds) {
  const medals = /* @__PURE__ */ new Map();
  const everyoneNone = () => {
    for (const id of providerIds) medals.set(id, "none");
    return medals;
  };
  if (providerIds.length < 2) return everyoneNone();

  const eligible = new Set(providerIds.filter((id) => passesQualityGate(id, columnStats)));
  const qualityWins = new Map(providerIds.map((id) => [id, 0]));
  const efficiencyWins = new Map(providerIds.map((id) => [id, 0]));

  // Only columns with a single, untied leader produce a win.
  let totalWins = 0;
  for (const [colName, colStats] of columnStats) {
    const { best } = colStats;
    if (best === void 0) continue;
    const leaders = [...colStats.values.entries()].filter(([, v]) => v !== void 0 && v === best);
    if (leaders.length !== 1) continue;
    const [winnerId] = leaders[0];
    const tally = QUALITY_SCORERS.has(colName) ? qualityWins : efficiencyWins;
    tally.set(winnerId, (tally.get(winnerId) ?? 0) + 1);
    totalWins += 1;
  }
  if (totalWins === 0) return everyoneNone();

  const winsOf = (id) => [qualityWins.get(id) ?? 0, efficiencyWins.get(id) ?? 0];
  const ranked = providerIds.filter((id) => eligible.has(id)).sort((a, b) => {
    const [qualityA, efficiencyA] = winsOf(a);
    const [qualityB, efficiencyB] = winsOf(b);
    return qualityB - qualityA || efficiencyB - efficiencyA || a.localeCompare(b);
  });

  const medalList = ["gold", "silver", "bronze"];
  let rank = 0;
  ranked.forEach((id, position) => {
    if (position > 0) {
      const [prevQuality, prevEfficiency] = winsOf(ranked[position - 1]);
      const [currQuality, currEfficiency] = winsOf(id);
      // Strictly worse than the previous provider: drop to this position.
      // Equal counts keep the previous rank, so ties share a medal.
      const strictlyWorse = currQuality < prevQuality || (currQuality === prevQuality && currEfficiency < prevEfficiency);
      if (strictlyWorse) rank = position;
    }
    medals.set(id, rank < medalList.length ? medalList[rank] : "none");
  });
  for (const id of providerIds) {
    if (!eligible.has(id)) medals.set(id, "none");
  }
  return medals;
}
@@ -2452,24 +2598,10 @@ function printSummary(results, providers, byProvider) {
2452
2598
  console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
2453
2599
  }
2454
2600
  }
2455
- if (!single) {
2456
- const wins = /* @__PURE__ */ new Map();
2457
- for (const id of providers) wins.set(id, 0);
2458
- if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
2459
- if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
2460
- if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
2461
- const maxWins = Math.max(...wins.values());
2462
- if (maxWins > 0) {
2463
- const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
2464
- console.log("");
2465
- if (topProviders.length === 1) {
2466
- const [winnerId, winCount] = topProviders[0];
2467
- console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
2468
- } else {
2469
- const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
2470
- console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
2471
- }
2472
- }
2601
+ if (!single && byCorrectness && byCorrectness.avg > 0) {
2602
+ console.log("");
2603
+ const pct = `${Math.round(byCorrectness.avg * 100)}%`;
2604
+ console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${byCorrectness.id}${reset} ${dim(providerLabel(byCorrectness.id))} ${dim(`(${pct} avg correctness)`)}`);
2473
2605
  }
2474
2606
  console.log("");
2475
2607
  }
@@ -2503,15 +2635,15 @@ function defineArena(config) {
2503
2635
  if (config.providers.length === 0) {
2504
2636
  throw new Error("At least one provider is required");
2505
2637
  }
2506
- if (config.tasks.length === 0) {
2507
- throw new Error("At least one task is required");
2508
- }
2509
2638
  const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
2510
2639
  const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
2511
2640
  const runs = config.runs ?? 1;
2512
2641
  return {
2513
2642
  config,
2514
2643
  async run(options) {
2644
+ if (config.tasks.length === 0) {
2645
+ throw new Error("At least one task is required");
2646
+ }
2515
2647
  return runBenchmarks({
2516
2648
  providers: config.providers,
2517
2649
  tasks: config.tasks,
@@ -2537,13 +2669,15 @@ function anthropic(model, options) {
2537
2669
  model,
2538
2670
  async run(input) {
2539
2671
  const start = Date.now();
2540
- const systemMessage = input.schema ? SCHEMA_SYSTEM_MESSAGE : void 0;
2672
+ const systemMessage = input.schema ? buildSchemaSystemMessage(input.schema) : void 0;
2673
+ const reqOpts = { signal: input.signal };
2674
+ if (input.timeout) reqOpts.timeout = input.timeout;
2541
2675
  const response = await client.messages.create({
2542
2676
  model,
2543
2677
  max_tokens: maxTokens,
2544
2678
  system: systemMessage,
2545
2679
  messages: [{ role: "user", content: input.prompt }]
2546
- }, { signal: input.signal });
2680
+ }, reqOpts);
2547
2681
  const latencyMs = Date.now() - start;
2548
2682
  const textBlock = response.content.find((b) => b.type === "text");
2549
2683
  const rawContent = textBlock?.type === "text" ? textBlock.text : "";
@@ -2688,17 +2822,8 @@ function htmlReporter(results) {
2688
2822
  return { id, avg };
2689
2823
  }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
2690
2824
  let overallWinner;
2691
- if (multi) {
2692
- const wins = /* @__PURE__ */ new Map();
2693
- for (const id of providers) wins.set(id, 0);
2694
- if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
2695
- if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
2696
- if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
2697
- const maxWins = Math.max(...wins.values());
2698
- if (maxWins > 0) {
2699
- const tops = [...wins.entries()].filter(([, w]) => w === maxWins);
2700
- if (tops.length === 1) overallWinner = tops[0][0];
2701
- }
2825
+ if (multi && byCorrectness && byCorrectness.avg > 0) {
2826
+ overallWinner = byCorrectness.id;
2702
2827
  }
2703
2828
  const errorResults = results.filter((r) => r.error);
2704
2829
  const deduped = dedupeErrors(errorResults);
@@ -3219,7 +3344,7 @@ function renderErrors(errors) {
3219
3344
  </div>`;
3220
3345
  }).join("\n");
3221
3346
  return `<section class="errors-section">
3222
- <h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'block'">Errors</h2>
3347
+ <h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
3223
3348
  <div class="errors-list">
3224
3349
  ${items}
3225
3350
  </div>
@@ -3283,6 +3408,384 @@ function renderScript(taskCount) {
3283
3408
  </script>`;
3284
3409
  }
3285
3410
 
3411
+ // src/packs/structured-output.ts
3412
+ var import_zod = require("zod");
3413
+ var structuredOutputPack = {
3414
+ name: "structured-output",
3415
+ label: "Structured Output",
3416
+ description: "Zod schema stress test \u2014 flat objects, nesting, arrays, enums, empty arrays, and adversarial input",
3417
+ tasks: [
3418
+ {
3419
+ name: "so:flat-entity",
3420
+ prompt: "Extract the person's details from this text: 'Maria Garcia, age 34, works as a software architect in Barcelona, Spain. Her employee ID is EMP-2847.' Return as JSON.",
3421
+ expected: {
3422
+ name: "Maria Garcia",
3423
+ age: 34,
3424
+ role: "software architect",
3425
+ city: "Barcelona",
3426
+ country: "Spain",
3427
+ employeeId: "EMP-2847"
3428
+ },
3429
+ schema: import_zod.z.object({
3430
+ name: import_zod.z.string(),
3431
+ age: import_zod.z.number(),
3432
+ role: import_zod.z.string(),
3433
+ city: import_zod.z.string(),
3434
+ country: import_zod.z.string(),
3435
+ employeeId: import_zod.z.string()
3436
+ })
3437
+ },
3438
+ {
3439
+ name: "so:nested-object",
3440
+ prompt: "Parse this shipping label into structured JSON: 'Ship to: Acme Corp, Attn: John Lee, 4th Floor, 742 Evergreen Terrace, Springfield, IL 62704, USA. Order #ORD-9912, 3 items, 2.4kg, express shipping.' Use shippingMethod values: standard, express, or overnight. Return as JSON.",
3441
+ expected: {
3442
+ recipient: { company: "Acme Corp", contact: "John Lee", floor: "4th Floor" },
3443
+ address: { street: "742 Evergreen Terrace", city: "Springfield", state: "IL", zip: "62704", country: "USA" },
3444
+ order: { id: "ORD-9912", itemCount: 3, weightKg: 2.4, shippingMethod: "express" }
3445
+ },
3446
+ schema: import_zod.z.object({
3447
+ recipient: import_zod.z.object({ company: import_zod.z.string(), contact: import_zod.z.string(), floor: import_zod.z.string() }),
3448
+ address: import_zod.z.object({
3449
+ street: import_zod.z.string(),
3450
+ city: import_zod.z.string(),
3451
+ state: import_zod.z.string(),
3452
+ zip: import_zod.z.string(),
3453
+ country: import_zod.z.string()
3454
+ }),
3455
+ order: import_zod.z.object({
3456
+ id: import_zod.z.string(),
3457
+ itemCount: import_zod.z.number(),
3458
+ weightKg: import_zod.z.number(),
3459
+ shippingMethod: import_zod.z.enum(["standard", "express", "overnight"])
3460
+ })
3461
+ })
3462
+ },
3463
+ {
3464
+ name: "so:array-of-objects",
3465
+ prompt: "Extract all mentioned products with their prices and categories from this text: 'Our summer sale includes the UltraWidget Pro ($49.99, Electronics), ComfortMax Chair ($199.00, Furniture), and AquaPure Filter ($24.50, Home & Kitchen). The SmartLamp Mini is also available at $34.99 in the Electronics category.' Return as a JSON array.",
3466
+ expected: [
3467
+ { name: "UltraWidget Pro", price: 49.99, category: "Electronics" },
3468
+ { name: "ComfortMax Chair", price: 199, category: "Furniture" },
3469
+ { name: "AquaPure Filter", price: 24.5, category: "Home & Kitchen" },
3470
+ { name: "SmartLamp Mini", price: 34.99, category: "Electronics" }
3471
+ ],
3472
+ schema: import_zod.z.array(import_zod.z.object({ name: import_zod.z.string(), price: import_zod.z.number(), category: import_zod.z.string() }))
3473
+ },
3474
+ {
3475
+ name: "so:empty-arrays",
3476
+ prompt: "Extract all error codes and their severity levels from this log message: 'System health check completed at 14:32 UTC. All services operational. No warnings or errors detected. Uptime: 99.97%.' Classify status as one of: healthy, degraded, or down. Return as JSON.",
3477
+ expected: { errors: [], warnings: [], status: "healthy", uptimePercent: 99.97 },
3478
+ schema: import_zod.z.object({
3479
+ errors: import_zod.z.array(import_zod.z.object({ code: import_zod.z.string(), severity: import_zod.z.string() })),
3480
+ warnings: import_zod.z.array(import_zod.z.string()),
3481
+ status: import_zod.z.enum(["healthy", "degraded", "down"]),
3482
+ uptimePercent: import_zod.z.number()
3483
+ })
3484
+ },
3485
+ {
3486
+ name: "so:enum-classification",
3487
+ prompt: "Classify each of these support tickets by priority (low/medium/high/critical) and category (billing/technical/account/general). Use just the letter (A, B, C, D) as the id.\nTicket A: 'My account was charged twice for the same subscription.'\nTicket B: 'The API returns 500 errors intermittently.'\nTicket C: 'How do I update my display name?'\nTicket D: 'Production database is completely unresponsive, all services down.'\nReturn as a JSON array.",
3488
+ expected: [
3489
+ { id: "A", priority: "high", category: "billing" },
3490
+ { id: "B", priority: "high", category: "technical" },
3491
+ { id: "C", priority: "low", category: "account" },
3492
+ { id: "D", priority: "critical", category: "technical" }
3493
+ ],
3494
+ schema: import_zod.z.array(
3495
+ import_zod.z.object({
3496
+ id: import_zod.z.string(),
3497
+ priority: import_zod.z.enum(["low", "medium", "high", "critical"]),
3498
+ category: import_zod.z.enum(["billing", "technical", "account", "general"])
3499
+ })
3500
+ )
3501
+ },
3502
+ {
3503
+ name: "so:adversarial-input",
3504
+ prompt: `Extract the actual product review data from this messy input. Ignore any JSON-like noise in the text.
3505
+
3506
+ User said: 'I bought the {product: "fake"} headphones for $59.99 and they're great! Rating: 5/5. The "noise-cancelling" feature works well even in {"noisy": true} environments. Would recommend to friend=true. Purchased on 01/15/2026.'
3507
+ Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
3508
+ expected: {
3509
+ product: "headphones",
3510
+ price: 59.99,
3511
+ rating: 5,
3512
+ maxRating: 5,
3513
+ features: ["noise-cancelling"],
3514
+ recommended: true,
3515
+ purchaseDate: "2026-01-15"
3516
+ },
3517
+ schema: import_zod.z.object({
3518
+ product: import_zod.z.string(),
3519
+ price: import_zod.z.number(),
3520
+ rating: import_zod.z.number(),
3521
+ maxRating: import_zod.z.number(),
3522
+ features: import_zod.z.array(import_zod.z.string()),
3523
+ recommended: import_zod.z.boolean(),
3524
+ purchaseDate: import_zod.z.string()
3525
+ })
3526
+ }
3527
+ ],
3528
+ scorers: ["correctness", "schema-correctness", "latency", "cost"]
3529
+ };
3530
+
3531
+ // src/packs/tool-calling.ts
3532
+ var import_zod2 = require("zod");
3533
+ var toolCallingPack = {
3534
+ name: "tool-calling",
3535
+ label: "Tool Calling",
3536
+ description: "Function invocation accuracy \u2014 single calls, complex params, tool selection, parallel calls, and relevance detection",
3537
+ tasks: [
3538
+ {
3539
+ name: "tc:simple-single-tool",
3540
+ prompt: "What's the current weather in Tokyo?",
3541
+ tools: [{
3542
+ name: "getWeather",
3543
+ description: "Get current weather for a city",
3544
+ parameters: import_zod2.z.object({
3545
+ city: import_zod2.z.string(),
3546
+ units: import_zod2.z.enum(["celsius", "fahrenheit"]).optional()
3547
+ }),
3548
+ handler: async ({ city, units }) => ({
3549
+ city,
3550
+ tempC: 8,
3551
+ condition: "cloudy",
3552
+ units: units ?? "celsius"
3553
+ })
3554
+ }],
3555
+ expected: { city: "Tokyo" }
3556
+ },
3557
+ {
3558
+ name: "tc:complex-params",
3559
+ prompt: "Search for Italian restaurants within 2 miles of downtown Portland that are open now and have at least a 4-star rating.",
3560
+ tools: [{
3561
+ name: "searchRestaurants",
3562
+ description: "Search for restaurants matching criteria",
3563
+ parameters: import_zod2.z.object({
3564
+ cuisine: import_zod2.z.string(),
3565
+ location: import_zod2.z.string(),
3566
+ radiusMiles: import_zod2.z.number(),
3567
+ minRating: import_zod2.z.number(),
3568
+ openNow: import_zod2.z.boolean()
3569
+ }),
3570
+ handler: async (_args) => ({
3571
+ results: [{ name: "Trattoria Roma", rating: 4.5, distance: 1.2 }]
3572
+ })
3573
+ }],
3574
+ expected: {
3575
+ cuisine: "Italian",
3576
+ location: "downtown Portland",
3577
+ radiusMiles: 2,
3578
+ minRating: 4,
3579
+ openNow: true
3580
+ }
3581
+ },
3582
+ {
3583
+ name: "tc:select-from-many",
3584
+ prompt: "Convert 150 USD to Euros.",
3585
+ tools: [
3586
+ {
3587
+ name: "getWeather",
3588
+ description: "Get current weather for a city",
3589
+ parameters: import_zod2.z.object({ city: import_zod2.z.string() }),
3590
+ handler: async () => ({ tempC: 20 })
3591
+ },
3592
+ {
3593
+ name: "convertCurrency",
3594
+ description: "Convert an amount between currencies",
3595
+ parameters: import_zod2.z.object({
3596
+ amount: import_zod2.z.number(),
3597
+ from: import_zod2.z.string(),
3598
+ to: import_zod2.z.string()
3599
+ }),
3600
+ handler: async ({ amount, from, to }) => ({
3601
+ amount,
3602
+ from,
3603
+ to,
3604
+ result: 138.75,
3605
+ rate: 0.925
3606
+ })
3607
+ },
3608
+ {
3609
+ name: "translateText",
3610
+ description: "Translate text between languages",
3611
+ parameters: import_zod2.z.object({ text: import_zod2.z.string(), targetLang: import_zod2.z.string() }),
3612
+ handler: async () => ({ translated: "" })
3613
+ },
3614
+ {
3615
+ name: "calculateTip",
3616
+ description: "Calculate tip amount for a bill",
3617
+ parameters: import_zod2.z.object({ billAmount: import_zod2.z.number(), tipPercent: import_zod2.z.number() }),
3618
+ handler: async () => ({ tip: 0 })
3619
+ }
3620
+ ],
3621
+ expected: { amount: 150, from: "USD", to: "EUR" }
3622
+ },
3623
+ {
3624
+ name: "tc:parallel-calls",
3625
+ prompt: "I'm planning a trip. What's the weather like in both Paris and London right now?",
3626
+ tools: [{
3627
+ name: "getWeather",
3628
+ description: "Get current weather for a city",
3629
+ parameters: import_zod2.z.object({ city: import_zod2.z.string() }),
3630
+ handler: async ({ city }) => {
3631
+ const data = {
3632
+ Paris: { tempC: 12, condition: "partly cloudy" },
3633
+ London: { tempC: 9, condition: "rainy" }
3634
+ };
3635
+ return data[city] ?? { tempC: 15, condition: "unknown" };
3636
+ }
3637
+ }],
3638
+ expected: "weather data for Paris and London"
3639
+ }
3640
+ ],
3641
+ scorers: ["tool-usage", "latency", "cost"]
3642
+ };
3643
+
3644
+ // src/packs/reasoning.ts
3645
+ var import_zod3 = require("zod");
3646
+ var reasoningPack = {
3647
+ name: "reasoning",
3648
+ label: "Reasoning",
3649
+ description: "Logic, math, and multi-step thinking \u2014 arithmetic, deduction, data interpretation, critical path, and business rules",
3650
+ tasks: [
3651
+ {
3652
+ name: "rs:saas-mrr-calc",
3653
+ prompt: `A SaaS company charges $49/month for the basic plan and $149/month for pro.
3654
+ In Q1 they had 200 basic subscribers and 85 pro subscribers.
3655
+ In Q2, 15% of basic users upgraded to pro and they gained 40 new basic subscribers.
3656
+ No one churned. What is the Q2 monthly recurring revenue (MRR)?
3657
+ Return as JSON with your reasoning and the final MRR number.`,
3658
+ expected: { mrr: 27425 },
3659
+ schema: import_zod3.z.object({
3660
+ reasoning: import_zod3.z.string().optional(),
3661
+ mrr: import_zod3.z.number()
3662
+ })
3663
+ },
3664
+ {
3665
+ name: "rs:logical-deduction",
3666
+ prompt: `Five developers \u2014 Alice, Bob, Carol, Dave, and Eve \u2014 each use a different
3667
+ primary language: Rust, TypeScript, Python, Go, and Java. Given:
3668
+ 1. Alice does not use Python, Java, or Go.
3669
+ 2. Bob uses TypeScript.
3670
+ 3. Carol uses neither Rust nor Go.
3671
+ 4. Dave does not use Java.
3672
+ 5. Eve uses neither Rust, Go, nor Java.
3673
+ What language does each developer use? Return as JSON.`,
3674
+ expected: {
3675
+ Alice: "Rust",
3676
+ Bob: "TypeScript",
3677
+ Carol: "Java",
3678
+ Dave: "Go",
3679
+ Eve: "Python"
3680
+ },
3681
+ schema: import_zod3.z.object({
3682
+ Alice: import_zod3.z.string(),
3683
+ Bob: import_zod3.z.string(),
3684
+ Carol: import_zod3.z.string(),
3685
+ Dave: import_zod3.z.string(),
3686
+ Eve: import_zod3.z.string()
3687
+ })
3688
+ },
3689
+ {
3690
+ name: "rs:data-interpretation",
3691
+ prompt: `Given this quarterly revenue data:
3692
+ | Quarter | Revenue | Growth |
3693
+ |---------|---------|--------|
3694
+ | Q1 2025 | $2.1M | - |
3695
+ | Q2 2025 | $2.4M | 14.3% |
3696
+ | Q3 2025 | $2.2M | -8.3% |
3697
+ | Q4 2025 | $2.8M | 27.3% |
3698
+
3699
+ Which quarter had the highest absolute revenue increase compared to the previous
3700
+ quarter? What was the full-year total revenue in millions? Return as JSON.`,
3701
+ expected: {
3702
+ highestGrowthQuarter: "Q4 2025",
3703
+ absoluteIncrease: 0.6,
3704
+ fullYearRevenue: 9.5
3705
+ },
3706
+ schema: import_zod3.z.object({
3707
+ highestGrowthQuarter: import_zod3.z.string(),
3708
+ absoluteIncrease: import_zod3.z.number(),
3709
+ fullYearRevenue: import_zod3.z.number()
3710
+ })
3711
+ },
3712
+ {
3713
+ name: "rs:critical-path",
3714
+ prompt: `A deployment pipeline has these stages with dependencies:
3715
+ - Build (3 min, no dependency)
3716
+ - Unit tests (5 min, depends on Build)
3717
+ - Integration tests (8 min, depends on Build)
3718
+ - Security scan (4 min, depends on Build)
3719
+ - Staging deploy (2 min, depends on Unit tests AND Integration tests AND Security scan)
3720
+ - Smoke tests (3 min, depends on Staging deploy)
3721
+
3722
+ Assuming stages run in parallel where possible, what is the total pipeline
3723
+ duration in minutes? Which stages are on the critical path? Return as JSON.`,
3724
+ expected: {
3725
+ totalMinutes: 16,
3726
+ criticalPath: ["Build", "Integration tests", "Staging deploy", "Smoke tests"]
3727
+ },
3728
+ schema: import_zod3.z.object({
3729
+ totalMinutes: import_zod3.z.number(),
3730
+ criticalPath: import_zod3.z.array(import_zod3.z.string())
3731
+ })
3732
+ },
3733
+ {
3734
+ name: "rs:pricing-rules",
3735
+ prompt: `Apply these pricing rules to each customer and return the final price:
3736
+ Rules:
3737
+ - Base price: $100
3738
+ - Enterprise customers (>100 seats): 30% discount
3739
+ - Annual billing: additional 15% off the discounted price
3740
+ - Non-profit organizations: flat $50 regardless of other rules
3741
+
3742
+ Customers:
3743
+ A: 50 seats, monthly billing, for-profit
3744
+ B: 200 seats, annual billing, for-profit
3745
+ C: 75 seats, annual billing, non-profit
3746
+ D: 150 seats, monthly billing, for-profit
3747
+
3748
+ Return as a JSON array with customer id and finalPrice.`,
3749
+ expected: [
3750
+ { id: "A", finalPrice: 100 },
3751
+ { id: "B", finalPrice: 59.5 },
3752
+ { id: "C", finalPrice: 50 },
3753
+ { id: "D", finalPrice: 70 }
3754
+ ],
3755
+ schema: import_zod3.z.array(import_zod3.z.object({
3756
+ id: import_zod3.z.string(),
3757
+ finalPrice: import_zod3.z.number()
3758
+ }))
3759
+ }
3760
+ ],
3761
+ scorers: ["correctness", "latency", "cost"]
3762
+ };
3763
+
3764
+ // src/packs/index.ts
3765
+ var registry = /* @__PURE__ */ new Map();
3766
+ function register(pack) {
3767
+ registry.set(pack.name, pack);
3768
+ }
3769
+ register(structuredOutputPack);
3770
+ register(toolCallingPack);
3771
+ register(reasoningPack);
3772
+ function loadPack(name) {
3773
+ const pack = registry.get(name);
3774
+ if (!pack) {
3775
+ const available = [...registry.keys()].join(", ");
3776
+ throw new Error(`Unknown pack "${name}". Available packs: ${available}`);
3777
+ }
3778
+ return pack;
3779
+ }
3780
+ function listPacks() {
3781
+ return [...registry.values()].map((p) => ({
3782
+ name: p.name,
3783
+ label: p.label,
3784
+ description: p.description,
3785
+ taskCount: p.tasks.length
3786
+ }));
3787
+ }
3788
+
3286
3789
  // src/ci.ts
3287
3790
  var import_node_fs = require("fs");
3288
3791
  var import_node_path = require("path");
@@ -3574,7 +4077,9 @@ async function upsertPrComment(ctx, body, marker) {
3574
4077
  gemini,
3575
4078
  htmlReporter,
3576
4079
  jsonReporter,
4080
+ listPacks,
3577
4081
  loadBaseline,
4082
+ loadPack,
3578
4083
  markdownReporter,
3579
4084
  openai,
3580
4085
  openaiCompatible,