agent-duelist 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1349,33 +1349,42 @@ var correctnessScorer = ({ task, result }) => {
1349
1349
  if (task.expected === void 0) {
1350
1350
  return { name: "correctness", value: 0.5, details: { reason: "no expected value" } };
1351
1351
  }
1352
- const match = deepEqual(task.expected, result.output);
1352
+ const actual = normalizeOutput(task.expected, result.output);
1353
+ const match = deepEqual(task.expected, actual);
1353
1354
  return {
1354
1355
  name: "correctness",
1355
1356
  value: match ? 1 : 0,
1356
1357
  details: { expected: task.expected, actual: result.output }
1357
1358
  };
1358
1359
  };
1359
- function deepEqual(a, b) {
1360
- if (a === b) return true;
1361
- if (typeof a === "string" && typeof b === "string") {
1362
- return a.trim().toLowerCase() === b.trim().toLowerCase();
1363
- }
1364
- if (typeof a !== typeof b) return false;
1365
- if (a === null || b === null) return a === b;
1366
- if (Array.isArray(a) && Array.isArray(b)) {
1367
- if (a.length !== b.length) return false;
1368
- return a.every((val, i) => deepEqual(val, b[i]));
1369
- }
1370
- if (typeof a === "object" && typeof b === "object") {
1371
- const objA = a;
1372
- const objB = b;
1373
- const keysA = Object.keys(objA);
1374
- const keysB = Object.keys(objB);
1375
- if (keysA.length !== keysB.length) return false;
1376
- return keysA.every((key) => key in objB && deepEqual(objA[key], objB[key]));
1377
- }
1378
- return a === b;
1360
/**
 * Unwraps a model output that nests the expected array inside a wrapper object.
 *
 * When `expected` is an array and `actual` is a non-null, non-array object with
 * exactly one array-valued property, that array is returned. In every other
 * case `actual` is returned unchanged.
 *
 * @param {unknown} expected - The task's expected value.
 * @param {unknown} actual - The provider's output.
 * @returns {unknown} The unwrapped array, or `actual` itself.
 */
function normalizeOutput(expected, actual) {
  const isWrapperObject = typeof actual === "object" && actual !== null && !Array.isArray(actual);
  if (!Array.isArray(expected) || !isWrapperObject) {
    return actual;
  }
  const arrayValues = Object.values(actual).filter((value) => Array.isArray(value));
  // Only unwrap when the choice is unambiguous.
  return arrayValues.length === 1 ? arrayValues[0] : actual;
}
1370
/**
 * Lenient structural comparison of an expected value against an actual value.
 *
 * - Strings are compared after trimming and lower-casing.
 * - Arrays must have the same length and pairwise-equal elements.
 * - Objects are compared as a subset: every key of `expected` must be an own
 *   property of `actual` with an equal value; extra keys on `actual` are ignored.
 * - An array never equals a plain object (and vice versa).
 *
 * @param {unknown} expected - The reference value.
 * @param {unknown} actual - The value under test.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Strict equality was ruled out above, so a lone null can never match.
  if (expected === null || actual === null) return false;
  const expectedIsArray = Array.isArray(expected);
  // Without this guard `[]` would match any object through the key-subset branch.
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    return expected.length === actual.length && expected.every((val, i) => deepEqual(val, actual[i]));
  }
  if (typeof expected === "object") {
    // Own-property check: `in` would also accept inherited names such as "constructor".
    return Object.keys(expected).every(
      (key) => Object.prototype.hasOwnProperty.call(actual, key) && deepEqual(expected[key], actual[key])
    );
  }
  // Remaining primitives (numbers, booleans, ...) already failed the === check.
  return false;
}
1380
1389
 
1381
1390
  // src/scorers/schema-correctness.ts
@@ -1395,7 +1404,14 @@ var schemaCorrectnessScorer = ({ task, result }) => {
1395
1404
  };
1396
1405
  }
1397
1406
  }
1398
- const parsed = task.schema.safeParse(data);
1407
+ let parsed = task.schema.safeParse(data);
1408
+ if (!parsed.success && !Array.isArray(data) && typeof data === "object" && data !== null) {
1409
+ const arrayEntries = Object.entries(data).filter(([, v]) => Array.isArray(v));
1410
+ if (arrayEntries.length === 1) {
1411
+ const unwrapped = task.schema.safeParse(arrayEntries[0][1]);
1412
+ if (unwrapped.success) parsed = unwrapped;
1413
+ }
1414
+ }
1399
1415
  return {
1400
1416
  name: "schema-correctness",
1401
1417
  value: parsed.success ? 1 : 0,
@@ -1410,11 +1426,13 @@ var fuzzySimilarityScorer = ({ task, result }) => {
1410
1426
  }
1411
1427
  const a = stringify(task.expected);
1412
1428
  const b = stringify(result.output);
1413
- const similarity = jaccardSimilarity(tokenize(a), tokenize(b));
1429
+ const setA = tokenize(a);
1430
+ const setB = tokenize(b);
1431
+ const similarity = jaccardSimilarity(setA, setB);
1414
1432
  return {
1415
1433
  name: "fuzzy-similarity",
1416
1434
  value: Math.round(similarity * 100) / 100,
1417
- details: { method: "jaccard", expectedTokens: tokenize(a).size, actualTokens: tokenize(b).size }
1435
+ details: { method: "jaccard", expectedTokens: setA.size, actualTokens: setB.size }
1418
1436
  };
1419
1437
  };
1420
1438
  function stringify(value) {
@@ -1439,7 +1457,38 @@ import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
1439
1457
 
1440
1458
  // src/providers/openai.ts
1441
1459
  import OpenAI, { AzureOpenAI } from "openai";
1460
+ import { zodToJsonSchema as zodToJsonSchema2 } from "zod-to-json-schema";
1461
+
1462
+ // src/providers/shared.ts
1442
1463
  import { zodToJsonSchema } from "zod-to-json-schema";
1464
/**
 * Builds the system message that asks the model for JSON output.
 *
 * Without a schema a short generic instruction is returned. With a schema, the
 * schema is converted via `zodToJsonSchema` (target "openAi") and embedded,
 * pretty-printed, in a multi-line instruction.
 *
 * @param {unknown} schema - Schema passed to `zodToJsonSchema`; may be falsy.
 * @returns {string} The system prompt text.
 */
function buildSchemaSystemMessage(schema) {
  if (!schema) {
    return "Respond with valid JSON.";
  }
  const schemaText = JSON.stringify(zodToJsonSchema(schema, { target: "openAi" }), null, 2);
  const lines = [];
  lines.push("Respond with ONLY valid JSON data. No markdown, no code fences, no explanation.");
  lines.push("");
  lines.push("Your output must conform to this JSON Schema:");
  lines.push(schemaText);
  lines.push("");
  lines.push("IMPORTANT: Output the actual data values, NOT the schema definition itself.");
  lines.push('Do NOT include keys like "type", "$schema", or "items" from the schema definition in your response.');
  return lines.join("\n");
}
1477
/**
 * Turns raw model text into structured output when a schema was requested.
 *
 * @param {string} rawContent - The text returned by the model.
 * @param {boolean} hasSchema - Whether the request carried a schema.
 * @returns {unknown} The parsed JSON value when `hasSchema` is true and the
 *   fence-stripped text parses; otherwise `rawContent` unchanged.
 */
function parseSchemaOutput(rawContent, hasSchema) {
  if (hasSchema) {
    const candidate = stripCodeFences(rawContent);
    try {
      return JSON.parse(candidate);
    } catch {
      // Not valid JSON: fall through and hand back the raw text.
    }
  }
  return rawContent;
}
1486
/**
 * Removes a surrounding Markdown code fence (``` or ```json) from model output.
 *
 * The fence tag is matched case-insensitively (```JSON works) and trailing
 * spaces or tabs after the closing fence are tolerated. When no fence is found
 * the content is returned unchanged.
 *
 * @param {string} content - Raw model output.
 * @returns {string} The fenced payload, or `content` itself.
 */
function stripCodeFences(content) {
  // `m` lets the fence start on any line, so prose before the fence is skipped.
  const match = content.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?\s*```[ \t]*$/im);
  return match ? match[1] : content;
}
1490
+
1491
+ // src/providers/openai.ts
1443
1492
  var REQUEST_TIMEOUT_MS = 6e4;
1444
1493
  function openai(model, options) {
1445
1494
  const client = new OpenAI({
@@ -1486,7 +1535,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1486
1535
  if (input.schema) {
1487
1536
  params.response_format = { type: "json_object" };
1488
1537
  params.messages = [
1489
- { role: "system", content: "Respond with valid JSON matching the requested schema." },
1538
+ { role: "system", content: buildSchemaSystemMessage(input.schema) },
1490
1539
  ...params.messages
1491
1540
  ];
1492
1541
  }
@@ -1494,7 +1543,9 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1494
1543
  params.tools = input.tools.map(toolDefToOpenAI);
1495
1544
  params.tool_choice = "auto";
1496
1545
  }
1497
- const response = await client.chat.completions.create(params, { signal: input.signal });
1546
+ const reqOpts = { signal: input.signal };
1547
+ if (input.timeout) reqOpts.timeout = input.timeout;
1548
+ const response = await client.chat.completions.create(params, reqOpts);
1498
1549
  let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
1499
1550
  let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
1500
1551
  const choice = response.choices[0];
@@ -1528,7 +1579,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1528
1579
  const followUp = await client.chat.completions.create({
1529
1580
  model: requestModel,
1530
1581
  messages: toolMessages
1531
- }, { signal: input.signal });
1582
+ }, reqOpts);
1532
1583
  totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
1533
1584
  totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
1534
1585
  finalResponse = followUp;
@@ -1539,13 +1590,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1539
1590
  if (stripThinking) {
1540
1591
  rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
1541
1592
  }
1542
- let output = rawContent;
1543
- if (input.schema) {
1544
- try {
1545
- output = JSON.parse(rawContent);
1546
- } catch {
1547
- }
1548
- }
1593
+ const output = parseSchemaOutput(rawContent, !!input.schema);
1549
1594
  return {
1550
1595
  output,
1551
1596
  usage: {
@@ -1559,13 +1604,27 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
1559
1604
  }
1560
1605
  };
1561
1606
  }
1607
/**
 * Creates a provider for a Google model through the OpenAI client pointed at
 * Google's OpenAI-compatible endpoint.
 *
 * @param {string} model - Model name; also used in the provider id `google/<model>`.
 * @param {{ apiKey?: string, timeoutMs?: number }} [options] - Optional API key
 *   (falls back to `GOOGLE_API_KEY`) and request timeout (falls back to
 *   `REQUEST_TIMEOUT_MS`).
 * @returns {unknown} Whatever `makeProvider` returns for this client.
 * @throws {Error} When no API key is available.
 */
function gemini(model, options) {
  const key = options?.apiKey ?? process.env.GOOGLE_API_KEY;
  if (key) {
    const timeout = options?.timeoutMs ?? REQUEST_TIMEOUT_MS;
    const client = new OpenAI({
      apiKey: key,
      baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
      timeout
    });
    return makeProvider(`google/${model}`, "Google AI", model, client, model);
  }
  throw new Error(
    `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
  );
}
1562
1621
  function toolDefToOpenAI(tool) {
1563
1622
  return {
1564
1623
  type: "function",
1565
1624
  function: {
1566
1625
  name: tool.name,
1567
1626
  description: tool.description,
1568
- parameters: zodToJsonSchema(tool.parameters, { target: "openAi" })
1627
+ parameters: zodToJsonSchema2(tool.parameters, { target: "openAi" })
1569
1628
  }
1570
1629
  };
1571
1630
  }
@@ -1614,8 +1673,14 @@ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1614
1673
  if (!apiKey) return void 0;
1615
1674
  return { client: new OpenAI2({ apiKey, timeout: timeoutMs }), model };
1616
1675
  }
1676
/**
 * Heuristically detects an error saying the `temperature` parameter was rejected.
 *
 * The check is case-insensitive: the message must mention "temperature" and at
 * least one rejection phrase.
 *
 * @param {unknown} err - A thrown value; non-Error values are stringified.
 * @returns {boolean} True when the message looks like a temperature rejection.
 */
function isTemperatureError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  if (!text.includes("temperature")) {
    return false;
  }
  const rejectionPhrases = ["not supported", "is not allowed", "unsupported", "invalid"];
  return rejectionPhrases.some((phrase) => text.includes(phrase));
}
1617
1681
  function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1618
1682
  let cached = void 0;
1683
+ let useTemperature = true;
1619
1684
  return async ({ task, result }) => {
1620
1685
  if (task.expected === void 0) {
1621
1686
  return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
@@ -1632,36 +1697,24 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1632
1697
  }
1633
1698
  const { client, model } = cached;
1634
1699
  const prompt = JUDGE_PROMPT.replace("{task}", task.prompt).replace("{expected}", JSON.stringify(task.expected)).replace("{actual}", JSON.stringify(result.output));
1700
+ const messages = [{ role: "user", content: prompt }];
1635
1701
  try {
1636
- const response = await client.chat.completions.create({
1637
- model,
1638
- messages: [{ role: "user", content: prompt }],
1639
- temperature: 0,
1640
- max_tokens: 2048
1641
- });
1642
- const content = response.choices[0]?.message?.content?.trim() ?? "";
1643
- const parsed = {};
1644
- for (const line of content.split("\n")) {
1645
- const match = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
1646
- if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
1647
- }
1648
- const accuracy = parsed.accuracy;
1649
- const completeness = parsed.completeness;
1650
- const conciseness = parsed.conciseness;
1651
- if (accuracy == null || completeness == null || conciseness == null || [accuracy, completeness, conciseness].some((s) => isNaN(s) || s < 0 || s > 1)) {
1652
- return {
1653
- name: "llm-judge-correctness",
1654
- value: -1,
1655
- details: { reason: `judge returned unparseable scores: "${content}"`, model }
1656
- };
1657
- }
1658
- const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
1659
- return {
1660
- name: "llm-judge-correctness",
1661
- value: composite,
1662
- details: { model, accuracy, completeness, conciseness }
1663
- };
1702
+ const response = await callJudge(client, model, messages, useTemperature);
1703
+ return parseJudgeResponse(response, model);
1664
1704
  } catch (err) {
1705
+ if (useTemperature && isTemperatureError(err)) {
1706
+ useTemperature = false;
1707
+ try {
1708
+ const response = await callJudge(client, model, messages, false);
1709
+ return parseJudgeResponse(response, model);
1710
+ } catch (retryErr) {
1711
+ return {
1712
+ name: "llm-judge-correctness",
1713
+ value: -1,
1714
+ details: { reason: `judge call failed: ${retryErr instanceof Error ? retryErr.message : String(retryErr)}` }
1715
+ };
1716
+ }
1717
+ }
1665
1718
  return {
1666
1719
  name: "llm-judge-correctness",
1667
1720
  value: -1,
@@ -1670,6 +1723,38 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
1670
1723
  }
1671
1724
  };
1672
1725
  }
1726
/**
 * Sends the judge prompt through `client.chat.completions.create`.
 *
 * @param {object} client - Client exposing `chat.completions.create`.
 * @param {string} model - Judge model name.
 * @param {Array<object>} messages - Chat messages to send.
 * @param {boolean} withTemperature - When truthy, `temperature: 0` is included.
 * @returns {Promise<unknown>} The response from `create`.
 */
async function callJudge(client, model, messages, withTemperature) {
  const request = { model, messages, max_completion_tokens: 2048 };
  if (withTemperature) {
    request.temperature = 0;
  }
  return client.chat.completions.create(request);
}
1734
/**
 * Extracts the three judge scores from a chat completion and combines them.
 *
 * The reply is scanned line by line for `accuracy: <n>`, `completeness: <n>`
 * and `conciseness: <n>` (case-insensitive, leading indentation allowed). All
 * three must be present and within [0, 1]; otherwise the result has value -1
 * and a reason quoting the reply. A response without choices is treated as an
 * empty reply rather than throwing.
 *
 * @param {object} response - Chat completion response.
 * @param {string} model - Judge model name, echoed in `details`.
 * @returns {{ name: string, value: number, details: object }} Score record whose
 *   value is the mean of the three scores rounded to two decimals, or -1.
 */
function parseJudgeResponse(response, model) {
  const content = response?.choices?.[0]?.message?.content?.trim() ?? "";
  const parsed = {};
  for (const line of content.split("\n")) {
    // `^\s*` tolerates indented lines, which the overall trim() does not cover.
    const match = line.match(/^\s*(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
    if (match) parsed[match[1].toLowerCase()] = Number.parseFloat(match[2]);
  }
  const { accuracy, completeness, conciseness } = parsed;
  const isValidScore = (s) => typeof s === "number" && !Number.isNaN(s) && s >= 0 && s <= 1;
  if (![accuracy, completeness, conciseness].every(isValidScore)) {
    return {
      name: "llm-judge-correctness",
      value: -1,
      details: { reason: `judge returned unparseable scores: "${content}"`, model }
    };
  }
  const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
  return {
    name: "llm-judge-correctness",
    value: composite,
    details: { model, accuracy, completeness, conciseness }
  };
}
1673
1758
 
1674
1759
  // src/scorers/tool-usage.ts
1675
1760
  var toolUsageScorer = ({ task, result }) => {
@@ -1733,118 +1818,174 @@ async function runBenchmarks(options) {
1733
1818
  const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
1734
1819
  const results = [];
1735
1820
  for (const task of tasks) {
1736
- for (const provider of providers) {
1737
- for (let run = 1; run <= runs; run++) {
1738
- let result;
1739
- try {
1740
- const taskResult = await withTimeout((signal) => provider.run({
1741
- prompt: task.prompt,
1742
- schema: task.schema,
1743
- tools: task.tools,
1744
- signal
1745
- }), timeout);
1746
- const scores = await Promise.all(
1747
- scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
1748
- );
1749
- result = {
1750
- providerId: provider.id,
1751
- taskName: task.name,
1752
- run,
1753
- scores,
1754
- raw: {
1755
- output: taskResult.output,
1756
- latencyMs: taskResult.latencyMs,
1757
- usage: taskResult.usage,
1758
- toolCalls: taskResult.toolCalls
1759
- }
1760
- };
1761
- } catch (err) {
1762
- const message = err instanceof Error ? err.message : String(err);
1763
- result = {
1764
- providerId: provider.id,
1765
- taskName: task.name,
1766
- run,
1767
- scores: [],
1768
- error: message,
1769
- raw: { output: "", latencyMs: 0 }
1770
- };
1771
- }
1772
- results.push(result);
1773
- onResult?.(result);
1774
- }
1821
+ for (let run = 1; run <= runs; run++) {
1822
+ const runResults = await Promise.all(
1823
+ providers.map(async (provider) => {
1824
+ let result;
1825
+ try {
1826
+ const taskResult = await withTimeout((signal) => provider.run({
1827
+ prompt: task.prompt,
1828
+ schema: task.schema,
1829
+ tools: task.tools,
1830
+ signal,
1831
+ timeout
1832
+ }), timeout);
1833
+ const scores = await Promise.all(
1834
+ scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
1835
+ );
1836
+ result = {
1837
+ providerId: provider.id,
1838
+ taskName: task.name,
1839
+ run,
1840
+ scores,
1841
+ raw: {
1842
+ output: taskResult.output,
1843
+ latencyMs: taskResult.latencyMs,
1844
+ usage: taskResult.usage,
1845
+ toolCalls: taskResult.toolCalls
1846
+ }
1847
+ };
1848
+ } catch (err) {
1849
+ const message = err instanceof Error ? err.message : String(err);
1850
+ result = {
1851
+ providerId: provider.id,
1852
+ taskName: task.name,
1853
+ run,
1854
+ scores: [],
1855
+ error: message,
1856
+ raw: { output: "", latencyMs: 0 }
1857
+ };
1858
+ }
1859
+ onResult?.(result);
1860
+ return result;
1861
+ })
1862
+ );
1863
+ results.push(...runResults);
1775
1864
  }
1776
1865
  }
1777
1866
  return results;
1778
1867
  }
1779
1868
 
1780
- // src/reporter/console.ts
1781
- var reset = "\x1B[0m";
1782
- var boldCode = "\x1B[1m";
1783
- var dimCode = "\x1B[2m";
1784
- var green = "\x1B[32m";
1785
- var red = "\x1B[31m";
1786
- var yellow = "\x1B[33m";
1787
- var cyan = "\x1B[36m";
1788
- var brightGreen = "\x1B[92m";
1789
- var brightWhite = "\x1B[97m";
1790
- function bold(s) {
1791
- return `${boldCode}${s}${reset}`;
1792
- }
1793
- function dim(s) {
1794
- return `${dimCode}${s}${reset}`;
1869
+ // src/utils/format.ts
1870
// Upper bound on the digits argument accepted by Number.prototype.toFixed.
var MAX_FRACTION_DIGITS = 100;
/**
 * Formats an estimated USD cost for display.
 *
 * - `undefined` renders as an em dash, exactly zero as "$0.00".
 * - Values of at least one cent use two decimals.
 * - Smaller values use enough decimals to show leading significant digits
 *   (at least 4, at most MAX_FRACTION_DIGITS), with trailing zeros removed.
 *
 * @param {number | undefined} usd - Estimated cost in US dollars.
 * @returns {string} Display string, prefixed with "~" for non-zero estimates.
 */
function formatCost(usd) {
  if (usd === void 0) return "\u2014";
  if (usd === 0) return "$0.00";
  if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
  const digits = Math.min(
    MAX_FRACTION_DIGITS,
    Math.max(4, -Math.floor(Math.log10(Math.abs(usd))) + 1)
  );
  // Drop trailing zeros and the decimal point itself when nothing follows it,
  // so "-5.0000" becomes "-5" rather than "-5.".
  const trimmed = usd.toFixed(digits).replace(/\.?0+$/, "");
  return `~$${trimmed}`;
}
1796
- function stripAnsi(s) {
1797
- return s.replace(/\x1b\[[0-9;]*m/g, "");
1881
/**
 * Formats a numeric difference with an explicit "+" for non-negative values.
 *
 * @param {number} delta - The difference to format.
 * @param {number} [precision=4] - Number of decimals passed to `toFixed`.
 * @returns {string} For example "+0.5000" or "-0.25".
 */
function formatDelta(delta, precision = 4) {
  const text = delta.toFixed(precision);
  if (delta >= 0) {
    return `+${text}`;
  }
  return text;
}
1799
- function displayWidth(s) {
1800
- const stripped = stripAnsi(s);
1801
- let width = 0;
1802
- for (const ch of stripped) {
1803
- const code = ch.codePointAt(0) ?? 0;
1804
- if (code >= 126976) width += 2;
1805
- else if (code >= 9728 && code <= 10175) width += 2;
1806
- else width += 1;
1885
+
1886
+ // src/reporter/shared.ts
1887
+ function groupResults(results) {
1888
+ const taskSet = /* @__PURE__ */ new Set();
1889
+ const providerSet = /* @__PURE__ */ new Set();
1890
+ const scorerSet = /* @__PURE__ */ new Set();
1891
+ const grouped = /* @__PURE__ */ new Map();
1892
+ const byProvider = /* @__PURE__ */ new Map();
1893
+ let hasErrors = false;
1894
+ let maxRun = 0;
1895
+ for (const r of results) {
1896
+ taskSet.add(r.taskName);
1897
+ providerSet.add(r.providerId);
1898
+ for (const s of r.scores) scorerSet.add(s.name);
1899
+ if (r.error) hasErrors = true;
1900
+ if (r.run > maxRun) maxRun = r.run;
1901
+ const key = `${r.taskName}::${r.providerId}`;
1902
+ let group = grouped.get(key);
1903
+ if (!group) {
1904
+ group = [];
1905
+ grouped.set(key, group);
1906
+ }
1907
+ group.push(r);
1908
+ let provGroup = byProvider.get(r.providerId);
1909
+ if (!provGroup) {
1910
+ provGroup = [];
1911
+ byProvider.set(r.providerId, provGroup);
1912
+ }
1913
+ provGroup.push(r);
1807
1914
  }
1808
- return width;
1809
- }
1810
- function padCell(str, targetWidth, align) {
1811
- const dw = displayWidth(str);
1812
- const padding = Math.max(0, targetWidth - dw);
1813
- if (align === "right") return " ".repeat(padding) + str;
1814
- return str + " ".repeat(padding);
1815
- }
1816
- function sparkBar(ratio, width = 8) {
1817
- const clamped = Math.max(0, Math.min(1, ratio));
1818
- const fillLen = Math.round(clamped * width);
1819
- const fill = "\u2593".repeat(fillLen);
1820
- const track = "\u2591".repeat(width - fillLen);
1821
- return { fill, track };
1915
+ return {
1916
+ tasks: [...taskSet],
1917
+ providers: [...providerSet],
1918
+ scorerNames: [...scorerSet],
1919
+ grouped,
1920
+ byProvider,
1921
+ hasErrors,
1922
+ maxRun
1923
+ };
1822
1924
  }
1823
- function drawTableLine(widths, position) {
1824
- const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
1825
- if (position === "bottom") {
1826
- return dim(`\u2514${"\u2500".repeat(totalInner)}\u2518`);
1925
+ function aggregateProviderTask(providerId, grouped, task) {
1926
+ const taskResults = grouped.get(`${task}::${providerId}`) ?? [];
1927
+ const errorResults = taskResults.filter((r) => r.error);
1928
+ const successResults = taskResults.filter((r) => !r.error);
1929
+ if (successResults.length === 0) {
1930
+ return {
1931
+ providerId,
1932
+ avgScores: {},
1933
+ avgDetails: { costUsd: void 0, totalTokens: void 0 },
1934
+ latencyMs: void 0,
1935
+ allErrors: errorResults.length > 0,
1936
+ errorCount: errorResults.length
1937
+ };
1827
1938
  }
1828
- if (position === "merge") {
1829
- return dim(`\u251C${"\u2500".repeat(totalInner)}\u2524`);
1939
+ return {
1940
+ providerId,
1941
+ avgScores: averageScores(successResults),
1942
+ avgDetails: averageDetails(successResults),
1943
+ latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
1944
+ allErrors: false,
1945
+ errorCount: errorResults.length
1946
+ };
1947
+ }
1948
+ function averageScores(results) {
1949
+ const sums = {};
1950
+ const counts = {};
1951
+ for (const result of results) {
1952
+ for (const score of result.scores) {
1953
+ if (score.value < 0) continue;
1954
+ sums[score.name] = (sums[score.name] ?? 0) + score.value;
1955
+ counts[score.name] = (counts[score.name] ?? 0) + 1;
1956
+ }
1830
1957
  }
1831
- const segments = widths.map((w) => "\u2500".repeat(w + 2));
1832
- if (position === "top") {
1833
- return dim(`\u250C${segments.join("\u252C")}\u2510`);
1958
+ const avgs = {};
1959
+ for (const name of Object.keys(sums)) {
1960
+ avgs[name] = sums[name] / counts[name];
1834
1961
  }
1835
- return dim(`\u251C${segments.join("\u253C")}\u2524`);
1962
+ return avgs;
1836
1963
  }
1837
- function drawTableRow(cells, widths, aligns) {
1838
- const parts = cells.map(
1839
- (cell, i) => " " + padCell(cell, widths[i], aligns[i]) + " "
1840
- );
1841
- return dim("\u2502") + parts.join(dim("\u2502")) + dim("\u2502");
1964
/**
 * Averages the cost and token figures attached to each result's "cost" score.
 *
 * Results without a "cost" score, or whose details lack a field, are left out
 * of that field's average.
 *
 * @param {Array<{ scores: Array<{ name: string, details?: object }> }>} results
 * @returns {{ costUsd: number | undefined, totalTokens: number | undefined }}
 *   Mean `estimatedUsd` and rounded mean `totalTokens`; `undefined` where no
 *   result supplied the field.
 */
function averageDetails(results) {
  const costs = [];
  const tokens = [];
  for (const { scores } of results) {
    const details = scores.find((s) => s.name === "cost")?.details;
    if (details?.estimatedUsd != null) costs.push(details.estimatedUsd);
    if (details?.totalTokens != null) tokens.push(details.totalTokens);
  }
  const mean = (values) => values.reduce((sum, v) => sum + v, 0) / values.length;
  return {
    costUsd: costs.length > 0 ? mean(costs) : void 0,
    totalTokens: tokens.length > 0 ? Math.round(mean(tokens)) : void 0
  };
}
1843
- function drawSpanRow(content, widths) {
1844
- const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
1845
- const dw = displayWidth(content);
1846
- const padding = Math.max(0, totalInner - dw - 1);
1847
- return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
1986
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} nums - Values to average.
 * @returns {number | undefined} The mean, or `undefined` for an empty list.
 */
function average(nums) {
  const count = nums.length;
  if (count > 0) {
    const total = nums.reduce((sum, n) => sum + n, 0);
    return total / count;
  }
  return void 0;
}
1849
1990
  function computeColumnStats(providerData, scorerNames) {
1850
1991
  const stats = /* @__PURE__ */ new Map();
@@ -1896,62 +2037,274 @@ function computeColumnStats(providerData, scorerNames) {
1896
2037
  }
1897
2038
  return stats;
1898
2039
  }
1899
- function colorByRank(text, value, colStats, providerCount) {
1900
- if (value === void 0) return dim("\u2014");
1901
- if (providerCount < 2) return text;
1902
- if (colStats.best === void 0 || colStats.worst === void 0) return text;
1903
- if (colStats.best === colStats.worst) return text;
1904
- if (value === colStats.best) return `${brightGreen}${boldCode}${text}${reset}`;
1905
- if (value === colStats.worst) return `${red}${text}${reset}`;
1906
- return `${yellow}${text}${reset}`;
2040
// Scorer names treated as output-quality measures; every other column counts
// as an efficiency measure when medals are computed.
var QUALITY_SCORERS = /* @__PURE__ */ new Set([
  "correctness",
  "schema-correctness",
  "fuzzy-similarity",
  "llm-judge-correctness",
  "tool-usage"
]);
/**
 * Decides whether a provider is eligible for a medal.
 *
 * A provider passes when no quality column exists at all, or when it has a
 * value greater than zero in at least one quality column.
 *
 * @param {string} providerId - Provider to check.
 * @param {Map<string, { values: Map<string, number | undefined> }>} columnStats
 *   Per-column statistics keyed by column name.
 * @returns {boolean} True when the provider passes the gate.
 */
function passesQualityGate(providerId, columnStats) {
  let sawQualityColumn = false;
  for (const [column, stats] of columnStats) {
    if (!QUALITY_SCORERS.has(column)) continue;
    sawQualityColumn = true;
    const score = stats?.values.get(providerId);
    if (score !== void 0 && score > 0) return true;
  }
  // With no quality columns there is nothing to fail.
  return !sawQualityColumn;
}
1908
2055
  function computeMedals(columnStats, providerIds) {
1909
2056
  const medals = /* @__PURE__ */ new Map();
1910
2057
  if (providerIds.length < 2) {
1911
- for (const id of providerIds) medals.set(id, "");
2058
+ for (const id of providerIds) medals.set(id, "none");
1912
2059
  return medals;
1913
2060
  }
1914
- const wins = /* @__PURE__ */ new Map();
1915
- for (const id of providerIds) wins.set(id, 0);
1916
- for (const [, colStats] of columnStats) {
2061
+ const eligible = new Set(providerIds.filter((id) => passesQualityGate(id, columnStats)));
2062
+ const qualityWins = /* @__PURE__ */ new Map();
2063
+ const efficiencyWins = /* @__PURE__ */ new Map();
2064
+ for (const id of providerIds) {
2065
+ qualityWins.set(id, 0);
2066
+ efficiencyWins.set(id, 0);
2067
+ }
2068
+ for (const [colName, colStats] of columnStats) {
1917
2069
  if (colStats.best === void 0) continue;
1918
- for (const [providerId, value] of colStats.values) {
1919
- if (value !== void 0 && value === colStats.best) {
1920
- wins.set(providerId, (wins.get(providerId) ?? 0) + 1);
2070
+ const bestProviders = [...colStats.values.entries()].filter(([, v]) => v !== void 0 && v === colStats.best);
2071
+ if (bestProviders.length === 1) {
2072
+ const winnerId = bestProviders[0][0];
2073
+ if (QUALITY_SCORERS.has(colName)) {
2074
+ qualityWins.set(winnerId, (qualityWins.get(winnerId) ?? 0) + 1);
2075
+ } else {
2076
+ efficiencyWins.set(winnerId, (efficiencyWins.get(winnerId) ?? 0) + 1);
1921
2077
  }
1922
2078
  }
1923
2079
  }
1924
- const totalWins = [...wins.values()].reduce((a, b) => a + b, 0);
2080
+ const totalWins = [...qualityWins.values()].reduce((a, b) => a + b, 0) + [...efficiencyWins.values()].reduce((a, b) => a + b, 0);
1925
2081
  if (totalWins === 0) {
1926
- for (const id of providerIds) medals.set(id, "");
2082
+ for (const id of providerIds) medals.set(id, "none");
1927
2083
  return medals;
1928
2084
  }
1929
- const sorted = [...wins.entries()].sort(
1930
- (a, b) => b[1] - a[1] || a[0].localeCompare(b[0])
1931
- );
1932
- const medalList = ["\u{1F947}", "\u{1F948}", "\u{1F949}"];
2085
+ const eligibleSorted = providerIds.filter((id) => eligible.has(id)).sort((a, b) => {
2086
+ const qDiff = (qualityWins.get(b) ?? 0) - (qualityWins.get(a) ?? 0);
2087
+ if (qDiff !== 0) return qDiff;
2088
+ const eDiff = (efficiencyWins.get(b) ?? 0) - (efficiencyWins.get(a) ?? 0);
2089
+ if (eDiff !== 0) return eDiff;
2090
+ return a.localeCompare(b);
2091
+ });
2092
+ const medalList = ["gold", "silver", "bronze"];
1933
2093
  let rank = 0;
1934
- for (let i = 0; i < sorted.length; i++) {
1935
- if (i > 0 && sorted[i][1] < sorted[i - 1][1]) {
1936
- rank = i;
2094
+ for (let i = 0; i < eligibleSorted.length; i++) {
2095
+ if (i > 0) {
2096
+ const prevQ = qualityWins.get(eligibleSorted[i - 1]) ?? 0;
2097
+ const currQ = qualityWins.get(eligibleSorted[i]) ?? 0;
2098
+ if (currQ < prevQ) {
2099
+ rank = i;
2100
+ } else if (currQ === prevQ) {
2101
+ const prevE = efficiencyWins.get(eligibleSorted[i - 1]) ?? 0;
2102
+ const currE = efficiencyWins.get(eligibleSorted[i]) ?? 0;
2103
+ if (currE < prevE) rank = i;
2104
+ }
1937
2105
  }
1938
- medals.set(sorted[i][0], rank < medalList.length ? medalList[rank] : "");
2106
+ medals.set(eligibleSorted[i], rank < medalList.length ? medalList[rank] : "none");
2107
+ }
2108
+ for (const id of providerIds) {
2109
+ if (!eligible.has(id)) medals.set(id, "none");
1939
2110
  }
1940
2111
  return medals;
1941
2112
  }
2113
/**
 * Returns a parenthesised vendor label for a provider id.
 *
 * The vendor is taken from the text before the first "/" in `providerId`.
 * Unknown prefixes are echoed back as `(<prefix>)`.
 *
 * @param {string} providerId - Provider id such as "openai/gpt-4o".
 * @returns {string} A label such as "(OpenAI)".
 */
function providerLabel(providerId) {
  const [prefix] = providerId.split("/");
  const knownLabels = new Map([
    ["azure", "(OpenAI via Azure)"],
    ["openai", "(OpenAI)"],
    ["anthropic", "(Anthropic)"],
    ["google", "(Google)"],
    ["mistral", "(Mistral)"],
    ["meta", "(Meta)"],
    ["deepseek", "(DeepSeek)"],
    ["cohere", "(Cohere)"],
    ["qwen", "(Qwen)"],
    ["xai", "(xAI)"],
    ["minimax", "(MiniMax)"],
    ["moonshot", "(Moonshot / Kimi)"],
    ["perplexity", "(Perplexity)"],
    ["amazon", "(Amazon)"],
    ["nvidia", "(NVIDIA)"],
    ["microsoft", "(Microsoft)"],
    ["ai21", "(AI21 Labs)"],
    ["bytedance", "(ByteDance)"],
    ["together", "(Together AI)"],
    ["fireworks", "(Fireworks AI)"],
    ["groq", "(Groq)"],
    ["cerebras", "(Cerebras)"]
  ]);
  return knownLabels.get(prefix) ?? `(${prefix})`;
}
2164
/**
 * Suggests how to fix an authentication failure for a provider.
 *
 * @param {string} providerId - Provider id; the text before the first "/"
 *   selects the hint.
 * @param {string} error - Error message, matched case-insensitively against a
 *   list of authentication-related substrings.
 * @returns {string | undefined} A hint, or `undefined` when the message does
 *   not look like an authentication error.
 */
function apiKeyHint(providerId, error) {
  const text = error.toLowerCase();
  const authMarkers = ["api key", "401", "unauthorized", "authentication", "incorrect api key", "apikey"];
  if (!authMarkers.some((marker) => text.includes(marker))) {
    return void 0;
  }
  const [vendor] = providerId.split("/");
  if (vendor === "openai") return "Set: export OPENAI_API_KEY=sk-...";
  if (vendor === "azure") return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
  if (vendor === "anthropic") return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
  if (vendor === "google") return "Set: export GOOGLE_API_KEY=...";
  return `Check the API key for ${providerId}`;
}
2182
/**
 * Finds the provider with the highest average for one scorer.
 *
 * Scores with a negative value are ignored. Providers with no usable score are
 * skipped. On a tie the provider listed first in `providers` wins.
 *
 * @param {Map<string, Array<{ scores: Array<{ name: string, value: number }> }>>} successByProvider
 *   Runs grouped by provider id.
 * @param {string[]} providers - Provider ids to consider, in priority order.
 * @param {string} scorerName - Scorer whose values are averaged.
 * @returns {{ id: string, avg: number } | undefined} The leader, or
 *   `undefined` when no provider has a usable score.
 */
function rankProviders(successByProvider, providers, scorerName) {
  let leader;
  for (const id of providers) {
    const values = [];
    for (const run of successByProvider.get(id) ?? []) {
      for (const score of run.scores) {
        if (score.name === scorerName && score.value >= 0) values.push(score.value);
      }
    }
    if (values.length === 0) continue;
    const avg = values.reduce((a, b) => a + b, 0) / values.length;
    // Strictly greater keeps the earlier provider on ties.
    if (leader === void 0 || avg > leader.avg) leader = { id, avg };
  }
  return leader;
}
2191
/**
 * Maps a scorer name to its short column heading.
 *
 * @param {string} name - Scorer name, e.g. "schema-correctness".
 * @returns {string} The short label, or `name` itself when there is none.
 */
function scorerLabel(name) {
  const shortLabels = new Map([
    ["correctness", "Match"],
    ["schema-correctness", "Schema"],
    ["fuzzy-similarity", "Fuzzy"],
    ["llm-judge-correctness", "Judge"],
    ["tool-usage", "Tool"]
  ]);
  return shortLabels.has(name) ? shortLabels.get(name) : name;
}
2207
/**
 * Converts a medal tier into its emoji.
 *
 * @param {string} medal - One of "gold", "silver", "bronze" or "none".
 * @returns {string} The medal emoji, or an empty string for "none" and for any
 *   unrecognised tier (so callers never interpolate `undefined`).
 */
function medalEmoji(medal) {
  switch (medal) {
    case "gold":
      return "\u{1F947}";
    case "silver":
      return "\u{1F948}";
    case "bronze":
      return "\u{1F949}";
    default:
      return "";
  }
}
2219
+
2220
+ // src/reporter/console.ts
2221
// ANSI SGR escape sequences used by the console reporter.
var reset = "\x1B[0m"; // clears all attributes
var boldCode = "\x1B[1m";
var dimCode = "\x1B[2m";
var green = "\x1B[32m";
var red = "\x1B[31m";
var yellow = "\x1B[33m";
var cyan = "\x1B[36m";
var brightGreen = "\x1B[92m";
var brightWhite = "\x1B[97m";
/**
 * Wraps text in the bold escape sequence followed by a reset.
 *
 * @param {string} s - Text to emphasise.
 * @returns {string} The wrapped text.
 */
function bold(s) {
  return boldCode.concat(s, reset);
}
/**
 * Wraps text in the dim escape sequence followed by a reset.
 *
 * @param {string} s - Text to de-emphasise.
 * @returns {string} The wrapped text.
 */
function dim(s) {
  return dimCode.concat(s, reset);
}
2236
/**
 * Removes ANSI SGR colour/style sequences (`ESC [ ... m`) from a string.
 *
 * @param {string} s - Text that may contain escape sequences.
 * @returns {string} The text without those sequences.
 */
function stripAnsi(s) {
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  // Splitting on the sequences and re-joining drops every occurrence.
  return s.split(sgrSequence).join("");
}
2239
/**
 * Estimates how many terminal columns a string occupies.
 *
 * Escape sequences are removed with `stripAnsi` first. Each remaining code
 * point counts as 1 column, except those at or above U+1F000 and those in
 * U+2600..U+27BF, which count as 2.
 *
 * @param {string} s - Text, possibly containing ANSI sequences.
 * @returns {number} Estimated column count.
 */
function displayWidth(s) {
  const isDoubleWidth = (codePoint) =>
    codePoint >= 0x1f000 || (codePoint >= 0x2600 && codePoint <= 0x27bf);
  return [...stripAnsi(s)].reduce(
    (columns, ch) => columns + (isDoubleWidth(ch.codePointAt(0) ?? 0) ? 2 : 1),
    0
  );
}
2250
/**
 * Pads a cell with spaces up to a target display width.
 *
 * Width is measured with `displayWidth`. Text already at or beyond the target
 * is returned without padding.
 *
 * @param {string} str - Cell content.
 * @param {number} targetWidth - Desired width in columns.
 * @param {string} align - "right" pads on the left; anything else pads on the right.
 * @returns {string} The padded cell.
 */
function padCell(str, targetWidth, align) {
  const gap = " ".repeat(Math.max(0, targetWidth - displayWidth(str)));
  return align === "right" ? gap + str : str + gap;
}
2256
/**
 * Builds the two halves of a fixed-width spark bar.
 *
 * The ratio is clamped to [0, 1]. A ratio that is not a number (NaN or
 * `undefined`) is treated as 0, so the bar always spans `width` characters.
 *
 * @param {number} ratio - Filled share of the bar.
 * @param {number} [width=8] - Total bar length in characters.
 * @returns {{ fill: string, track: string }} Filled and empty segments.
 */
function sparkBar(ratio, width = 8) {
  const bounded = Math.max(0, Math.min(1, ratio));
  // NaN survives Math.min/Math.max; repeat(NaN) would then yield empty strings.
  const clamped = Number.isNaN(bounded) ? 0 : bounded;
  const fillLen = Math.round(clamped * width);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(width - fillLen);
  return { fill, track };
}
2263
/**
 * Draws one horizontal rule of the results table, dimmed.
 *
 * "bottom" and "merge" produce an unbroken rule across all columns; "top" and
 * every other position produce a rule with a junction at each column boundary.
 *
 * @param {number[]} widths - Content width of each column (padding excluded).
 * @param {string} position - "top", "bottom", "merge", or anything else for a
 *   mid-table separator.
 * @returns {string} The rule, wrapped by `dim`.
 */
function drawTableLine(widths, position) {
  // Each column contributes its width plus two padding spaces, plus one
  // separator between neighbours.
  const spanRule = () =>
    "\u2500".repeat(widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1);
  const cellRules = () => widths.map((w) => "\u2500".repeat(w + 2));
  switch (position) {
    case "bottom":
      return dim(`\u2514${spanRule()}\u2518`);
    case "merge":
      return dim(`\u251C${spanRule()}\u2524`);
    case "top":
      return dim(`\u250C${cellRules().join("\u252C")}\u2510`);
    default:
      return dim(`\u251C${cellRules().join("\u253C")}\u2524`);
  }
}
2277
/**
 * Draws one table row: each cell padded to its column width and surrounded by
 * a space on each side, with dimmed vertical bars between and around cells.
 *
 * @param {string[]} cells - Cell contents, one per column.
 * @param {number[]} widths - Column widths, index-aligned with `cells`.
 * @param {string[]} aligns - Column alignments passed to `padCell`.
 * @returns {string} The rendered row.
 */
function drawTableRow(cells, widths, aligns) {
  const padded = cells.map((cell, i) => {
    const body = padCell(cell, widths[i], aligns[i]);
    return " " + body + " ";
  });
  const bar = dim("\u2502");
  return bar + padded.join(bar) + bar;
}
2283
/**
 * Draws a row whose content spans every column of the table.
 *
 * The content is preceded by one space and right-padded so the closing bar
 * lines up with the table edge. Content wider than the table gets no padding.
 *
 * @param {string} content - Text to show; may contain ANSI sequences.
 * @param {number[]} widths - Column widths of the table.
 * @returns {string} The rendered row.
 */
function drawSpanRow(content, widths) {
  const innerWidth = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
  // One column is already used by the leading space.
  const fillCount = Math.max(0, innerWidth - displayWidth(content) - 1);
  const leftEdge = dim("\u2502");
  const rightEdge = dim("\u2502");
  return leftEdge + " " + content + " ".repeat(fillCount) + rightEdge;
}
2289
/**
 * Colours a table cell according to how its value ranks within the column.
 *
 * - A missing value renders as a dimmed em dash.
 * - With fewer than two providers, or when the column has no best/worst or
 *   they are equal, the text is returned uncoloured.
 * - Otherwise the best value is bright green and bold, the worst is red, and
 *   everything else is yellow.
 *
 * @param {string} text - Already formatted cell text.
 * @param {number | undefined} value - The value behind `text`.
 * @param {{ best?: number, worst?: number }} colStats - Column extremes.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} The cell text, possibly wrapped in ANSI codes.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === void 0) {
    return dim("\u2014");
  }
  const isUnranked =
    providerCount < 2 ||
    colStats.best === void 0 ||
    colStats.worst === void 0 ||
    colStats.best === colStats.worst;
  if (isUnranked) {
    return text;
  }
  switch (value) {
    case colStats.best:
      return `${brightGreen}${boldCode}${text}${reset}`;
    case colStats.worst:
      return `${red}${text}${reset}`;
    default:
      return `${yellow}${text}${reset}`;
  }
}
1942
2298
  function consoleReporter(results, options) {
1943
2299
  const showSparklines = options?.sparklines ?? true;
1944
2300
  if (results.length === 0) {
1945
2301
  console.log("\nNo results to display.\n");
1946
2302
  return;
1947
2303
  }
1948
- const tasks = [...new Set(results.map((r) => r.taskName))];
1949
- const providers = [...new Set(results.map((r) => r.providerId))];
1950
- const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
2304
+ const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
1951
2305
  const hasCost = scorerNames.includes("cost");
1952
- const hasErrors = results.some((r) => r.error);
1953
2306
  const multi = providers.length >= 2;
1954
- const runsPerCell = Math.max(...results.map((r) => r.run));
2307
+ const runsPerCell = maxRun;
1955
2308
  const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
1956
2309
  console.log("");
1957
2310
  console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
@@ -1960,29 +2313,9 @@ function consoleReporter(results, options) {
1960
2313
  for (const task of tasks) {
1961
2314
  console.log(` ${bold(`Task: ${task}`)}`);
1962
2315
  console.log("");
1963
- const providerData = providers.map((providerId) => {
1964
- const taskResults = results.filter((r) => r.taskName === task && r.providerId === providerId);
1965
- const errorResults2 = taskResults.filter((r) => r.error);
1966
- const successResults = taskResults.filter((r) => !r.error);
1967
- if (successResults.length === 0) {
1968
- return {
1969
- providerId,
1970
- avgScores: {},
1971
- avgDetails: { costUsd: void 0, totalTokens: void 0 },
1972
- latencyMs: void 0,
1973
- allErrors: errorResults2.length > 0,
1974
- errorCount: errorResults2.length
1975
- };
1976
- }
1977
- return {
1978
- providerId,
1979
- avgScores: averageScores(successResults),
1980
- avgDetails: averageDetails(successResults),
1981
- latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
1982
- allErrors: false,
1983
- errorCount: errorResults2.length
1984
- };
1985
- });
2316
+ const providerData = providers.map(
2317
+ (providerId) => aggregateProviderTask(providerId, grouped, task)
2318
+ );
1986
2319
  const columnStats = computeColumnStats(providerData, scorerNames);
1987
2320
  const medals = computeMedals(columnStats, providers);
1988
2321
  const maxProviderLen = Math.max(...providers.map((id) => id.length));
@@ -1997,8 +2330,7 @@ function consoleReporter(results, options) {
1997
2330
  cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
1998
2331
  cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
1999
2332
  } else {
2000
- const label = name === "correctness" ? "Match" : name === "schema-correctness" ? "Schema" : name === "fuzzy-similarity" ? "Fuzzy" : name === "llm-judge-correctness" ? "Judge" : name === "tool-usage" ? "Tool" : name;
2001
- cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
2333
+ cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
2002
2334
  }
2003
2335
  }
2004
2336
  if (hasErrors) {
@@ -2011,7 +2343,7 @@ function consoleReporter(results, options) {
2011
2343
  console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
2012
2344
  console.log(` ${drawTableLine(widths, "header")}`);
2013
2345
  for (const pd of providerData) {
2014
- const medal = medals.get(pd.providerId) ?? "";
2346
+ const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
2015
2347
  const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
2016
2348
  const cells = [providerCell];
2017
2349
  if (pd.allErrors) {
@@ -2084,7 +2416,7 @@ function consoleReporter(results, options) {
2084
2416
  console.log(` ${drawTableRow(cells, widths, aligns)}`);
2085
2417
  }
2086
2418
  if (multi && providerData.some((p) => !p.allErrors)) {
2087
- const winnerId = [...medals.entries()].find(([, m]) => m === "\u{1F947}")?.[0];
2419
+ const winnerId = [...medals.entries()].find(([, m]) => m === "gold")?.[0];
2088
2420
  if (winnerId) {
2089
2421
  console.log(` ${drawTableLine(widths, "merge")}`);
2090
2422
  const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
@@ -2094,7 +2426,7 @@ function consoleReporter(results, options) {
2094
2426
  console.log(` ${drawTableLine(widths, "bottom")}`);
2095
2427
  console.log("");
2096
2428
  }
2097
- printSummary(results, providers);
2429
+ printSummary(results, providers, byProvider);
2098
2430
  const errorResults = results.filter((r) => r.error);
2099
2431
  if (errorResults.length > 0) {
2100
2432
  console.log(` ${bold("Errors")}`);
@@ -2117,203 +2449,66 @@ function consoleReporter(results, options) {
2117
2449
  console.log("");
2118
2450
  }
2119
2451
  }
2120
- function printSummary(results, providers) {
2452
+ function printSummary(results, providers, byProvider) {
2121
2453
  const successResults = results.filter((r) => !r.error);
2122
2454
  if (successResults.length === 0) return;
2123
- console.log(` ${bold("Summary")}`);
2124
- console.log(` ${dim("\u2501".repeat(72))}`);
2125
- console.log("");
2126
- const single = providers.length === 1;
2127
- const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
2128
- const byCorrectness = rankProviders(successResults, providers, correctnessKey);
2129
- if (byCorrectness) {
2130
- const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2131
- const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
2132
- if (single) {
2133
- console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
2134
- } else {
2135
- console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
2136
- }
2137
- }
2138
- const byLatency = providers.map((id) => {
2139
- const runs = successResults.filter((r) => r.providerId === id);
2140
- const avg = average(runs.map((r) => r.raw.latencyMs));
2141
- return { id, avg: avg ?? Infinity };
2142
- }).sort((a, b) => a.avg - b.avg)[0];
2143
- if (byLatency && byLatency.avg !== Infinity) {
2144
- const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2145
- const msStr = `${Math.round(byLatency.avg)}ms`;
2146
- if (single) {
2147
- console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
2148
- } else {
2149
- console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
2150
- }
2151
- }
2152
- const byCost = providers.map((id) => {
2153
- const runs = successResults.filter((r) => r.providerId === id);
2154
- const costs = runs.map((r) => {
2155
- const s = r.scores.find((s2) => s2.name === "cost");
2156
- return s && s.value >= 0 ? s.value : void 0;
2157
- }).filter((c) => c !== void 0);
2158
- const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
2159
- return { id, avg };
2160
- }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
2161
- if (byCost?.avg !== void 0) {
2162
- const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2163
- const costStr = formatCost(byCost.avg);
2164
- if (single) {
2165
- console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
2166
- } else {
2167
- console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
2168
- }
2169
- }
2170
- if (!single) {
2171
- const wins = /* @__PURE__ */ new Map();
2172
- for (const id of providers) wins.set(id, 0);
2173
- if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
2174
- if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
2175
- if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
2176
- const maxWins = Math.max(...wins.values());
2177
- if (maxWins > 0) {
2178
- const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
2179
- console.log("");
2180
- if (topProviders.length === 1) {
2181
- const [winnerId, winCount] = topProviders[0];
2182
- console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
2183
- } else {
2184
- const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
2185
- console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
2186
- }
2187
- }
2188
- }
2189
- console.log("");
2190
- }
2191
- function rankProviders(results, providers, scorerName) {
2192
- const ranked = providers.map((id) => {
2193
- const runs = results.filter((r) => r.providerId === id);
2194
- const scores = runs.flatMap((r) => r.scores.filter((s) => s.name === scorerName && s.value >= 0)).map((s) => s.value);
2195
- const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
2196
- return { id, avg };
2197
- }).filter((p) => p.avg !== void 0).sort((a, b) => b.avg - a.avg);
2198
- return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
2199
- }
2200
- function averageScores(results) {
2201
- const sums = {};
2202
- const counts = {};
2203
- for (const result of results) {
2204
- for (const score of result.scores) {
2205
- if (score.value < 0) continue;
2206
- sums[score.name] = (sums[score.name] ?? 0) + score.value;
2207
- counts[score.name] = (counts[score.name] ?? 0) + 1;
2208
- }
2209
- }
2210
- const avgs = {};
2211
- for (const name of Object.keys(sums)) {
2212
- avgs[name] = sums[name] / counts[name];
2213
- }
2214
- return avgs;
2215
- }
2216
- function averageDetails(results) {
2217
- let costSum = 0;
2218
- let costCount = 0;
2219
- let tokenSum = 0;
2220
- let tokenCount = 0;
2221
- for (const result of results) {
2222
- const costScore = result.scores.find((s) => s.name === "cost");
2223
- const details = costScore?.details;
2224
- if (details?.estimatedUsd != null) {
2225
- costSum += details.estimatedUsd;
2226
- costCount++;
2455
+ const successByProvider = /* @__PURE__ */ new Map();
2456
+ for (const id of providers) {
2457
+ successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
2458
+ }
2459
+ console.log(` ${bold("Summary")}`);
2460
+ console.log(` ${dim("\u2501".repeat(72))}`);
2461
+ console.log("");
2462
+ const single = providers.length === 1;
2463
+ const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
2464
+ const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
2465
+ if (byCorrectness) {
2466
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2467
+ const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
2468
+ if (single) {
2469
+ console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
2470
+ } else {
2471
+ console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
2227
2472
  }
2228
- if (details?.totalTokens != null) {
2229
- tokenSum += details.totalTokens;
2230
- tokenCount++;
2473
+ }
2474
+ const byLatency = providers.map((id) => {
2475
+ const runs = successByProvider.get(id) ?? [];
2476
+ const avg = average(runs.map((r) => r.raw.latencyMs));
2477
+ return { id, avg: avg ?? Infinity };
2478
+ }).sort((a, b) => a.avg - b.avg)[0];
2479
+ if (byLatency && byLatency.avg !== Infinity) {
2480
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2481
+ const msStr = `${Math.round(byLatency.avg)}ms`;
2482
+ if (single) {
2483
+ console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
2484
+ } else {
2485
+ console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
2231
2486
  }
2232
2487
  }
2233
- return {
2234
- costUsd: costCount > 0 ? costSum / costCount : void 0,
2235
- totalTokens: tokenCount > 0 ? Math.round(tokenSum / tokenCount) : void 0
2236
- };
2237
- }
2238
- function average(nums) {
2239
- if (nums.length === 0) return void 0;
2240
- return nums.reduce((a, b) => a + b, 0) / nums.length;
2241
- }
2242
- function formatCost(usd) {
2243
- if (usd === void 0) return "\u2014";
2244
- if (usd === 0) return "$0.00";
2245
- if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
2246
- const digits = Math.max(4, -Math.floor(Math.log10(usd)) + 1);
2247
- return `~$${usd.toFixed(digits).replace(/0+$/, "")}`;
2248
- }
2249
- function apiKeyHint(providerId, error) {
2250
- const lower = error.toLowerCase();
2251
- const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
2252
- if (!isAuthError) return void 0;
2253
- const prefix = providerId.split("/")[0];
2254
- switch (prefix) {
2255
- case "openai":
2256
- return "Set: export OPENAI_API_KEY=sk-...";
2257
- case "azure":
2258
- return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
2259
- case "anthropic":
2260
- return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
2261
- case "google":
2262
- return "Set: export GOOGLE_API_KEY=...";
2263
- default:
2264
- return `Check the API key for ${providerId}`;
2488
+ const byCost = providers.map((id) => {
2489
+ const runs = successByProvider.get(id) ?? [];
2490
+ const costs = runs.map((r) => {
2491
+ const s = r.scores.find((s2) => s2.name === "cost");
2492
+ return s && s.value >= 0 ? s.value : void 0;
2493
+ }).filter((c) => c !== void 0);
2494
+ const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
2495
+ return { id, avg };
2496
+ }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
2497
+ if (byCost?.avg !== void 0) {
2498
+ const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
2499
+ const costStr = formatCost(byCost.avg);
2500
+ if (single) {
2501
+ console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
2502
+ } else {
2503
+ console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
2504
+ }
2265
2505
  }
2266
- }
2267
- function providerLabel(providerId) {
2268
- const prefix = providerId.split("/")[0];
2269
- switch (prefix) {
2270
- case "azure":
2271
- return "(OpenAI via Azure)";
2272
- case "openai":
2273
- return "(OpenAI)";
2274
- case "anthropic":
2275
- return "(Anthropic)";
2276
- case "google":
2277
- return "(Google)";
2278
- case "mistral":
2279
- return "(Mistral)";
2280
- case "meta":
2281
- return "(Meta)";
2282
- case "deepseek":
2283
- return "(DeepSeek)";
2284
- case "cohere":
2285
- return "(Cohere)";
2286
- case "qwen":
2287
- return "(Qwen)";
2288
- case "xai":
2289
- return "(xAI)";
2290
- case "minimax":
2291
- return "(MiniMax)";
2292
- case "moonshot":
2293
- return "(Moonshot / Kimi)";
2294
- case "perplexity":
2295
- return "(Perplexity)";
2296
- case "amazon":
2297
- return "(Amazon)";
2298
- case "nvidia":
2299
- return "(NVIDIA)";
2300
- case "microsoft":
2301
- return "(Microsoft)";
2302
- case "ai21":
2303
- return "(AI21 Labs)";
2304
- case "bytedance":
2305
- return "(ByteDance)";
2306
- case "together":
2307
- return "(Together AI)";
2308
- case "fireworks":
2309
- return "(Fireworks AI)";
2310
- case "groq":
2311
- return "(Groq)";
2312
- case "cerebras":
2313
- return "(Cerebras)";
2314
- default:
2315
- return `(${prefix})`;
2506
+ if (!single && byCorrectness && byCorrectness.avg > 0) {
2507
+ console.log("");
2508
+ const pct = `${Math.round(byCorrectness.avg * 100)}%`;
2509
+ console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${byCorrectness.id}${reset} ${dim(providerLabel(byCorrectness.id))} ${dim(`(${pct} avg correctness)`)}`);
2316
2510
  }
2511
+ console.log("");
2317
2512
  }
2318
2513
 
2319
2514
  // src/reporter/json.ts
@@ -2345,15 +2540,15 @@ function defineArena(config) {
2345
2540
  if (config.providers.length === 0) {
2346
2541
  throw new Error("At least one provider is required");
2347
2542
  }
2348
- if (config.tasks.length === 0) {
2349
- throw new Error("At least one task is required");
2350
- }
2351
2543
  const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
2352
2544
  const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
2353
2545
  const runs = config.runs ?? 1;
2354
2546
  return {
2355
2547
  config,
2356
2548
  async run(options) {
2549
+ if (config.tasks.length === 0) {
2550
+ throw new Error("At least one task is required");
2551
+ }
2357
2552
  return runBenchmarks({
2358
2553
  providers: config.providers,
2359
2554
  tasks: config.tasks,
@@ -2379,23 +2574,19 @@ function anthropic(model, options) {
2379
2574
  model,
2380
2575
  async run(input) {
2381
2576
  const start = Date.now();
2382
- const systemMessage = input.schema ? "Respond with valid JSON matching the requested schema." : void 0;
2577
+ const systemMessage = input.schema ? buildSchemaSystemMessage(input.schema) : void 0;
2578
+ const reqOpts = { signal: input.signal };
2579
+ if (input.timeout) reqOpts.timeout = input.timeout;
2383
2580
  const response = await client.messages.create({
2384
2581
  model,
2385
2582
  max_tokens: maxTokens,
2386
2583
  system: systemMessage,
2387
2584
  messages: [{ role: "user", content: input.prompt }]
2388
- }, { signal: input.signal });
2585
+ }, reqOpts);
2389
2586
  const latencyMs = Date.now() - start;
2390
2587
  const textBlock = response.content.find((b) => b.type === "text");
2391
2588
  const rawContent = textBlock?.type === "text" ? textBlock.text : "";
2392
- let output = rawContent;
2393
- if (input.schema) {
2394
- try {
2395
- output = JSON.parse(rawContent);
2396
- } catch {
2397
- }
2398
- }
2589
+ const output = parseSchemaOutput(rawContent, !!input.schema);
2399
2590
  return {
2400
2591
  output,
2401
2592
  usage: {
@@ -2409,23 +2600,6 @@ function anthropic(model, options) {
2409
2600
  };
2410
2601
  }
2411
2602
 
2412
- // src/providers/gemini.ts
2413
- import OpenAI3 from "openai";
2414
- function gemini(model, options) {
2415
- const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
2416
- if (!apiKey) {
2417
- throw new Error(
2418
- `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
2419
- );
2420
- }
2421
- const client = new OpenAI3({
2422
- apiKey,
2423
- baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
2424
- timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
2425
- });
2426
- return makeProvider(`google/${model}`, "Google AI", model, client, model);
2427
- }
2428
-
2429
2603
  // src/reporter/markdown.ts
2430
2604
  var COMMENT_MARKER = "<!-- duelist-ci-report -->";
2431
2605
  function markdownReporter(report, _current) {
@@ -2470,7 +2644,7 @@ function markdownComparisonTable(comparisons) {
2470
2644
  for (const c of comparisons) {
2471
2645
  const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
2472
2646
  const currentStr = formatStats(c.current);
2473
- const deltaStr = c.delta !== null ? formatDelta(c.delta) : "\u2014";
2647
+ const deltaStr = c.delta !== null ? formatDelta(c.delta, 3) : "\u2014";
2474
2648
  const status = statusIndicator(c);
2475
2649
  lines.push(`| ${c.providerId} | ${c.taskName} | ${c.scorerName} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
2476
2650
  }
@@ -2503,10 +2677,6 @@ function formatStats(stats) {
2503
2677
  }
2504
2678
  return stats.mean.toFixed(3);
2505
2679
  }
2506
- function formatDelta(delta) {
2507
- const sign = delta >= 0 ? "+" : "";
2508
- return `${sign}${delta.toFixed(3)}`;
2509
- }
2510
2680
  function statusIndicator(c) {
2511
2681
  if (c.regressed) return "\u{1F534} regressed";
2512
2682
  if (c.improved) return "\u{1F7E2} improved";
@@ -2514,6 +2684,778 @@ function statusIndicator(c) {
2514
2684
  return "\u26AA unchanged";
2515
2685
  }
2516
2686
 
2687
+ // src/reporter/html.ts
2688
/**
 * Escape a value for safe inclusion in HTML text or a quoted attribute.
 *
 * The five characters `&`, `<`, `>`, `"` and `'` are replaced by their
 * entity forms in a single pass, so an ampersand produced by one
 * replacement can never be escaped a second time. For string input the
 * result is identical to escaping `&` first and the others afterwards.
 *
 * Non-string input no longer throws: `null` and `undefined` yield the
 * empty string and every other value is converted with `String()` first.
 * This keeps report generation from crashing on a numeric or missing field.
 *
 * @param {*} s - Value to escape.
 * @returns {string} The escaped text.
 */
function esc(s) {
  if (s == null) return "";
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  return String(s).replace(/[&<>"']/g, (ch) => entities[ch]);
}
2691
/**
 * Build a complete HTML report document for a set of benchmark results.
 *
 * Returns the document as one string. An empty results array short-circuits
 * to emptyReport(). Otherwise the results are grouped with groupResults(),
 * each task gets per-provider aggregates, column stats and medals, and a
 * cross-task summary (most correct, fastest, cheapest, overall winner) is
 * computed from the successful results only. Error results are deduplicated
 * and rendered in their own section when any exist.
 *
 * @param {Array} results - Benchmark results; each is read for taskName/providerId
 *   grouping (inside groupResults), error, scores[].name/value and raw.latencyMs.
 * @returns {string} Full HTML document, starting with the doctype line.
 */
+ function htmlReporter(results) {
2692
+ if (results.length === 0) {
2693
+ return emptyReport();
2694
+ }
2695
+ const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
2696
+ const hasCost = scorerNames.includes("cost");
// Winners are only declared when there are at least two providers to compare.
2697
+ const multi = providers.length >= 2;
2698
+ const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";
// Per-task data: aggregates for every provider, column extremes, medals,
// and the provider holding the "gold" medal (if comparing several providers).
2699
+ const taskSections = tasks.map((task) => {
2700
+ const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
2701
+ const columnStats = computeColumnStats(providerData, scorerNames);
2702
+ const medals = computeMedals(columnStats, providers);
2703
+ const winnerId = multi ? [...medals.entries()].find(([, m]) => m === "gold")?.[0] : void 0;
2704
+ return { task, providerData, columnStats, medals, winnerId };
2705
+ });
2706
+ const successResults = results.filter((r) => !r.error);
// Successful (error-free) results per provider, used by all summary rankings below.
2707
+ const successByProvider = /* @__PURE__ */ new Map();
2708
+ for (const id of providers) {
2709
+ successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
2710
+ }
// Rank on the LLM-judge score when any successful result carries a
// non-negative one; otherwise fall back to the plain "correctness" score.
2711
+ const correctnessKey = successResults.some(
2712
+ (r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
2713
+ ) ? "llm-judge-correctness" : "correctness";
2714
+ const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
// Fastest provider: lowest average raw.latencyMs; providers without an
// average sort last because their average is treated as Infinity.
2715
+ const byLatency = providers.map((id) => {
2716
+ const runs = successByProvider.get(id) ?? [];
2717
+ const avg = average(runs.map((r) => r.raw.latencyMs));
2718
+ return { id, avg: avg ?? Infinity };
2719
+ }).sort((a, b) => a.avg - b.avg)[0];
// Cheapest provider: lowest average of non-negative "cost" scores;
// providers with no usable cost score are dropped before sorting.
2720
+ const byCost = providers.map((id) => {
2721
+ const runs = successByProvider.get(id) ?? [];
2722
+ const costs = runs.map((r) => {
2723
+ const s = r.scores.find((s2) => s2.name === "cost");
2724
+ return s && s.value >= 0 ? s.value : void 0;
2725
+ }).filter((c) => c !== void 0);
2726
+ const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
2727
+ return { id, avg };
2728
+ }).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
// The overall winner is the top-ranked provider on correctness, and only
// when several providers compete and that average is above zero.
2729
+ let overallWinner;
2730
+ if (multi && byCorrectness && byCorrectness.avg > 0) {
2731
+ overallWinner = byCorrectness.id;
2732
+ }
2733
+ const errorResults = results.filter((r) => r.error);
2734
+ const deduped = dedupeErrors(errorResults);
// Everything from here to the closing backtick is the emitted document text.
2735
+ return `<!DOCTYPE html>
2736
+ <html lang="en">
2737
+ <head>
2738
+ <meta charset="UTF-8">
2739
+ <meta name="viewport" content="width=device-width, initial-scale=1">
2740
+ <title>Agent Duelist Report</title>
2741
+ <meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${providers.length !== 1 ? "s" : ""}, ${tasks.length} task${tasks.length !== 1 ? "s" : ""}">
2742
+ <meta property="og:title" content="Agent Duelist Report">
2743
+ <meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
2744
+ <meta property="og:type" content="website">
2745
+ ${renderStyle()}
2746
+ </head>
2747
+ <body>
2748
+ <div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
2749
+ <div class="report">
2750
+
2751
+ ${renderHeader(runsLabel, providers.length, tasks.length)}
2752
+
2753
+ ${tasks.length > 1 ? renderTabs(tasks) : ""}
2754
+
2755
+ <main>
2756
+ ${taskSections.map((s, i) => renderTaskSection(
2757
+ s.task,
2758
+ s.providerData,
2759
+ s.columnStats,
2760
+ s.medals,
2761
+ s.winnerId,
2762
+ scorerNames,
2763
+ hasCost,
2764
+ multi,
2765
+ i
2766
+ )).join("\n")}
2767
+ </main>
2768
+
2769
+ ${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}
2770
+
2771
+ ${deduped.length > 0 ? renderErrors(deduped) : ""}
2772
+
2773
+ ${renderFooter()}
2774
+
2775
+ </div>
2776
+ ${renderScript(tasks.length)}
2777
+ </body>
2778
+ </html>`;
2779
+ }
2780
/**
 * Build the HTML document shown when there are no results at all.
 *
 * Uses the same renderStyle(), renderHeader() and renderFooter() helpers as
 * the full report, with the header called as ("0 runs", 0, 0), and a single
 * "No results to display." message as the main content. This document
 * contains no task tabs, summary, error section or script.
 *
 * @returns {string} Full HTML document, starting with the doctype line.
 */
+ function emptyReport() {
2781
+ return `<!DOCTYPE html>
2782
+ <html lang="en">
2783
+ <head>
2784
+ <meta charset="UTF-8">
2785
+ <meta name="viewport" content="width=device-width, initial-scale=1">
2786
+ <title>Agent Duelist Report</title>
2787
+ ${renderStyle()}
2788
+ </head>
2789
+ <body>
2790
+ <div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
2791
+ <div class="report">
2792
+ ${renderHeader("0 runs", 0, 0)}
2793
+ <main><p class="empty-msg">No results to display.</p></main>
2794
+ ${renderFooter()}
2795
+ </div>
2796
+ </body>
2797
+ </html>`;
2798
+ }
2799
/**
 * Collapse repeated errors into one entry per (provider, error message) pair.
 *
 * Entries come back in the order each pair was first seen. The first
 * occurrence creates the entry, including its API-key hint; every later
 * occurrence only increments the entry's count.
 *
 * @param {Iterable<{providerId: string, error?: string}>} errorResults - Results that failed.
 * @returns {Array<{providerId: string, error: string, count: number, hint: (string|undefined)}>}
 *   One record per distinct pair; a missing error message is reported as "Unknown error".
 */
function dedupeErrors(errorResults) {
  const buckets = new Map();
  for (const result of errorResults) {
    const bucketKey = `${result.providerId}::${result.error}`;
    const bucket = buckets.get(bucketKey);
    if (!bucket) {
      buckets.set(bucketKey, {
        providerId: result.providerId,
        error: result.error ?? "Unknown error",
        count: 1,
        hint: apiKeyHint(result.providerId, result.error ?? "")
      });
      continue;
    }
    bucket.count += 1;
  }
  return Array.from(buckets.values());
}
2817
+ function renderStyle() {
2818
+ return `<style>
2819
+ :root {
2820
+ --bg: #0f172a;
2821
+ --bg-deep: #020617;
2822
+ --panel: rgba(15, 23, 42, 0.85);
2823
+ --accent: #f59e0b;
2824
+ --accent-soft: rgba(245, 158, 11, 0.15);
2825
+ --text: #e2e8f0;
2826
+ --muted: #94a3b8;
2827
+ --border: rgba(148, 163, 184, 0.15);
2828
+ --green: #22c55e;
2829
+ --red: #ef4444;
2830
+ --yellow: #eab308;
2831
+ --radius: 12px;
2832
+ --mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
2833
+ --sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
2834
+ }
2835
+ *, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
2836
+ html, body {
2837
+ font-family: var(--sans);
2838
+ background: var(--bg);
2839
+ color: var(--text);
2840
+ min-height: 100vh;
2841
+ }
2842
+ body { padding: 24px; display: flex; justify-content: center; }
2843
+
2844
+ /* Animated gradient mesh */
2845
+ .bg-mesh {
2846
+ position: fixed; inset: 0; z-index: 0;
2847
+ overflow: hidden; pointer-events: none;
2848
+ }
2849
+ .bg-mesh::before, .bg-mesh::after {
2850
+ content: ""; position: absolute; border-radius: 50%;
2851
+ filter: blur(120px); opacity: 0.4;
2852
+ }
2853
+ .bg-mesh::before {
2854
+ width: 600px; height: 600px;
2855
+ background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
2856
+ top: -10%; left: -5%;
2857
+ animation: meshDrift1 18s ease-in-out infinite alternate;
2858
+ }
2859
+ .bg-mesh::after {
2860
+ width: 500px; height: 500px;
2861
+ background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
2862
+ bottom: -10%; right: -5%;
2863
+ animation: meshDrift2 22s ease-in-out infinite alternate;
2864
+ }
2865
+ .bg-mesh-extra {
2866
+ position: absolute; width: 400px; height: 400px;
2867
+ border-radius: 50%; filter: blur(100px); opacity: 0.3;
2868
+ background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
2869
+ top: 50%; left: 60%;
2870
+ animation: meshDrift3 15s ease-in-out infinite alternate;
2871
+ }
2872
+ @keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
2873
+ @keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
2874
+ @keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }
2875
+
2876
+ /* Report container */
2877
+ .report {
2878
+ position: relative; z-index: 1;
2879
+ width: 100%; max-width: 960px;
2880
+ }
2881
+
2882
+ /* Header */
2883
+ .report-header {
2884
+ display: flex; justify-content: space-between; align-items: center;
2885
+ padding: 20px 0; margin-bottom: 8px;
2886
+ }
2887
+ .report-brand {
2888
+ display: flex; align-items: center; gap: 10px;
2889
+ text-decoration: none; color: var(--muted);
2890
+ font-weight: 600; font-size: 14px;
2891
+ letter-spacing: 0.04em; text-transform: uppercase;
2892
+ }
2893
+ .report-brand:hover { color: var(--text); }
2894
+ .brand-icon {
2895
+ width: 32px; height: 32px; border-radius: 8px;
2896
+ background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
2897
+ border: 1px solid rgba(245,158,11,0.3);
2898
+ display: flex; align-items: center; justify-content: center;
2899
+ font-size: 16px;
2900
+ }
2901
+ .report-meta {
2902
+ font-size: 12px; color: var(--muted);
2903
+ text-align: right; line-height: 1.6;
2904
+ }
2905
+
2906
+ /* Task tabs */
2907
+ .task-tabs {
2908
+ display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
2909
+ }
2910
+ .task-tab {
2911
+ padding: 6px 16px; border-radius: 999px;
2912
+ border: 1px solid var(--border);
2913
+ background: transparent; color: var(--muted);
2914
+ font-size: 13px; font-weight: 500; cursor: pointer;
2915
+ transition: all 150ms ease;
2916
+ }
2917
+ .task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
2918
+ .task-tab.active {
2919
+ background: var(--accent-soft);
2920
+ border-color: rgba(245,158,11,0.4);
2921
+ color: var(--accent);
2922
+ }
2923
+
2924
+ /* Task sections */
2925
+ .task-section { display: none; }
2926
+ .task-section.active { display: block; }
2927
+ .task-name {
2928
+ font-size: 18px; font-weight: 600;
2929
+ margin-bottom: 12px; letter-spacing: -0.01em;
2930
+ }
2931
+
2932
+ /* Results table */
2933
+ .results-table {
2934
+ width: 100%; border-collapse: collapse;
2935
+ font-size: 13px; margin-bottom: 16px;
2936
+ border-radius: var(--radius); overflow: hidden;
2937
+ border: 1px solid var(--border);
2938
+ }
2939
+ .results-table th, .results-table td {
2940
+ padding: 10px 14px;
2941
+ text-align: left;
2942
+ border-bottom: 1px solid var(--border);
2943
+ }
2944
+ .results-table th {
2945
+ background: rgba(0,0,0,0.3);
2946
+ font-size: 11px; font-weight: 600;
2947
+ text-transform: uppercase; letter-spacing: 0.05em;
2948
+ color: var(--muted); cursor: pointer;
2949
+ user-select: none; white-space: nowrap;
2950
+ }
2951
+ .results-table th:hover { color: var(--text); }
2952
+ .results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
2953
+ .results-table tbody tr {
2954
+ background: var(--panel);
2955
+ transition: background 120ms ease;
2956
+ }
2957
+ .results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
2958
+ .results-table tbody tr:last-child td { border-bottom: none; }
2959
+
2960
+ /* Score cell with progress bar */
2961
+ .score-cell { position: relative; min-width: 90px; }
2962
+ .score-bar {
2963
+ position: absolute; left: 0; bottom: 0;
2964
+ height: 3px; border-radius: 2px;
2965
+ transition: width 300ms ease;
2966
+ }
2967
+ .score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }
2968
+
2969
+ /* Color ranking */
2970
+ .rank-best { color: var(--green); font-weight: 600; }
2971
+ .rank-worst { color: var(--red); }
2972
+ .rank-mid { color: var(--yellow); }
2973
+ .rank-neutral { color: var(--text); }
2974
+ .rank-error { color: var(--muted); }
2975
+
2976
+ /* Winner banner */
2977
+ .task-winner {
2978
+ display: flex; align-items: center; gap: 10px;
2979
+ padding: 12px 18px; margin-bottom: 20px;
2980
+ border-radius: var(--radius);
2981
+ background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
2982
+ border: 1px solid rgba(34,197,94,0.2);
2983
+ font-size: 14px; font-weight: 500;
2984
+ }
2985
+ .task-winner .trophy { font-size: 20px; }
2986
+ .task-winner .winner-name { color: var(--green); font-weight: 600; }
2987
+ .task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }
2988
+
2989
+ /* Summary cards */
2990
+ .summary-section { margin-top: 32px; }
2991
+ .summary-title {
2992
+ font-size: 16px; font-weight: 600;
2993
+ margin-bottom: 12px; color: var(--text);
2994
+ }
2995
+ .summary-cards {
2996
+ display: grid;
2997
+ grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
2998
+ gap: 12px;
2999
+ }
3000
+ .summary-card {
3001
+ padding: 16px; border-radius: var(--radius);
3002
+ border: 1px solid var(--border);
3003
+ background: var(--panel);
3004
+ }
3005
+ .summary-card .card-label {
3006
+ font-size: 11px; font-weight: 600;
3007
+ text-transform: uppercase; letter-spacing: 0.05em;
3008
+ color: var(--muted); margin-bottom: 6px;
3009
+ }
3010
+ .summary-card .card-value {
3011
+ font-size: 20px; font-weight: 700;
3012
+ color: var(--green); font-family: var(--mono);
3013
+ }
3014
+ .summary-card .card-provider {
3015
+ font-size: 12px; color: var(--muted); margin-top: 4px;
3016
+ }
3017
+
3018
+ /* Errors */
3019
+ .errors-section { margin-top: 24px; }
3020
+ .errors-title {
3021
+ font-size: 16px; font-weight: 600;
3022
+ margin-bottom: 8px; color: var(--red);
3023
+ cursor: pointer;
3024
+ }
3025
+ .errors-list {
3026
+ border-radius: var(--radius);
3027
+ border: 1px solid rgba(239,68,68,0.2);
3028
+ background: rgba(239,68,68,0.04);
3029
+ overflow: hidden;
3030
+ }
3031
+ .error-item {
3032
+ padding: 10px 16px;
3033
+ border-bottom: 1px solid rgba(239,68,68,0.1);
3034
+ font-size: 13px;
3035
+ }
3036
+ .error-item:last-child { border-bottom: none; }
3037
+ .error-provider { font-weight: 600; color: var(--text); }
3038
+ .error-msg { color: var(--muted); margin-left: 8px; }
3039
+ .error-count { color: var(--muted); font-size: 11px; }
3040
+ .error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }
3041
+
3042
+ /* Footer */
3043
+ .report-footer {
3044
+ margin-top: 40px; padding: 20px 0;
3045
+ border-top: 1px solid var(--border);
3046
+ display: flex; justify-content: space-between; align-items: center;
3047
+ flex-wrap: wrap; gap: 12px;
3048
+ }
3049
+ .footer-brand {
3050
+ font-size: 13px; color: var(--muted);
3051
+ }
3052
+ .footer-brand a {
3053
+ color: var(--accent); text-decoration: none; font-weight: 500;
3054
+ }
3055
+ .footer-brand a:hover { text-decoration: underline; }
3056
+ .footer-cta {
3057
+ display: inline-flex; align-items: center; gap: 6px;
3058
+ padding: 6px 14px; border-radius: 8px;
3059
+ background: var(--accent-soft);
3060
+ border: 1px solid rgba(245,158,11,0.3);
3061
+ color: var(--accent); font-size: 12px; font-weight: 500;
3062
+ text-decoration: none;
3063
+ transition: transform 120ms ease, box-shadow 120ms ease;
3064
+ }
3065
+ .footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }
3066
+
3067
+ /* Empty state */
3068
+ .empty-msg {
3069
+ text-align: center; color: var(--muted);
3070
+ padding: 60px 20px; font-size: 16px;
3071
+ }
3072
+
3073
+ /* Responsive */
3074
+ @media (max-width: 640px) {
3075
+ body { padding: 12px; }
3076
+ .report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
3077
+ .report-meta { text-align: left; }
3078
+ .summary-cards { grid-template-columns: 1fr; }
3079
+ .results-table { font-size: 12px; }
3080
+ .results-table th, .results-table td { padding: 8px 10px; }
3081
+ .report-footer { flex-direction: column; align-items: flex-start; }
3082
+ }
3083
+ </style>`;
3084
+ }
3085
/**
 * Build the report's top banner: the linked brand mark plus a meta line with
 * provider count, task count, the runs label and the generation timestamp.
 *
 * @param {string} runsLabel - Text describing the runs; passed through `esc` (defined elsewhere in this file).
 * @param {number} providerCount - Number of providers; the noun is pluralised unless the count is exactly 1.
 * @param {number} taskCount - Number of tasks; the noun is pluralised unless the count is exactly 1.
 * @returns {string} The `<header class="report-header">` markup.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  // ISO timestamps look like 2026-01-15T12:34:56.789Z; keep the first 19 chars
  // ("YYYY-MM-DD HH:MM:SS" once the "T" is replaced) and label it as UTC.
  const isoStamp = (/* @__PURE__ */ new Date()).toISOString();
  const generatedAt = isoStamp.replace("T", " ").slice(0, 19) + " UTC";
  const providerPlural = providerCount !== 1 ? "s" : "";
  const taskPlural = taskCount !== 1 ? "s" : "";
  const lines = [
    `<header class="report-header">`,
    `<a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">`,
    `<div class="brand-icon">&#x2B21;</div>`,
    `<span>Agent Duelist</span>`,
    `</a>`,
    `<div class="report-meta">`,
    `${providerCount} provider${providerPlural} &middot;`,
    `${taskCount} task${taskPlural} &middot;`,
    `${esc(runsLabel)}<br>`,
    `${esc(generatedAt)}`,
    `</div>`,
    `</header>`
  ];
  return lines.join("\n");
}
3100
/**
 * Build the tab strip with one button per task.
 *
 * The first button starts with the `active` class. Each button carries its
 * zero-based position in `data-task`, which the inline report script reads to
 * find the matching section.
 *
 * @param {string[]} tasks - Task names, in display order.
 * @returns {string} The `<nav class="task-tabs">` markup.
 */
function renderTabs(tasks) {
  const buttonHtml = [];
  tasks.forEach((taskName, position) => {
    const activeClass = position === 0 ? " active" : "";
    buttonHtml.push(
      `<button class="task-tab${activeClass}" data-task="${position}">${esc(taskName)}</button>`
    );
  });
  return [`<nav class="task-tabs">`, buttonHtml.join("\n "), `</nav>`].join("\n");
}
3108
/**
 * Render one task's section: heading, sortable results table and, when a
 * winner id is given, the winner banner.
 *
 * Columns are derived from `scorerNames`: "latency" adds a Latency column,
 * "cost" adds Cost and Tokens columns, and any other scorer adds a score
 * column labelled via `scorerLabel`. A leading Provider column is always present.
 *
 * @param {string} task - Task name shown in the heading.
 * @param {Array<object>} providerData - One entry per provider; `providerId` and `allErrors` are read here.
 * @param {Map<string, object>} columnStats - Per-column stats, forwarded to `renderDataCell`.
 * @param {Map<string, string>} medals - Medal key per provider id; missing ids fall back to "none".
 * @param {string|undefined} winnerId - Provider id of the task winner; falsy suppresses the banner.
 * @param {Iterable<string>} scorerNames - Scorer names, in column order.
 * @param {boolean} _hasCost - Unused.
 * @param {boolean} multi - Forwarded to `renderDataCell`.
 * @param {number} index - Zero-based section index; section 0 starts with the `active` class.
 * @returns {string} The `<section class="task-section">` markup.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  const columns = [{ label: "Provider", key: "provider", isScore: false }];
  for (const scorer of scorerNames) {
    switch (scorer) {
      case "latency":
        columns.push({ label: "Latency", key: "latency", isScore: false });
        break;
      case "cost":
        // The cost scorer contributes two columns.
        columns.push({ label: "Cost", key: "cost", isScore: false });
        columns.push({ label: "Tokens", key: "tokens", isScore: false });
        break;
      default:
        columns.push({ label: scorerLabel(scorer), key: scorer, isScore: true });
    }
  }

  let headerCells = "";
  for (const column of columns) {
    headerCells += `<th data-col="${esc(column.key)}">${esc(column.label)}<span class="sort-arrow"></span></th>`;
  }

  // Everything after the Provider column is a metric column.
  const metricColumns = columns.slice(1);
  const bodyRows = [];
  for (const pd of providerData) {
    const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
    const medalPrefix = medal ? `${medal} ` : "";
    let rowCells = `<td>${medalPrefix}${esc(pd.providerId)}</td>`;
    if (pd.allErrors) {
      // A provider whose runs all failed gets a dash in every metric column.
      for (let filled = 0; filled < metricColumns.length; filled++) {
        rowCells += `<td class="rank-error">&mdash;</td>`;
      }
    } else {
      for (const column of metricColumns) {
        rowCells += renderDataCell(column.key, column.isScore, pd, columnStats, multi);
      }
    }
    bodyRows.push(`<tr>${rowCells}</tr>`);
  }

  let winnerBanner = "";
  if (winnerId) {
    winnerBanner = [
      `<div class="task-winner">`,
      `<span class="trophy">&#x1F3C6;</span>`,
      `<span>Winner: <span class="winner-name">${esc(winnerId)}</span>`,
      `<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>`,
      `</div>`
    ].join("\n");
  }

  const activeClass = index === 0 ? " active" : "";
  return [
    `<section class="task-section${activeClass}" data-task-idx="${index}">`,
    `<h2 class="task-name">${esc(task)}</h2>`,
    `<table class="results-table">`,
    `<thead><tr>${headerCells}</tr></thead>`,
    `<tbody>${bodyRows.join("\n")}</tbody>`,
    `</table>`,
    winnerBanner,
    `</section>`
  ].join("\n");
}
3155
/**
 * Render a single metric `<td>` for one provider row.
 *
 * "latency", "cost" and "tokens" are shown as raw metrics. Any other key is
 * looked up in `pd.avgScores`, treated as a fraction and drawn as a
 * percentage with a coloured bar. A missing value always yields a dash cell.
 *
 * @param {string} key - Column key ("latency", "cost", "tokens" or a scorer name).
 * @param {boolean} _isScore - Unused.
 * @param {object} pd - Provider row; reads `latencyMs`, `avgDetails.costUsd`, `avgDetails.totalTokens`, `avgScores[key]`.
 * @param {Map<string, object>} columnStats - Stats per column key; passed to `rankClass_` when ranking relatively.
 * @param {boolean} multi - When true and stats exist for the column, rank relative to the other providers.
 * @returns {string} The `<td>` markup.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const stats = columnStats.get(key);
  const missingCell = `<td class="rank-error">&mdash;</td>`;
  // Raw metrics are only coloured when there are peers to compare against.
  const relativeRank = (value) => (multi && stats ? rankClass_(value, stats) : "rank-neutral");

  switch (key) {
    case "latency": {
      const millis = pd.latencyMs;
      if (millis === undefined) return missingCell;
      return `<td class="${relativeRank(millis)}" data-sort-val="${millis}">${Math.round(millis)}ms</td>`;
    }
    case "cost": {
      const usd = pd.avgDetails.costUsd;
      if (usd === undefined) return missingCell;
      return `<td class="${relativeRank(usd)}" data-sort-val="${usd}">${esc(formatCost(usd))}</td>`;
    }
    case "tokens": {
      const tokenCount = pd.avgDetails.totalTokens;
      if (tokenCount === undefined) return missingCell;
      return `<td class="${relativeRank(tokenCount)}" data-sort-val="${tokenCount}">${tokenCount}</td>`;
    }
    default:
      break;
  }

  const score = pd.avgScores[key];
  if (score === undefined) return missingCell;
  const percent = Math.round(score * 100);

  // Absolute bands: 0.8 and above is green, 0.5 and above is yellow, otherwise red.
  const band = (best, mid, worst) => {
    if (score >= 0.8) return best;
    if (score >= 0.5) return mid;
    return worst;
  };
  // The text colour uses the relative rank when available; the bar always uses the absolute band.
  const textClass = multi && stats ? rankClass_(score, stats) : band("rank-best", "rank-mid", "rank-worst");
  const barColor = band("var(--green)", "var(--yellow)", "var(--red)");

  return [
    `<td class="score-cell ${textClass}" data-sort-val="${score}">`,
    `<span class="score-val">${percent}%</span>`,
    `<div class="score-bar" style="width:${percent}%;background:${barColor}"></div>`,
    `</td>`
  ].join("\n");
}
3190
/**
 * Map a value to a ranking CSS class relative to a column's best and worst values.
 *
 * @param {number} value - The cell's value.
 * @param {{best?: number, worst?: number}} colStats - Best and worst values for the column.
 * @returns {"rank-neutral"|"rank-best"|"rank-worst"|"rank-mid"} "rank-neutral" when either
 *   bound is missing or both bounds are equal; otherwise best, worst or mid by strict equality.
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  const cannotRank = best === undefined || worst === undefined || best === worst;
  if (cannotRank) return "rank-neutral";
  // switch compares with ===, and the best bound is checked before the worst.
  switch (value) {
    case best:
      return "rank-best";
    case worst:
      return "rank-worst";
    default:
      return "rank-mid";
  }
}
3197
/**
 * Render the "Summary" section as a grid of up to four cards: correctness,
 * latency, cost and overall winner.
 *
 * With `multi` true, each metric card names the leading provider and uses a
 * superlative label ("Most Correct", "Fastest", "Cheapest"). Otherwise it
 * shows an "Avg ..." label and no provider line.
 *
 * @param {{id: string, avg: number}|undefined} byCorrectness - Correctness leader; `avg` is rendered as a percentage.
 * @param {{id: string, avg: number}|undefined} byLatency - Latency leader; skipped when `avg` is Infinity.
 * @param {{id: string, avg?: number}|undefined} byCost - Cost leader; skipped when `avg` is undefined.
 * @param {string|undefined} overallWinner - Provider id of the overall winner; falsy omits the card.
 * @param {boolean} multi - Whether several providers are being compared.
 * @returns {string} The `<section class="summary-section">` markup, or "" when no card applies.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  // Every card has the same frame; an empty providerHtml still occupies its own line.
  const buildCard = (label, valueHtml, providerHtml) =>
    [
      `<div class="summary-card">`,
      `<div class="card-label">${label}</div>`,
      `<div class="card-value">${valueHtml}</div>`,
      providerHtml,
      `</div>`
    ].join("\n");
  const providerLine = (id) =>
    `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  const leaderLine = (entry) => (multi ? providerLine(entry.id) : "");

  const cards = [];
  if (byCorrectness) {
    const percentText = `${Math.round(byCorrectness.avg * 100)}%`;
    cards.push(buildCard(multi ? "Most Correct" : "Avg Correctness", percentText, leaderLine(byCorrectness)));
  }
  if (byLatency && byLatency.avg !== Infinity) {
    const millisText = `${Math.round(byLatency.avg)}ms`;
    cards.push(buildCard(multi ? "Fastest" : "Avg Latency", millisText, leaderLine(byLatency)));
  }
  if (byCost?.avg !== undefined) {
    const costText = esc(formatCost(byCost.avg));
    cards.push(buildCard(multi ? "Cheapest" : "Avg Cost", costText, leaderLine(byCost)));
  }
  if (overallWinner) {
    // The winner card always names the provider, regardless of `multi`.
    cards.push(buildCard("Overall Winner", "&#x1F3C6;", providerLine(overallWinner)));
  }

  if (cards.length === 0) return "";
  return [
    `<section class="summary-section">`,
    `<h2 class="summary-title">Summary</h2>`,
    `<div class="summary-cards">`,
    cards.join("\n "),
    `</div>`,
    `</section>`
  ].join("\n");
}
3241
/**
 * Render the "Errors" section: a heading that toggles the list's visibility
 * on click, followed by one item per error entry.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   Error entries. A "(×N)" suffix is added when `count` is greater than 1, and a hint line when `hint` is truthy.
 * @returns {string} The `<section class="errors-section">` markup.
 */
function renderErrors(errors) {
  const itemHtml = [];
  for (const entry of errors) {
    const repeatSuffix =
      entry.count > 1 ? ` <span class="error-count">(&times;${entry.count})</span>` : "";
    const hintLine = entry.hint ? `<div class="error-hint">${esc(entry.hint)}</div>` : "";
    itemHtml.push(
      [
        `<div class="error-item">`,
        `<span class="error-provider">${esc(entry.providerId)}:</span>`,
        `<span class="error-msg">${esc(entry.error)}</span>${repeatSuffix}`,
        hintLine,
        `</div>`
      ].join("\n")
    );
  }
  // The inline handler flips the display of the element right after the heading (the list).
  const toggle =
    "this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'";
  return [
    `<section class="errors-section">`,
    `<h2 class="errors-title" onclick="${toggle}">Errors</h2>`,
    `<div class="errors-list">`,
    itemHtml.join("\n"),
    `</div>`,
    `</section>`
  ].join("\n");
}
3258
/**
 * Render the static report footer: a "Powered by" credit and a
 * "Star on GitHub" call-to-action, both linking to the project repository.
 *
 * @returns {string} The `<footer class="report-footer">` markup.
 */
function renderFooter() {
  const repoUrl = "https://github.com/DataGobes/agent-duelist";
  const linkAttrs = `href="${repoUrl}" target="_blank" rel="noopener"`;
  return [
    `<footer class="report-footer">`,
    `<div class="footer-brand">`,
    `Powered by <a ${linkAttrs}>Agent Duelist</a>`,
    `</div>`,
    `<a class="footer-cta" ${linkAttrs}>`,
    `&#x2B50; Star on GitHub`,
    `</a>`,
    `</footer>`
  ].join("\n");
}
3268
/**
 * Build the inline `<script>` that makes the HTML report interactive.
 *
 * The emitted script does two things:
 *  - Tab switching (only emitted when `taskCount > 1`): clicking a `.task-tab`
 *    activates the `.task-section` whose position matches the tab's `data-task`.
 *  - Column sorting: clicking a `.results-table th` sorts that table's body
 *    rows by the clicked column. It compares numerically when both cells carry
 *    `data-sort-val` and by text otherwise, and flips direction on each click.
 *
 * Fix: the column index used to come from the header's position in a
 * document-wide `querySelectorAll` list. In every table after the first, that
 * index was offset by the preceding tables' column counts, so the row lookup
 * returned `undefined` and the click handler threw. The index is now taken
 * from `th.cellIndex`, which is relative to the header's own row. `parseInt`
 * is also given an explicit radix.
 *
 * @param {number} taskCount - Number of task sections in the report.
 * @returns {string} A complete `<script>...</script>` element.
 */
function renderScript(taskCount) {
  // With a single task there are no tabs to wire up.
  const tabSwitching = taskCount > 1 ? `
  var tabs = document.querySelectorAll('.task-tab');
  var sections = document.querySelectorAll('.task-section');
  tabs.forEach(function(tab) {
    tab.addEventListener('click', function() {
      var idx = parseInt(tab.getAttribute('data-task'), 10);
      tabs.forEach(function(t) { t.classList.remove('active'); });
      sections.forEach(function(s) { s.classList.remove('active'); });
      tab.classList.add('active');
      sections[idx].classList.add('active');
    });
  });` : "";
  // The doubled backslashes keep the \u escapes literal in the emitted script,
  // so the browser resolves them to the arrow glyphs.
  return `<script>
(function() {
  /* Tab switching */
  ${tabSwitching}

  /* Column sorting */
  document.querySelectorAll('.results-table th').forEach(function(th) {
    var table = th.closest('table');
    var asc = true;
    th.addEventListener('click', function() {
      /* Column position inside this table's own header row. */
      var colIdx = th.cellIndex;
      var tbody = table.querySelector('tbody');
      var rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort(function(a, b) {
        var aCell = a.children[colIdx];
        var bCell = b.children[colIdx];
        var aVal = aCell.getAttribute('data-sort-val');
        var bVal = bCell.getAttribute('data-sort-val');
        if (aVal !== null && bVal !== null) {
          return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
        }
        var aText = aCell.textContent || '';
        var bText = bCell.textContent || '';
        return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
      });
      rows.forEach(function(row) { tbody.appendChild(row); });

      /* Update sort arrows */
      table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
      th.querySelector('.sort-arrow').textContent = asc ? ' \\u25B2' : ' \\u25BC';
      asc = !asc;
    });
  });
})();
</script>`;
}
3315
+
3316
+ // src/packs/structured-output.ts
3317
+ import { z } from "zod";
3318
/**
 * Built-in "structured-output" task pack.
 *
 * Each task pairs a prompt with the exact `expected` value and a Zod `schema`
 * describing the output shape. The pack lists the scorers
 * correctness, schema-correctness, latency and cost.
 */
var structuredOutputPack = {
  name: "structured-output",
  label: "Structured Output",
  description: "Zod schema stress test \u2014 flat objects, nesting, arrays, enums, empty arrays, and adversarial input",
  tasks: [
    // Flat object with string and number fields.
    {
      name: "so:flat-entity",
      prompt: "Extract the person's details from this text: 'Maria Garcia, age 34, works as a software architect in Barcelona, Spain. Her employee ID is EMP-2847.' Return as JSON.",
      expected: {
        name: "Maria Garcia",
        age: 34,
        role: "software architect",
        city: "Barcelona",
        country: "Spain",
        employeeId: "EMP-2847"
      },
      schema: z.object({
        name: z.string(),
        age: z.number(),
        role: z.string(),
        city: z.string(),
        country: z.string(),
        employeeId: z.string()
      })
    },
    // Three nested objects, one with an enum field.
    {
      name: "so:nested-object",
      prompt: "Parse this shipping label into structured JSON: 'Ship to: Acme Corp, Attn: John Lee, 4th Floor, 742 Evergreen Terrace, Springfield, IL 62704, USA. Order #ORD-9912, 3 items, 2.4kg, express shipping.' Use shippingMethod values: standard, express, or overnight. Return as JSON.",
      expected: {
        recipient: {
          company: "Acme Corp",
          contact: "John Lee",
          floor: "4th Floor"
        },
        address: {
          street: "742 Evergreen Terrace",
          city: "Springfield",
          state: "IL",
          zip: "62704",
          country: "USA"
        },
        order: {
          id: "ORD-9912",
          itemCount: 3,
          weightKg: 2.4,
          shippingMethod: "express"
        }
      },
      schema: z.object({
        recipient: z.object({
          company: z.string(),
          contact: z.string(),
          floor: z.string()
        }),
        address: z.object({
          street: z.string(),
          city: z.string(),
          state: z.string(),
          zip: z.string(),
          country: z.string()
        }),
        order: z.object({
          id: z.string(),
          itemCount: z.number(),
          weightKg: z.number(),
          shippingMethod: z.enum(["standard", "express", "overnight"])
        })
      })
    },
    // Top-level array of uniform objects.
    {
      name: "so:array-of-objects",
      prompt: "Extract all mentioned products with their prices and categories from this text: 'Our summer sale includes the UltraWidget Pro ($49.99, Electronics), ComfortMax Chair ($199.00, Furniture), and AquaPure Filter ($24.50, Home & Kitchen). The SmartLamp Mini is also available at $34.99 in the Electronics category.' Return as a JSON array.",
      expected: [
        { name: "UltraWidget Pro", price: 49.99, category: "Electronics" },
        { name: "ComfortMax Chair", price: 199, category: "Furniture" },
        { name: "AquaPure Filter", price: 24.5, category: "Home & Kitchen" },
        { name: "SmartLamp Mini", price: 34.99, category: "Electronics" }
      ],
      schema: z.array(
        z.object({
          name: z.string(),
          price: z.number(),
          category: z.string()
        })
      )
    },
    // The expected answer contains empty arrays.
    {
      name: "so:empty-arrays",
      prompt: "Extract all error codes and their severity levels from this log message: 'System health check completed at 14:32 UTC. All services operational. No warnings or errors detected. Uptime: 99.97%.' Classify status as one of: healthy, degraded, or down. Return as JSON.",
      expected: {
        errors: [],
        warnings: [],
        status: "healthy",
        uptimePercent: 99.97
      },
      schema: z.object({
        errors: z.array(z.object({ code: z.string(), severity: z.string() })),
        warnings: z.array(z.string()),
        status: z.enum(["healthy", "degraded", "down"]),
        uptimePercent: z.number()
      })
    },
    // Array of objects whose fields are restricted to enum values.
    {
      name: "so:enum-classification",
      prompt: "Classify each of these support tickets by priority (low/medium/high/critical) and category (billing/technical/account/general). Use just the letter (A, B, C, D) as the id.\nTicket A: 'My account was charged twice for the same subscription.'\nTicket B: 'The API returns 500 errors intermittently.'\nTicket C: 'How do I update my display name?'\nTicket D: 'Production database is completely unresponsive, all services down.'\nReturn as a JSON array.",
      expected: [
        { id: "A", priority: "high", category: "billing" },
        { id: "B", priority: "high", category: "technical" },
        { id: "C", priority: "low", category: "account" },
        { id: "D", priority: "critical", category: "technical" }
      ],
      schema: z.array(
        z.object({
          id: z.string(),
          priority: z.enum(["low", "medium", "high", "critical"]),
          category: z.enum(["billing", "technical", "account", "general"])
        })
      )
    },
    // Input text is sprinkled with JSON-like fragments that must be ignored.
    {
      name: "so:adversarial-input",
      prompt: `Extract the actual product review data from this messy input. Ignore any JSON-like noise in the text.

User said: 'I bought the {product: "fake"} headphones for $59.99 and they're great! Rating: 5/5. The "noise-cancelling" feature works well even in {"noisy": true} environments. Would recommend to friend=true. Purchased on 01/15/2026.'
Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
      expected: {
        product: "headphones",
        price: 59.99,
        rating: 5,
        maxRating: 5,
        features: ["noise-cancelling"],
        recommended: true,
        purchaseDate: "2026-01-15"
      },
      schema: z.object({
        product: z.string(),
        price: z.number(),
        rating: z.number(),
        maxRating: z.number(),
        features: z.array(z.string()),
        recommended: z.boolean(),
        purchaseDate: z.string()
      })
    }
  ],
  scorers: ["correctness", "schema-correctness", "latency", "cost"]
};
3435
+
3436
+ // src/packs/index.ts
3437
// Module-level pack registry, keyed by pack name.
var registry = /* @__PURE__ */ new Map();

/**
 * Add a pack to the registry under its `name`. Registering a second pack
 * with the same name replaces the first.
 *
 * @param {{name: string}} pack - The pack definition to register.
 * @returns {void}
 */
function register(pack) {
  const { name: packName } = pack;
  registry.set(packName, pack);
}
3441
// Register the built-in structured-output pack at module load so that loadPack/listPacks can see it.
+ register(structuredOutputPack);
3442
/**
 * Look up a registered pack by name.
 *
 * @param {string} name - Pack name as used at registration.
 * @returns {object} The registered pack.
 * @throws {Error} When no pack is registered under `name`; the message lists the registered names.
 */
function loadPack(name) {
  const found = registry.get(name);
  if (found) return found;
  const knownNames = Array.from(registry.keys()).join(", ");
  throw new Error(`Unknown pack "${name}". Available packs: ${knownNames}`);
}
3450
/**
 * Summarise every registered pack, in registration order.
 *
 * @returns {Array<{name: string, label: string, description: string, taskCount: number}>}
 *   One summary per pack; `taskCount` is the length of the pack's `tasks` array.
 */
function listPacks() {
  const summaries = [];
  for (const pack of registry.values()) {
    const { name, label, description } = pack;
    summaries.push({ name, label, description, taskCount: pack.tasks.length });
  }
  return summaries;
}
3458
+
2517
3459
  // src/ci.ts
2518
3460
  import { readFileSync, writeFileSync, mkdirSync } from "fs";
2519
3461
  import { dirname } from "path";
@@ -2535,10 +3477,11 @@ var T_CRITICAL_95 = {
2535
3477
  25: 2.06,
2536
3478
  30: 2.042
2537
3479
  };
3480
// Keys of T_CRITICAL_95 as numbers in ascending order, computed once at module load
// so that tCritical reuses the list instead of rebuilding and sorting it on each call.
+ var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
2538
3481
  function tCritical(df) {
2539
3482
  if (df <= 0) return 1.96;
2540
3483
  if (T_CRITICAL_95[df] !== void 0) return T_CRITICAL_95[df];
2541
- const keys = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
3484
+ const keys = T_CRITICAL_KEYS;
2542
3485
  if (df > keys[keys.length - 1]) return 1.96;
2543
3486
  for (let i = 0; i < keys.length - 1; i++) {
2544
3487
  if (df > keys[i] && df < keys[i + 1]) {
@@ -2648,7 +3591,7 @@ function compareResults(baselineStats, currentStats, thresholds, budget, current
2648
3591
  if (regressions.length > 0) {
2649
3592
  for (const r of regressions) {
2650
3593
  failureReasons.push(
2651
- `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta2(r.delta)}`
3594
+ `${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta(r.delta)}`
2652
3595
  );
2653
3596
  }
2654
3597
  }
@@ -2683,10 +3626,6 @@ function detectImprovement(baseline, current, threshold, lowerIsBetter) {
2683
3626
  }
2684
3627
  return current.ci95Lower - baseline.ci95Upper > threshold;
2685
3628
  }
2686
- function formatDelta2(delta) {
2687
- const sign = delta >= 0 ? "+" : "";
2688
- return `${sign}${delta.toFixed(4)}`;
2689
- }
2690
3629
  function loadBaseline(path) {
2691
3630
  try {
2692
3631
  const raw = readFileSync(path, "utf-8");
@@ -2743,18 +3682,20 @@ function detectGitHubContext() {
2743
3682
  return { token, owner, repo, prNumber };
2744
3683
  }
2745
3684
  var API_BASE = "https://api.github.com";
3685
/**
 * Build the HTTP headers for a GitHub REST API request.
 *
 * @param {string} token - Token sent as a Bearer credential.
 * @param {Record<string, string>} [extra] - Additional headers; these are applied last, so they override the defaults.
 * @returns {Record<string, string>} A fresh headers object.
 */
function ghHeaders(token, extra) {
  const defaults = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };
  // Spreading undefined or null is a no-op, so `extra` may be omitted.
  return { ...defaults, ...extra };
}
2746
3693
  async function findExistingComment(ctx, marker) {
2747
3694
  let page = 1;
2748
3695
  const perPage = 50;
2749
3696
  while (true) {
2750
3697
  const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
2751
- const res = await fetch(url, {
2752
- headers: {
2753
- Authorization: `Bearer ${ctx.token}`,
2754
- Accept: "application/vnd.github+json",
2755
- "X-GitHub-Api-Version": "2022-11-28"
2756
- }
2757
- });
3698
+ const res = await fetch(url, { headers: ghHeaders(ctx.token) });
2758
3699
  if (!res.ok) return null;
2759
3700
  const comments = await res.json();
2760
3701
  if (comments.length === 0) break;
@@ -2774,12 +3715,7 @@ async function upsertPrComment(ctx, body, marker) {
2774
3715
  const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/comments/${existingId}`;
2775
3716
  const res = await fetch(url, {
2776
3717
  method: "PATCH",
2777
- headers: {
2778
- Authorization: `Bearer ${ctx.token}`,
2779
- Accept: "application/vnd.github+json",
2780
- "Content-Type": "application/json",
2781
- "X-GitHub-Api-Version": "2022-11-28"
2782
- },
3718
+ headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
2783
3719
  body: JSON.stringify({ body })
2784
3720
  });
2785
3721
  if (!res.ok) {
@@ -2790,12 +3726,7 @@ async function upsertPrComment(ctx, body, marker) {
2790
3726
  const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;
2791
3727
  const res = await fetch(url, {
2792
3728
  method: "POST",
2793
- headers: {
2794
- Authorization: `Bearer ${ctx.token}`,
2795
- Accept: "application/vnd.github+json",
2796
- "Content-Type": "application/json",
2797
- "X-GitHub-Api-Version": "2022-11-28"
2798
- },
3729
+ headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
2799
3730
  body: JSON.stringify({ body })
2800
3731
  });
2801
3732
  if (!res.ok) {
@@ -2813,8 +3744,11 @@ export {
2813
3744
  defineArena,
2814
3745
  detectGitHubContext,
2815
3746
  gemini,
3747
+ htmlReporter,
2816
3748
  jsonReporter,
3749
+ listPacks,
2817
3750
  loadBaseline,
3751
+ loadPack,
2818
3752
  markdownReporter,
2819
3753
  openai,
2820
3754
  openaiCompatible,