agent-duelist 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +251 -133
- package/dist/cli.js +4945 -2351
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1405 -468
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +35 -9
- package/dist/index.d.ts +35 -9
- package/dist/index.js +1402 -468
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1349,33 +1349,42 @@ var correctnessScorer = ({ task, result }) => {
|
|
|
1349
1349
|
if (task.expected === void 0) {
|
|
1350
1350
|
return { name: "correctness", value: 0.5, details: { reason: "no expected value" } };
|
|
1351
1351
|
}
|
|
1352
|
-
const
|
|
1352
|
+
const actual = normalizeOutput(task.expected, result.output);
|
|
1353
|
+
const match = deepEqual(task.expected, actual);
|
|
1353
1354
|
return {
|
|
1354
1355
|
name: "correctness",
|
|
1355
1356
|
value: match ? 1 : 0,
|
|
1356
1357
|
details: { expected: task.expected, actual: result.output }
|
|
1357
1358
|
};
|
|
1358
1359
|
};
|
|
1359
|
-
function
|
|
1360
|
-
if (
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
if (
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1360
|
+
/**
 * Unwraps a model answer that wrapped the expected array inside an object.
 *
 * When `expected` is an array but `actual` is a non-null, non-array object
 * holding exactly one array-valued property, that inner array is returned.
 * In every other case `actual` is returned untouched.
 *
 * @param {unknown} expected - Reference value from the task.
 * @param {unknown} actual - Value produced by the provider.
 * @returns {unknown} The inner array or the original `actual`.
 */
function normalizeOutput(expected, actual) {
  const isWrapperCandidate =
    actual !== null && typeof actual === "object" && !Array.isArray(actual);
  if (!Array.isArray(expected) || !isWrapperCandidate) {
    return actual;
  }
  const arrayValues = Object.values(actual).filter((candidate) => Array.isArray(candidate));
  // Zero or several array properties are ambiguous, so only a unique one is unwrapped.
  return arrayValues.length === 1 ? arrayValues[0] : actual;
}
|
|
1370
|
+
/**
 * Lenient structural comparison used by the correctness scorer.
 *
 * - Strings are compared after trimming and lower-casing.
 * - Arrays must have the same length and pairwise-equal elements.
 * - Objects match when every key of `expected` exists in `actual` with an
 *   equal value; extra keys in `actual` are ignored.
 * - An array never matches a non-array object (and vice versa).
 * - Everything else falls back to strict equality.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Candidate value.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Identical nulls already returned above, so a single null is a mismatch.
  if (expected === null || actual === null) return false;
  const expectedIsArray = Array.isArray(expected);
  // Without this guard `[]` vs `{a: 1}` or `{}` vs `[1, 2]` would slip into
  // the key-subset branch below and be reported as equal.
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((val, i) => deepEqual(val, actual[i]));
  }
  if (typeof expected === "object") {
    return Object.keys(expected).every(
      (key) => key in actual && deepEqual(expected[key], actual[key])
    );
  }
  // Same-typed primitives that failed the strict check above (includes NaN).
  return false;
}
|
|
1380
1389
|
|
|
1381
1390
|
// src/scorers/schema-correctness.ts
|
|
@@ -1395,7 +1404,14 @@ var schemaCorrectnessScorer = ({ task, result }) => {
|
|
|
1395
1404
|
};
|
|
1396
1405
|
}
|
|
1397
1406
|
}
|
|
1398
|
-
|
|
1407
|
+
let parsed = task.schema.safeParse(data);
|
|
1408
|
+
if (!parsed.success && !Array.isArray(data) && typeof data === "object" && data !== null) {
|
|
1409
|
+
const arrayEntries = Object.entries(data).filter(([, v]) => Array.isArray(v));
|
|
1410
|
+
if (arrayEntries.length === 1) {
|
|
1411
|
+
const unwrapped = task.schema.safeParse(arrayEntries[0][1]);
|
|
1412
|
+
if (unwrapped.success) parsed = unwrapped;
|
|
1413
|
+
}
|
|
1414
|
+
}
|
|
1399
1415
|
return {
|
|
1400
1416
|
name: "schema-correctness",
|
|
1401
1417
|
value: parsed.success ? 1 : 0,
|
|
@@ -1410,11 +1426,13 @@ var fuzzySimilarityScorer = ({ task, result }) => {
|
|
|
1410
1426
|
}
|
|
1411
1427
|
const a = stringify(task.expected);
|
|
1412
1428
|
const b = stringify(result.output);
|
|
1413
|
-
const
|
|
1429
|
+
const setA = tokenize(a);
|
|
1430
|
+
const setB = tokenize(b);
|
|
1431
|
+
const similarity = jaccardSimilarity(setA, setB);
|
|
1414
1432
|
return {
|
|
1415
1433
|
name: "fuzzy-similarity",
|
|
1416
1434
|
value: Math.round(similarity * 100) / 100,
|
|
1417
|
-
details: { method: "jaccard", expectedTokens:
|
|
1435
|
+
details: { method: "jaccard", expectedTokens: setA.size, actualTokens: setB.size }
|
|
1418
1436
|
};
|
|
1419
1437
|
};
|
|
1420
1438
|
function stringify(value) {
|
|
@@ -1439,7 +1457,38 @@ import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
|
|
|
1439
1457
|
|
|
1440
1458
|
// src/providers/openai.ts
|
|
1441
1459
|
import OpenAI, { AzureOpenAI } from "openai";
|
|
1460
|
+
import { zodToJsonSchema as zodToJsonSchema2 } from "zod-to-json-schema";
|
|
1461
|
+
|
|
1462
|
+
// src/providers/shared.ts
|
|
1442
1463
|
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
1464
|
+
/**
 * Builds the system prompt that tells a model to answer in JSON.
 *
 * Without a schema a short generic instruction is returned. With a schema,
 * the schema is converted via `zodToJsonSchema` (target "openAi"),
 * pretty-printed and embedded in a stricter instruction block.
 *
 * @param {unknown} schema - Schema passed to `zodToJsonSchema`, or a falsy value.
 * @returns {string} System message text.
 */
function buildSchemaSystemMessage(schema) {
  if (schema) {
    const serializedSchema = JSON.stringify(
      zodToJsonSchema(schema, { target: "openAi" }),
      null,
      2
    );
    const promptLines = [
      "Respond with ONLY valid JSON data. No markdown, no code fences, no explanation.",
      "",
      "Your output must conform to this JSON Schema:",
      serializedSchema,
      "",
      "IMPORTANT: Output the actual data values, NOT the schema definition itself.",
      'Do NOT include keys like "type", "$schema", or "items" from the schema definition in your response.'
    ];
    return promptLines.join("\n");
  }
  return "Respond with valid JSON.";
}
|
|
1477
|
+
/**
 * Turns raw model text into structured output when a schema was requested.
 *
 * @param {string} rawContent - Text returned by the model.
 * @param {boolean} hasSchema - Whether the request carried a schema.
 * @returns {unknown} The parsed JSON value, or `rawContent` unchanged when no
 *   schema was requested or the (fence-stripped) text is not valid JSON.
 */
function parseSchemaOutput(rawContent, hasSchema) {
  if (!hasSchema) {
    return rawContent;
  }
  // Fence stripping stays outside the try so its own errors are not swallowed.
  const candidate = stripCodeFences(rawContent);
  let output = rawContent;
  try {
    output = JSON.parse(candidate);
  } catch {
    // Not JSON: hand back the untouched model text.
  }
  return output;
}
|
|
1486
|
+
/**
 * Extracts the body of the first Markdown code fence found in `content`.
 *
 * The optional `json` language tag is matched case-insensitively, so fences
 * opened with "```JSON" or "```Json" are unwrapped as well; previously the
 * tag leaked into the extracted payload and broke JSON parsing.
 *
 * @param {string} content - Raw model text.
 * @returns {string} The fenced body, or `content` unchanged when no fence is found.
 */
function stripCodeFences(content) {
  // `m` lets ^/$ anchor on the fence lines even when prose surrounds them.
  const match = content.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?\s*```$/im);
  return match ? match[1] : content;
}
|
|
1490
|
+
|
|
1491
|
+
// src/providers/openai.ts
|
|
1443
1492
|
var REQUEST_TIMEOUT_MS = 6e4;
|
|
1444
1493
|
function openai(model, options) {
|
|
1445
1494
|
const client = new OpenAI({
|
|
@@ -1486,7 +1535,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1486
1535
|
if (input.schema) {
|
|
1487
1536
|
params.response_format = { type: "json_object" };
|
|
1488
1537
|
params.messages = [
|
|
1489
|
-
{ role: "system", content:
|
|
1538
|
+
{ role: "system", content: buildSchemaSystemMessage(input.schema) },
|
|
1490
1539
|
...params.messages
|
|
1491
1540
|
];
|
|
1492
1541
|
}
|
|
@@ -1494,7 +1543,9 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1494
1543
|
params.tools = input.tools.map(toolDefToOpenAI);
|
|
1495
1544
|
params.tool_choice = "auto";
|
|
1496
1545
|
}
|
|
1497
|
-
const
|
|
1546
|
+
const reqOpts = { signal: input.signal };
|
|
1547
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
1548
|
+
const response = await client.chat.completions.create(params, reqOpts);
|
|
1498
1549
|
let totalPromptTokens = response.usage?.prompt_tokens ?? 0;
|
|
1499
1550
|
let totalCompletionTokens = response.usage?.completion_tokens ?? 0;
|
|
1500
1551
|
const choice = response.choices[0];
|
|
@@ -1528,7 +1579,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1528
1579
|
const followUp = await client.chat.completions.create({
|
|
1529
1580
|
model: requestModel,
|
|
1530
1581
|
messages: toolMessages
|
|
1531
|
-
},
|
|
1582
|
+
}, reqOpts);
|
|
1532
1583
|
totalPromptTokens += followUp.usage?.prompt_tokens ?? 0;
|
|
1533
1584
|
totalCompletionTokens += followUp.usage?.completion_tokens ?? 0;
|
|
1534
1585
|
finalResponse = followUp;
|
|
@@ -1539,13 +1590,7 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1539
1590
|
if (stripThinking) {
|
|
1540
1591
|
rawContent = rawContent.replace(/<think>[\s\S]*?<\/think>\s*/, "");
|
|
1541
1592
|
}
|
|
1542
|
-
|
|
1543
|
-
if (input.schema) {
|
|
1544
|
-
try {
|
|
1545
|
-
output = JSON.parse(rawContent);
|
|
1546
|
-
} catch {
|
|
1547
|
-
}
|
|
1548
|
-
}
|
|
1593
|
+
const output = parseSchemaOutput(rawContent, !!input.schema);
|
|
1549
1594
|
return {
|
|
1550
1595
|
output,
|
|
1551
1596
|
usage: {
|
|
@@ -1559,13 +1604,27 @@ function makeProvider(id, name, model, client, requestModel, stripThinking) {
|
|
|
1559
1604
|
}
|
|
1560
1605
|
};
|
|
1561
1606
|
}
|
|
1607
|
+
/**
 * Creates a provider for a Google model, talking to Google's
 * OpenAI-compatible endpoint through the `OpenAI` client.
 *
 * @param {string} model - Model name; also used as the request model.
 * @param {{ apiKey?: string, timeoutMs?: number }} [options] - Optional
 *   overrides; the key falls back to the GOOGLE_API_KEY environment variable
 *   and the timeout to REQUEST_TIMEOUT_MS.
 * @returns The provider built by `makeProvider`, with id `google/<model>`.
 * @throws {Error} When no API key is available.
 */
function gemini(model, options) {
  const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
  if (!apiKey) {
    throw new Error(
      `Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
    );
  }
  const clientConfig = {
    apiKey,
    baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
    timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
  };
  const client = new OpenAI(clientConfig);
  return makeProvider(`google/${model}`, "Google AI", model, client, model);
}
|
|
1562
1621
|
/**
 * Converts an internal tool definition into the OpenAI "function" tool shape.
 *
 * @param {{ name: string, description: string, parameters: unknown }} tool -
 *   Tool definition; `parameters` is converted with `zodToJsonSchema2`
 *   (target "openAi").
 * @returns {{ type: "function", function: { name: string, description: string, parameters: unknown } }}
 */
function toolDefToOpenAI(tool) {
  const { name, description, parameters } = tool;
  const functionSpec = {
    name,
    description,
    parameters: zodToJsonSchema2(parameters, { target: "openAi" })
  };
  return { type: "function", function: functionSpec };
}
|
|
@@ -1614,8 +1673,14 @@ function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1614
1673
|
if (!apiKey) return void 0;
|
|
1615
1674
|
return { client: new OpenAI2({ apiKey, timeout: timeoutMs }), model };
|
|
1616
1675
|
}
|
|
1676
|
+
/**
 * Heuristically detects an API error complaining about the `temperature`
 * parameter.
 *
 * The check is a case-insensitive substring match: the message must mention
 * "temperature" together with one of the rejection phrases below.
 *
 * @param {unknown} err - Caught error or any other thrown value.
 * @returns {boolean} True when the message looks like a temperature rejection.
 */
function isTemperatureError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  if (!text.includes("temperature")) {
    return false;
  }
  const rejectionPhrases = ["not supported", "is not allowed", "unsupported", "invalid"];
  return rejectionPhrases.some((phrase) => text.includes(phrase));
}
|
|
1617
1681
|
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
1618
1682
|
let cached = void 0;
|
|
1683
|
+
let useTemperature = true;
|
|
1619
1684
|
return async ({ task, result }) => {
|
|
1620
1685
|
if (task.expected === void 0) {
|
|
1621
1686
|
return { name: "llm-judge-correctness", value: -1, details: { reason: "no expected value" } };
|
|
@@ -1632,36 +1697,24 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1632
1697
|
}
|
|
1633
1698
|
const { client, model } = cached;
|
|
1634
1699
|
const prompt = JUDGE_PROMPT.replace("{task}", task.prompt).replace("{expected}", JSON.stringify(task.expected)).replace("{actual}", JSON.stringify(result.output));
|
|
1700
|
+
const messages = [{ role: "user", content: prompt }];
|
|
1635
1701
|
try {
|
|
1636
|
-
const response = await client
|
|
1637
|
-
|
|
1638
|
-
messages: [{ role: "user", content: prompt }],
|
|
1639
|
-
temperature: 0,
|
|
1640
|
-
max_tokens: 2048
|
|
1641
|
-
});
|
|
1642
|
-
const content = response.choices[0]?.message?.content?.trim() ?? "";
|
|
1643
|
-
const parsed = {};
|
|
1644
|
-
for (const line of content.split("\n")) {
|
|
1645
|
-
const match = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
|
|
1646
|
-
if (match) parsed[match[1].toLowerCase()] = parseFloat(match[2]);
|
|
1647
|
-
}
|
|
1648
|
-
const accuracy = parsed.accuracy;
|
|
1649
|
-
const completeness = parsed.completeness;
|
|
1650
|
-
const conciseness = parsed.conciseness;
|
|
1651
|
-
if (accuracy == null || completeness == null || conciseness == null || [accuracy, completeness, conciseness].some((s) => isNaN(s) || s < 0 || s > 1)) {
|
|
1652
|
-
return {
|
|
1653
|
-
name: "llm-judge-correctness",
|
|
1654
|
-
value: -1,
|
|
1655
|
-
details: { reason: `judge returned unparseable scores: "${content}"`, model }
|
|
1656
|
-
};
|
|
1657
|
-
}
|
|
1658
|
-
const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
|
|
1659
|
-
return {
|
|
1660
|
-
name: "llm-judge-correctness",
|
|
1661
|
-
value: composite,
|
|
1662
|
-
details: { model, accuracy, completeness, conciseness }
|
|
1663
|
-
};
|
|
1702
|
+
const response = await callJudge(client, model, messages, useTemperature);
|
|
1703
|
+
return parseJudgeResponse(response, model);
|
|
1664
1704
|
} catch (err) {
|
|
1705
|
+
if (useTemperature && isTemperatureError(err)) {
|
|
1706
|
+
useTemperature = false;
|
|
1707
|
+
try {
|
|
1708
|
+
const response = await callJudge(client, model, messages, false);
|
|
1709
|
+
return parseJudgeResponse(response, model);
|
|
1710
|
+
} catch (retryErr) {
|
|
1711
|
+
return {
|
|
1712
|
+
name: "llm-judge-correctness",
|
|
1713
|
+
value: -1,
|
|
1714
|
+
details: { reason: `judge call failed: ${retryErr instanceof Error ? retryErr.message : String(retryErr)}` }
|
|
1715
|
+
};
|
|
1716
|
+
}
|
|
1717
|
+
}
|
|
1665
1718
|
return {
|
|
1666
1719
|
name: "llm-judge-correctness",
|
|
1667
1720
|
value: -1,
|
|
@@ -1670,6 +1723,38 @@ function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
|
|
|
1670
1723
|
}
|
|
1671
1724
|
};
|
|
1672
1725
|
}
|
|
1726
|
+
/**
 * Sends the judge prompt to the chat-completions API.
 *
 * @param client - Client exposing `chat.completions.create`.
 * @param {string} model - Judge model name.
 * @param {Array<object>} messages - Chat messages to send.
 * @param {boolean} withTemperature - When true, `temperature: 0` is added to
 *   the request; when false the parameter is omitted entirely.
 * @returns Whatever `client.chat.completions.create` resolves to.
 */
async function callJudge(client, model, messages, withTemperature) {
  const request = { model, messages, max_completion_tokens: 2048 };
  if (withTemperature) {
    request.temperature = 0;
  }
  return client.chat.completions.create(request);
}
|
|
1734
|
+
/**
 * Converts the judge model's reply into an "llm-judge-correctness" score.
 *
 * The reply is scanned line by line for `accuracy:`, `completeness:` and
 * `conciseness:` entries (case-insensitive, at the start of a line). All
 * three must be present and lie within [0, 1]; otherwise the score value is
 * -1 and the raw reply is echoed in `details.reason`.
 *
 * @param response - Chat-completion response; only `choices[0].message.content` is read.
 * @param {string} model - Judge model name, copied into `details`.
 * @returns {{ name: string, value: number, details: object }} Score whose
 *   value is the mean of the three sub-scores rounded to two decimals, or -1.
 */
function parseJudgeResponse(response, model) {
  const content = response.choices[0]?.message?.content?.trim() ?? "";
  const found = {};
  content.split("\n").forEach((line) => {
    const hit = line.match(/^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i);
    if (hit) {
      found[hit[1].toLowerCase()] = parseFloat(hit[2]);
    }
  });
  const { accuracy, completeness, conciseness } = found;
  const subScores = [accuracy, completeness, conciseness];
  const allUsable = subScores.every((s) => s != null && !isNaN(s) && s >= 0 && s <= 1);
  if (!allUsable) {
    return {
      name: "llm-judge-correctness",
      value: -1,
      details: { reason: `judge returned unparseable scores: "${content}"`, model }
    };
  }
  const mean = (accuracy + completeness + conciseness) / 3;
  return {
    name: "llm-judge-correctness",
    value: Math.round(mean * 100) / 100,
    details: { model, accuracy, completeness, conciseness }
  };
}
|
|
1673
1758
|
|
|
1674
1759
|
// src/scorers/tool-usage.ts
|
|
1675
1760
|
var toolUsageScorer = ({ task, result }) => {
|
|
@@ -1733,118 +1818,174 @@ async function runBenchmarks(options) {
|
|
|
1733
1818
|
const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;
|
|
1734
1819
|
const results = [];
|
|
1735
1820
|
for (const task of tasks) {
|
|
1736
|
-
for (
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1821
|
+
for (let run = 1; run <= runs; run++) {
|
|
1822
|
+
const runResults = await Promise.all(
|
|
1823
|
+
providers.map(async (provider) => {
|
|
1824
|
+
let result;
|
|
1825
|
+
try {
|
|
1826
|
+
const taskResult = await withTimeout((signal) => provider.run({
|
|
1827
|
+
prompt: task.prompt,
|
|
1828
|
+
schema: task.schema,
|
|
1829
|
+
tools: task.tools,
|
|
1830
|
+
signal,
|
|
1831
|
+
timeout
|
|
1832
|
+
}), timeout);
|
|
1833
|
+
const scores = await Promise.all(
|
|
1834
|
+
scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id))
|
|
1835
|
+
);
|
|
1836
|
+
result = {
|
|
1837
|
+
providerId: provider.id,
|
|
1838
|
+
taskName: task.name,
|
|
1839
|
+
run,
|
|
1840
|
+
scores,
|
|
1841
|
+
raw: {
|
|
1842
|
+
output: taskResult.output,
|
|
1843
|
+
latencyMs: taskResult.latencyMs,
|
|
1844
|
+
usage: taskResult.usage,
|
|
1845
|
+
toolCalls: taskResult.toolCalls
|
|
1846
|
+
}
|
|
1847
|
+
};
|
|
1848
|
+
} catch (err) {
|
|
1849
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
1850
|
+
result = {
|
|
1851
|
+
providerId: provider.id,
|
|
1852
|
+
taskName: task.name,
|
|
1853
|
+
run,
|
|
1854
|
+
scores: [],
|
|
1855
|
+
error: message,
|
|
1856
|
+
raw: { output: "", latencyMs: 0 }
|
|
1857
|
+
};
|
|
1858
|
+
}
|
|
1859
|
+
onResult?.(result);
|
|
1860
|
+
return result;
|
|
1861
|
+
})
|
|
1862
|
+
);
|
|
1863
|
+
results.push(...runResults);
|
|
1775
1864
|
}
|
|
1776
1865
|
}
|
|
1777
1866
|
return results;
|
|
1778
1867
|
}
|
|
1779
1868
|
|
|
1780
|
-
// src/
|
|
1781
|
-
var
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
1786
|
-
|
|
1787
|
-
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
return `${boldCode}${s}${reset}`;
|
|
1792
|
-
}
|
|
1793
|
-
function dim(s) {
|
|
1794
|
-
return `${dimCode}${s}${reset}`;
|
|
1869
|
+
// src/utils/format.ts
|
|
1870
|
+
// Upper bound accepted by Number.prototype.toFixed.
var MAX_FRACTION_DIGITS = 100;
/**
 * Formats an estimated USD cost for display.
 *
 * - `undefined`/`null` (cost unknown) renders as an em dash.
 * - Exactly zero renders as "$0.00".
 * - Amounts of one cent or more render as "~$X.XX".
 * - Smaller amounts keep enough fraction digits to show the leading
 *   significant digits, with trailing zeros removed.
 *
 * @param {number | null | undefined} usd - Cost in US dollars.
 * @returns {string} Human-readable cost.
 */
function formatCost(usd) {
  // `== null` also covers null, which previously crashed on `.toFixed`.
  if (usd == null) return "\u2014";
  if (usd === 0) return "$0.00";
  if (usd >= 0.01) return `~$${usd.toFixed(2)}`;
  const digits = Math.min(
    MAX_FRACTION_DIGITS,
    Math.max(4, -Math.floor(Math.log10(Math.abs(usd))) + 1)
  );
  // Drop trailing zeros, and the decimal point too when nothing follows it,
  // so values never render with a dangling "." (e.g. "~$0." on underflow).
  const trimmed = usd.toFixed(digits).replace(/\.?0+$/, "");
  return `~$${trimmed}`;
}
|
|
1796
|
-
function
|
|
1797
|
-
|
|
1881
|
+
/**
 * Formats a numeric difference with an explicit sign.
 *
 * @param {number} delta - Difference to render.
 * @param {number} [precision=4] - Number of fraction digits.
 * @returns {string} Fixed-point text, prefixed with "+" when `delta >= 0`
 *   (negative values carry their own "-").
 */
function formatDelta(delta, precision = 4) {
  const magnitude = delta.toFixed(precision);
  return delta >= 0 ? `+${magnitude}` : `${magnitude}`;
}
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1885
|
+
|
|
1886
|
+
// src/reporter/shared.ts
|
|
1887
|
+
/**
 * Indexes a flat list of benchmark results for reporting.
 *
 * @param {Array<object>} results - Result records; `taskName`, `providerId`,
 *   `run`, `scores` and `error` are read from each.
 * @returns {{
 *   tasks: string[], providers: string[], scorerNames: string[],
 *   grouped: Map<string, object[]>, byProvider: Map<string, object[]>,
 *   hasErrors: boolean, maxRun: number
 * }} Distinct names in first-seen order, results bucketed by
 *   "task::provider" and by provider, whether any result carries an error,
 *   and the highest run number seen.
 */
function groupResults(results) {
  const taskNames = /* @__PURE__ */ new Set();
  const providerIds = /* @__PURE__ */ new Set();
  const scorerNames = /* @__PURE__ */ new Set();
  const grouped = /* @__PURE__ */ new Map();
  const byProvider = /* @__PURE__ */ new Map();
  // Appends to the bucket stored under `key`, creating the bucket on first use.
  const appendTo = (index, key, entry) => {
    const bucket = index.get(key);
    if (bucket) {
      bucket.push(entry);
    } else {
      index.set(key, [entry]);
    }
  };
  let hasErrors = false;
  let maxRun = 0;
  for (const record of results) {
    const { taskName, providerId } = record;
    taskNames.add(taskName);
    providerIds.add(providerId);
    for (const score of record.scores) {
      scorerNames.add(score.name);
    }
    hasErrors = hasErrors || Boolean(record.error);
    if (record.run > maxRun) {
      maxRun = record.run;
    }
    appendTo(grouped, `${taskName}::${providerId}`, record);
    appendTo(byProvider, providerId, record);
  }
  return {
    tasks: Array.from(taskNames),
    providers: Array.from(providerIds),
    scorerNames: Array.from(scorerNames),
    grouped,
    byProvider,
    hasErrors,
    maxRun
  };
}
|
|
1823
|
-
function
|
|
1824
|
-
const
|
|
1825
|
-
|
|
1826
|
-
|
|
1925
|
+
/**
 * Summarizes all runs of one provider on one task.
 *
 * @param {string} providerId - Provider to summarize.
 * @param {Map<string, object[]>} grouped - Results keyed by "task::provider".
 * @param {string} task - Task name.
 * @returns {object} Averages over the successful runs (`avgScores`,
 *   `avgDetails`, `latencyMs`) plus `errorCount`. When no run succeeded the
 *   averages are empty/undefined and `allErrors` tells whether any failed
 *   run existed at all.
 */
function aggregateProviderTask(providerId, grouped, task) {
  const failed = [];
  const succeeded = [];
  for (const entry of grouped.get(`${task}::${providerId}`) ?? []) {
    (entry.error ? failed : succeeded).push(entry);
  }
  const errorCount = failed.length;
  if (succeeded.length === 0) {
    return {
      providerId,
      avgScores: {},
      avgDetails: { costUsd: void 0, totalTokens: void 0 },
      latencyMs: void 0,
      allErrors: errorCount > 0,
      errorCount
    };
  }
  return {
    providerId,
    avgScores: averageScores(succeeded),
    avgDetails: averageDetails(succeeded),
    latencyMs: average(succeeded.map((entry) => entry.raw.latencyMs)),
    allErrors: false,
    errorCount
  };
}
|
|
1948
|
+
/**
 * Averages score values per scorer name across several results.
 *
 * Scores with a negative value are left out of both the sum and the count;
 * scorers with no remaining score do not appear in the output.
 *
 * @param {Array<{ scores: Array<{ name: string, value: number }> }>} results
 * @returns {Record<string, number>} Mean value per scorer name.
 */
function averageScores(results) {
  const sums = {};
  const counts = {};
  results
    .flatMap((result) => result.scores)
    .filter((score) => !(score.value < 0))
    .forEach(({ name, value }) => {
      sums[name] = (sums[name] ?? 0) + value;
      counts[name] = (counts[name] ?? 0) + 1;
    });
  return Object.fromEntries(
    Object.keys(sums).map((name) => [name, sums[name] / counts[name]])
  );
}
|
|
1837
|
-
function
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1964
|
+
/**
 * Averages the cost and token figures reported by the "cost" scorer.
 *
 * For each result the first score named "cost" is inspected; its
 * `details.estimatedUsd` and `details.totalTokens` are collected
 * independently whenever they are neither null nor undefined.
 *
 * @param {Array<{ scores: Array<{ name: string, details?: object }> }>} results
 * @returns {{ costUsd: number | undefined, totalTokens: number | undefined }}
 *   Mean cost and rounded mean token count, each `undefined` when no result
 *   supplied that figure.
 */
function averageDetails(results) {
  const costs = [];
  const tokenCounts = [];
  for (const { scores } of results) {
    const details = scores.find((score) => score.name === "cost")?.details;
    if (details?.estimatedUsd != null) {
      costs.push(details.estimatedUsd);
    }
    if (details?.totalTokens != null) {
      tokenCounts.push(details.totalTokens);
    }
  }
  const total = (values) => values.reduce((acc, value) => acc + value, 0);
  return {
    costUsd: costs.length > 0 ? total(costs) / costs.length : void 0,
    totalTokens:
      tokenCounts.length > 0 ? Math.round(total(tokenCounts) / tokenCounts.length) : void 0
  };
}
|
|
1843
|
-
function
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
const padding = Math.max(0, totalInner - dw - 1);
|
|
1847
|
-
return dim("\u2502") + " " + content + " ".repeat(padding) + dim("\u2502");
|
|
1986
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} nums - Values to average.
 * @returns {number | undefined} The mean, or `undefined` for an empty list.
 */
function average(nums) {
  const count = nums.length;
  if (count === 0) {
    return void 0;
  }
  let total = 0;
  nums.forEach((value) => {
    total += value;
  });
  return total / count;
}
|
|
1849
1990
|
function computeColumnStats(providerData, scorerNames) {
|
|
1850
1991
|
const stats = /* @__PURE__ */ new Map();
|
|
@@ -1896,62 +2037,274 @@ function computeColumnStats(providerData, scorerNames) {
|
|
|
1896
2037
|
}
|
|
1897
2038
|
return stats;
|
|
1898
2039
|
}
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
2040
|
+
// Scorer names counted as quality columns; every other column counts as efficiency.
var QUALITY_SCORERS = /* @__PURE__ */ new Set([
  "correctness",
  "schema-correctness",
  "fuzzy-similarity",
  "llm-judge-correctness",
  "tool-usage"
]);
/**
 * Decides whether a provider is eligible for a medal.
 *
 * A provider passes when it has a value greater than zero in at least one
 * quality column. When the stats contain no quality column at all, every
 * provider passes.
 *
 * @param {string} providerId - Provider to check.
 * @param {Map<string, { values: Map<string, number | undefined> }>} columnStats
 *   Per-column statistics keyed by scorer name.
 * @returns {boolean} Whether the provider clears the quality gate.
 */
function passesQualityGate(providerId, columnStats) {
  let sawQualityColumn = false;
  for (const [column, stats] of columnStats) {
    if (!QUALITY_SCORERS.has(column)) {
      continue;
    }
    sawQualityColumn = true;
    const score = stats?.values.get(providerId);
    if (score !== void 0 && score > 0) {
      return true;
    }
  }
  return !sawQualityColumn;
}
|
|
1908
2055
|
/**
 * Assigns "gold" / "silver" / "bronze" / "none" to each provider.
 *
 * A provider wins a column when it alone holds that column's best value.
 * Wins are tallied separately for quality columns (QUALITY_SCORERS) and all
 * other (efficiency) columns. Providers that pass the quality gate are
 * ordered by quality wins, then efficiency wins, then id; providers with
 * identical win counts share a rank. Everyone gets "none" when fewer than
 * two providers are compared or nobody won any column, and providers failing
 * the quality gate always get "none".
 *
 * @param {Map<string, { best: number | undefined, values: Map<string, number | undefined> }>} columnStats
 * @param {string[]} providerIds - Providers being compared.
 * @returns {Map<string, "gold" | "silver" | "bronze" | "none">} Medal per provider.
 */
function computeMedals(columnStats, providerIds) {
  const medals = /* @__PURE__ */ new Map();
  const awardNoneToAll = () => {
    for (const id of providerIds) medals.set(id, "none");
    return medals;
  };
  if (providerIds.length < 2) {
    return awardNoneToAll();
  }
  const eligible = new Set(providerIds.filter((id) => passesQualityGate(id, columnStats)));
  const qualityWins = /* @__PURE__ */ new Map();
  const efficiencyWins = /* @__PURE__ */ new Map();
  for (const id of providerIds) {
    qualityWins.set(id, 0);
    efficiencyWins.set(id, 0);
  }
  const winsOf = (tally, id) => tally.get(id) ?? 0;
  let totalWins = 0;
  for (const [colName, colStats] of columnStats) {
    const { best } = colStats;
    if (best === void 0) continue;
    const holders = [...colStats.values.entries()].filter(
      ([, value]) => value !== void 0 && value === best
    );
    // A shared best value is a tie and counts as a win for nobody.
    if (holders.length !== 1) continue;
    const winnerId = holders[0][0];
    const tally = QUALITY_SCORERS.has(colName) ? qualityWins : efficiencyWins;
    tally.set(winnerId, winsOf(tally, winnerId) + 1);
    totalWins += 1;
  }
  if (totalWins === 0) {
    return awardNoneToAll();
  }
  const ranking = providerIds
    .filter((id) => eligible.has(id))
    .sort((a, b) => {
      const byQuality = winsOf(qualityWins, b) - winsOf(qualityWins, a);
      if (byQuality !== 0) return byQuality;
      const byEfficiency = winsOf(efficiencyWins, b) - winsOf(efficiencyWins, a);
      if (byEfficiency !== 0) return byEfficiency;
      return a.localeCompare(b);
    });
  // True when `current` has strictly fewer wins than `previous`
  // (quality first, efficiency as the tie-breaker).
  const fallsBehind = (previous, current) => {
    const qualityGap = winsOf(qualityWins, current) - winsOf(qualityWins, previous);
    if (qualityGap !== 0) return qualityGap < 0;
    return winsOf(efficiencyWins, current) < winsOf(efficiencyWins, previous);
  };
  const medalList = ["gold", "silver", "bronze"];
  let rank = 0;
  ranking.forEach((id, position) => {
    // Tied providers keep the previous rank; otherwise the rank jumps to the position.
    if (position > 0 && fallsBehind(ranking[position - 1], id)) {
      rank = position;
    }
    medals.set(id, rank < medalList.length ? medalList[rank] : "none");
  });
  for (const id of providerIds) {
    if (!eligible.has(id)) medals.set(id, "none");
  }
  return medals;
}
|
|
2113
|
+
/**
 * Returns a parenthesized vendor label for a provider id.
 *
 * The vendor is the part of the id before the first "/". Known vendors get a
 * curated display name; any other prefix is echoed back in parentheses.
 *
 * @param {string} providerId - Id such as "openai/gpt-4o".
 * @returns {string} Label such as "(OpenAI)".
 */
function providerLabel(providerId) {
  const knownVendors = new Map([
    ["azure", "(OpenAI via Azure)"],
    ["openai", "(OpenAI)"],
    ["anthropic", "(Anthropic)"],
    ["google", "(Google)"],
    ["mistral", "(Mistral)"],
    ["meta", "(Meta)"],
    ["deepseek", "(DeepSeek)"],
    ["cohere", "(Cohere)"],
    ["qwen", "(Qwen)"],
    ["xai", "(xAI)"],
    ["minimax", "(MiniMax)"],
    ["moonshot", "(Moonshot / Kimi)"],
    ["perplexity", "(Perplexity)"],
    ["amazon", "(Amazon)"],
    ["nvidia", "(NVIDIA)"],
    ["microsoft", "(Microsoft)"],
    ["ai21", "(AI21 Labs)"],
    ["bytedance", "(ByteDance)"],
    ["together", "(Together AI)"],
    ["fireworks", "(Fireworks AI)"],
    ["groq", "(Groq)"],
    ["cerebras", "(Cerebras)"]
  ]);
  const [prefix] = providerId.split("/");
  return knownVendors.get(prefix) ?? `(${prefix})`;
}
|
|
2164
|
+
/**
 * Suggests how to fix an authentication failure for a provider.
 *
 * @param {string} providerId - Id such as "openai/gpt-4o"; the part before
 *   the first "/" selects the hint.
 * @param {string} error - Error message to inspect (case-insensitive
 *   substring match against common auth-failure markers).
 * @returns {string | undefined} A hint naming the environment variable(s) to
 *   set, a generic hint for other vendors, or `undefined` when the message
 *   does not look like an auth error.
 */
function apiKeyHint(providerId, error) {
  const message = error.toLowerCase();
  const authMarkers = [
    "api key",
    "401",
    "unauthorized",
    "authentication",
    "incorrect api key",
    "apikey"
  ];
  if (!authMarkers.some((marker) => message.includes(marker))) {
    return void 0;
  }
  const hintsByVendor = new Map([
    ["openai", "Set: export OPENAI_API_KEY=sk-..."],
    ["azure", "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=..."],
    ["anthropic", "Set: export ANTHROPIC_API_KEY=sk-ant-..."],
    ["google", "Set: export GOOGLE_API_KEY=..."]
  ]);
  const [prefix] = providerId.split("/");
  return hintsByVendor.get(prefix) ?? `Check the API key for ${providerId}`;
}
|
|
2182
|
+
/**
 * Finds the provider with the highest average for one scorer.
 *
 * Only scores named `scorerName` with a value of at least zero are averaged;
 * providers without any such score are ignored. On equal averages the
 * provider listed first in `providers` wins.
 *
 * @param {Map<string, Array<{ scores: Array<{ name: string, value: number }> }>>} successByProvider
 *   Successful runs keyed by provider id.
 * @param {string[]} providers - Provider ids to consider, in display order.
 * @param {string} scorerName - Scorer to rank by.
 * @returns {{ id: string, avg: number } | undefined} The leader, or
 *   `undefined` when no provider has a usable score.
 */
function rankProviders(successByProvider, providers, scorerName) {
  let leader = void 0;
  for (const id of providers) {
    const values = [];
    for (const run of successByProvider.get(id) ?? []) {
      for (const score of run.scores) {
        if (score.name === scorerName && score.value >= 0) {
          values.push(score.value);
        }
      }
    }
    if (values.length === 0) continue;
    const avg = values.reduce((a, b) => a + b, 0) / values.length;
    // Strict comparison keeps the earlier provider on ties.
    if (leader === void 0 || avg > leader.avg) {
      leader = { id, avg };
    }
  }
  return leader;
}
|
|
2191
|
+
/**
 * Maps a scorer name to its short column heading.
 *
 * @param {string} name - Scorer name, e.g. "schema-correctness".
 * @returns {string} The short label for built-in scorers, otherwise `name` itself.
 */
function scorerLabel(name) {
  const shortLabels = new Map([
    ["correctness", "Match"],
    ["schema-correctness", "Schema"],
    ["fuzzy-similarity", "Fuzzy"],
    ["llm-judge-correctness", "Judge"],
    ["tool-usage", "Tool"]
  ]);
  return shortLabels.has(name) ? shortLabels.get(name) : name;
}
|
|
2207
|
+
/**
 * Returns the emoji for a medal tier.
 *
 * @param {"gold" | "silver" | "bronze" | "none"} medal - Medal tier.
 * @returns {string | undefined} The medal emoji, an empty string for "none",
 *   or `undefined` for any other input.
 */
function medalEmoji(medal) {
  const emojiByMedal = new Map([
    ["gold", "\u{1F947}"],
    ["silver", "\u{1F948}"],
    ["bronze", "\u{1F949}"],
    ["none", ""]
  ]);
  return emojiByMedal.get(medal);
}
|
|
2219
|
+
|
|
2220
|
+
// src/reporter/console.ts
|
|
2221
|
+
// ANSI SGR escape sequences (ESC [ <code> m) used to style console output.
var reset = "\x1B[0m"; // code 0: appended after styled text by bold()/dim() and the colour helpers
|
|
2222
|
+
var boldCode = "\x1B[1m"; // code 1
|
|
2223
|
+
var dimCode = "\x1B[2m"; // code 2
|
|
2224
|
+
var green = "\x1B[32m"; // foreground colour codes 31-36
|
|
2225
|
+
var red = "\x1B[31m";
|
|
2226
|
+
var yellow = "\x1B[33m";
|
|
2227
|
+
var cyan = "\x1B[36m";
|
|
2228
|
+
var brightGreen = "\x1B[92m"; // codes 92/97: names indicate bright foreground variants
|
|
2229
|
+
var brightWhite = "\x1B[97m";
|
|
2230
|
+
/**
 * Wraps a value in the bold escape sequence followed by a reset.
 *
 * @param {*} s - Value to render; it is converted to a string.
 * @returns {string} `boldCode + s + reset`.
 */
function bold(s) {
  // String.prototype.concat stringifies its arguments the same way a template literal does.
  return boldCode.concat(s, reset);
}
|
|
2233
|
+
/**
 * Wraps a value in the dim escape sequence followed by a reset.
 *
 * @param {*} s - Value to render; it is converted to a string.
 * @returns {string} `dimCode + s + reset`.
 */
function dim(s) {
  // String.prototype.concat stringifies its arguments the same way a template literal does.
  return dimCode.concat(s, reset);
}
|
|
2236
|
+
/**
 * Removes ANSI SGR sequences (ESC [ ... m) from a string.
 *
 * @param {string} s - Text that may contain colour/style escapes.
 * @returns {string} The text with every such sequence removed.
 */
function stripAnsi(s) {
  // Matches ESC, "[", any run of digits/semicolons, then the final "m".
  const sgrSequence = /\x1b\[[0-9;]*m/g;
  const plain = s.replace(sgrSequence, "");
  return plain;
}
|
|
2239
|
+
/**
 * Estimates how many terminal columns a string occupies.
 *
 * ANSI SGR sequences are ignored. Code points at or above U+1F000, and those
 * in U+2600..U+27BF, count as two columns; everything else counts as one.
 *
 * @param {string} s - Text, possibly containing ANSI escapes.
 * @returns {number} Estimated column count.
 */
function displayWidth(s) {
  const WIDE_PLANE_START = 0x1f000; // 126976
  const SYMBOLS_START = 0x2600; // 9728
  const SYMBOLS_END = 0x27bf; // 10175
  // Spreading a string iterates by code point, so surrogate pairs stay together.
  return [...stripAnsi(s)].reduce((columns, ch) => {
    const code = ch.codePointAt(0) ?? 0;
    const isWide = code >= WIDE_PLANE_START || (code >= SYMBOLS_START && code <= SYMBOLS_END);
    return columns + (isWide ? 2 : 1);
  }, 0);
}
|
|
2250
|
+
/**
 * Pads a cell with spaces up to a target display width.
 *
 * @param {string} str - Cell content (may contain ANSI escapes).
 * @param {number} targetWidth - Desired width in terminal columns.
 * @param {string} align - "right" pads on the left; anything else pads on the right.
 * @returns {string} The padded cell; content wider than the target is returned unpadded.
 */
function padCell(str, targetWidth, align) {
  // Padding is measured in display columns, not string length.
  const missing = Math.max(0, targetWidth - displayWidth(str));
  const gap = " ".repeat(missing);
  return align === "right" ? gap + str : str + gap;
}
|
|
2256
|
+
/**
 * Builds the two halves of a fixed-width bar: a filled part and a track.
 *
 * @param {number} ratio - Fill fraction; values outside [0, 1] are clamped.
 *   A non-numeric ratio (NaN / undefined) is treated as 0 so the bar keeps
 *   its full width instead of collapsing to an empty string.
 * @param {number} [width=8] - Total bar width in characters.
 * @returns {{fill: string, track: string}} Filled and empty segments whose
 *   combined length is `width`.
 */
function sparkBar(ratio, width = 8) {
  const bounded = Math.max(0, Math.min(1, ratio));
  // Math.min/Math.max propagate NaN; without this guard both repeat() counts
  // would be NaN and the bar would render with zero width.
  const clamped = Number.isNaN(bounded) ? 0 : bounded;
  const fillLen = Math.round(clamped * width);
  const fill = "\u2593".repeat(fillLen);
  const track = "\u2591".repeat(width - fillLen);
  return { fill, track };
}
|
|
2263
|
+
/**
 * Draws one horizontal rule of a box-drawing table, dimmed.
 *
 * @param {number[]} widths - Content width of each column (without the
 *   one-space padding on either side).
 * @param {string} position - "top", "bottom", "merge" (a rule with no column
 *   joints) or any other value for a normal separator with cross joints.
 * @returns {string} The rendered rule.
 */
function drawTableLine(widths, position) {
  const rule = "\u2500";
  if (position === "bottom" || position === "merge") {
    // Unbroken rule: every column plus its padding, plus one cell per inner border.
    const totalInner = widths.reduce((sum, w) => sum + w + 2, 0) + widths.length - 1;
    const span = rule.repeat(totalInner);
    return position === "bottom" ? dim(`\u2514${span}\u2518`) : dim(`\u251C${span}\u2524`);
  }
  const joint = position === "top" ? "\u252C" : "\u253C";
  const body = widths.map((w) => rule.repeat(w + 2)).join(joint);
  return position === "top" ? dim(`\u250C${body}\u2510`) : dim(`\u251C${body}\u2524`);
}
|
|
2277
|
+
/**
 * Renders one table row: padded cells separated and enclosed by dimmed
 * vertical bars.
 *
 * @param {string[]} cells - Cell contents.
 * @param {number[]} widths - Target width for each cell, by index.
 * @param {string[]} aligns - Alignment for each cell, by index.
 * @returns {string} The rendered row.
 */
function drawTableRow(cells, widths, aligns) {
  const bar = dim("\u2502");
  // Each cell gets one space of padding on both sides of its aligned content.
  const padded = cells.map((cell, i) => {
    const body = padCell(cell, widths[i], aligns[i]);
    return " " + body + " ";
  });
  return bar + padded.join(bar) + bar;
}
|
|
2283
|
+
/**
 * Renders a row whose content spans every column of the table.
 *
 * @param {string} content - Text to show (may contain ANSI escapes).
 * @param {number[]} widths - Column content widths of the surrounding table.
 * @returns {string} The row: a dimmed bar, one space, the content,
 *   right-hand padding, and a closing dimmed bar.
 */
function drawSpanRow(content, widths) {
  // Inner width: each column plus its two padding spaces, plus the inner borders.
  let totalInner = widths.length - 1;
  for (const w of widths) {
    totalInner += w + 2;
  }
  // One column is already taken by the leading space before the content.
  const remaining = totalInner - displayWidth(content) - 1;
  const filler = " ".repeat(Math.max(0, remaining));
  const edge = dim("\u2502");
  return edge + " " + content + filler + edge;
}
|
|
2289
|
+
/**
 * Colours a table cell according to how its value ranks within the column.
 *
 * @param {string} text - Already formatted cell text.
 * @param {number | undefined} value - Raw value behind the cell.
 * @param {{best?: number, worst?: number}} colStats - Best/worst values of the column.
 * @param {number} providerCount - Number of providers being compared.
 * @returns {string} A dimmed em dash when there is no value; the plain text
 *   when ranking is meaningless (fewer than two providers, missing stats, or
 *   best equal to worst); otherwise the text in bright green + bold for the
 *   best value, red for the worst, and yellow for anything in between.
 */
function colorByRank(text, value, colStats, providerCount) {
  if (value === void 0) return dim("\u2014");
  if (providerCount < 2) return text;
  // colStats is only read once the cheaper checks above have passed.
  const { best, worst } = colStats;
  const rankable = best !== void 0 && worst !== void 0 && best !== worst;
  if (!rankable) return text;
  let prefix;
  if (value === best) {
    prefix = `${brightGreen}${boldCode}`;
  } else if (value === worst) {
    prefix = red;
  } else {
    prefix = yellow;
  }
  return `${prefix}${text}${reset}`;
}
|
|
1942
2298
|
function consoleReporter(results, options) {
|
|
1943
2299
|
const showSparklines = options?.sparklines ?? true;
|
|
1944
2300
|
if (results.length === 0) {
|
|
1945
2301
|
console.log("\nNo results to display.\n");
|
|
1946
2302
|
return;
|
|
1947
2303
|
}
|
|
1948
|
-
const tasks =
|
|
1949
|
-
const providers = [...new Set(results.map((r) => r.providerId))];
|
|
1950
|
-
const scorerNames = [...new Set(results.flatMap((r) => r.scores.map((s) => s.name)))];
|
|
2304
|
+
const { tasks, providers, scorerNames, grouped, byProvider, hasErrors, maxRun } = groupResults(results);
|
|
1951
2305
|
const hasCost = scorerNames.includes("cost");
|
|
1952
|
-
const hasErrors = results.some((r) => r.error);
|
|
1953
2306
|
const multi = providers.length >= 2;
|
|
1954
|
-
const runsPerCell =
|
|
2307
|
+
const runsPerCell = maxRun;
|
|
1955
2308
|
const runLabel = runsPerCell > 1 ? ` ${dim(`(${runsPerCell} runs each)`)}` : "";
|
|
1956
2309
|
console.log("");
|
|
1957
2310
|
console.log(` ${brightWhite}${boldCode}\u2B21 Agent Duelist${reset}${runLabel}`);
|
|
@@ -1960,29 +2313,9 @@ function consoleReporter(results, options) {
|
|
|
1960
2313
|
for (const task of tasks) {
|
|
1961
2314
|
console.log(` ${bold(`Task: ${task}`)}`);
|
|
1962
2315
|
console.log("");
|
|
1963
|
-
const providerData = providers.map(
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
const successResults = taskResults.filter((r) => !r.error);
|
|
1967
|
-
if (successResults.length === 0) {
|
|
1968
|
-
return {
|
|
1969
|
-
providerId,
|
|
1970
|
-
avgScores: {},
|
|
1971
|
-
avgDetails: { costUsd: void 0, totalTokens: void 0 },
|
|
1972
|
-
latencyMs: void 0,
|
|
1973
|
-
allErrors: errorResults2.length > 0,
|
|
1974
|
-
errorCount: errorResults2.length
|
|
1975
|
-
};
|
|
1976
|
-
}
|
|
1977
|
-
return {
|
|
1978
|
-
providerId,
|
|
1979
|
-
avgScores: averageScores(successResults),
|
|
1980
|
-
avgDetails: averageDetails(successResults),
|
|
1981
|
-
latencyMs: average(successResults.map((r) => r.raw.latencyMs)),
|
|
1982
|
-
allErrors: false,
|
|
1983
|
-
errorCount: errorResults2.length
|
|
1984
|
-
};
|
|
1985
|
-
});
|
|
2316
|
+
const providerData = providers.map(
|
|
2317
|
+
(providerId) => aggregateProviderTask(providerId, grouped, task)
|
|
2318
|
+
);
|
|
1986
2319
|
const columnStats = computeColumnStats(providerData, scorerNames);
|
|
1987
2320
|
const medals = computeMedals(columnStats, providers);
|
|
1988
2321
|
const maxProviderLen = Math.max(...providers.map((id) => id.length));
|
|
@@ -1997,8 +2330,7 @@ function consoleReporter(results, options) {
|
|
|
1997
2330
|
cols.push({ label: "Cost", width: 12, align: "right", statsKey: "cost" });
|
|
1998
2331
|
cols.push({ label: "Tokens", width: 9, align: "right", statsKey: "tokens" });
|
|
1999
2332
|
} else {
|
|
2000
|
-
|
|
2001
|
-
cols.push({ label, width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
2333
|
+
cols.push({ label: scorerLabel(name), width: showSparklines ? 15 : 8, align: "right", statsKey: name });
|
|
2002
2334
|
}
|
|
2003
2335
|
}
|
|
2004
2336
|
if (hasErrors) {
|
|
@@ -2011,7 +2343,7 @@ function consoleReporter(results, options) {
|
|
|
2011
2343
|
console.log(` ${drawTableRow(headerCells, widths, aligns)}`);
|
|
2012
2344
|
console.log(` ${drawTableLine(widths, "header")}`);
|
|
2013
2345
|
for (const pd of providerData) {
|
|
2014
|
-
const medal = medals.get(pd.providerId) ?? "";
|
|
2346
|
+
const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
|
|
2015
2347
|
const providerCell = medal ? `${medal} ${pd.providerId}` : pd.providerId;
|
|
2016
2348
|
const cells = [providerCell];
|
|
2017
2349
|
if (pd.allErrors) {
|
|
@@ -2084,7 +2416,7 @@ function consoleReporter(results, options) {
|
|
|
2084
2416
|
console.log(` ${drawTableRow(cells, widths, aligns)}`);
|
|
2085
2417
|
}
|
|
2086
2418
|
if (multi && providerData.some((p) => !p.allErrors)) {
|
|
2087
|
-
const winnerId = [...medals.entries()].find(([, m]) => m === "
|
|
2419
|
+
const winnerId = [...medals.entries()].find(([, m]) => m === "gold")?.[0];
|
|
2088
2420
|
if (winnerId) {
|
|
2089
2421
|
console.log(` ${drawTableLine(widths, "merge")}`);
|
|
2090
2422
|
const winnerText = `${brightGreen}${boldCode}\u{1F3C6} Winner: ${winnerId}${reset} ${dim(providerLabel(winnerId))}`;
|
|
@@ -2094,7 +2426,7 @@ function consoleReporter(results, options) {
|
|
|
2094
2426
|
console.log(` ${drawTableLine(widths, "bottom")}`);
|
|
2095
2427
|
console.log("");
|
|
2096
2428
|
}
|
|
2097
|
-
printSummary(results, providers);
|
|
2429
|
+
printSummary(results, providers, byProvider);
|
|
2098
2430
|
const errorResults = results.filter((r) => r.error);
|
|
2099
2431
|
if (errorResults.length > 0) {
|
|
2100
2432
|
console.log(` ${bold("Errors")}`);
|
|
@@ -2117,203 +2449,66 @@ function consoleReporter(results, options) {
|
|
|
2117
2449
|
console.log("");
|
|
2118
2450
|
}
|
|
2119
2451
|
}
|
|
2120
|
-
function printSummary(results, providers) {
|
|
2452
|
+
function printSummary(results, providers, byProvider) {
|
|
2121
2453
|
const successResults = results.filter((r) => !r.error);
|
|
2122
2454
|
if (successResults.length === 0) return;
|
|
2123
|
-
|
|
2124
|
-
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
|
|
2129
|
-
|
|
2130
|
-
|
|
2131
|
-
|
|
2132
|
-
|
|
2133
|
-
|
|
2134
|
-
}
|
|
2135
|
-
|
|
2136
|
-
|
|
2137
|
-
}
|
|
2138
|
-
|
|
2139
|
-
|
|
2140
|
-
const avg = average(runs.map((r) => r.raw.latencyMs));
|
|
2141
|
-
return { id, avg: avg ?? Infinity };
|
|
2142
|
-
}).sort((a, b) => a.avg - b.avg)[0];
|
|
2143
|
-
if (byLatency && byLatency.avg !== Infinity) {
|
|
2144
|
-
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2145
|
-
const msStr = `${Math.round(byLatency.avg)}ms`;
|
|
2146
|
-
if (single) {
|
|
2147
|
-
console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2148
|
-
} else {
|
|
2149
|
-
console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2150
|
-
}
|
|
2151
|
-
}
|
|
2152
|
-
const byCost = providers.map((id) => {
|
|
2153
|
-
const runs = successResults.filter((r) => r.providerId === id);
|
|
2154
|
-
const costs = runs.map((r) => {
|
|
2155
|
-
const s = r.scores.find((s2) => s2.name === "cost");
|
|
2156
|
-
return s && s.value >= 0 ? s.value : void 0;
|
|
2157
|
-
}).filter((c) => c !== void 0);
|
|
2158
|
-
const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
|
|
2159
|
-
return { id, avg };
|
|
2160
|
-
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
2161
|
-
if (byCost?.avg !== void 0) {
|
|
2162
|
-
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2163
|
-
const costStr = formatCost(byCost.avg);
|
|
2164
|
-
if (single) {
|
|
2165
|
-
console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2166
|
-
} else {
|
|
2167
|
-
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2168
|
-
}
|
|
2169
|
-
}
|
|
2170
|
-
if (!single) {
|
|
2171
|
-
const wins = /* @__PURE__ */ new Map();
|
|
2172
|
-
for (const id of providers) wins.set(id, 0);
|
|
2173
|
-
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
2174
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
2175
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
2176
|
-
const maxWins = Math.max(...wins.values());
|
|
2177
|
-
if (maxWins > 0) {
|
|
2178
|
-
const topProviders = [...wins.entries()].filter(([, w]) => w === maxWins);
|
|
2179
|
-
console.log("");
|
|
2180
|
-
if (topProviders.length === 1) {
|
|
2181
|
-
const [winnerId, winCount] = topProviders[0];
|
|
2182
|
-
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
|
|
2183
|
-
} else {
|
|
2184
|
-
const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
|
|
2185
|
-
console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
|
|
2186
|
-
}
|
|
2187
|
-
}
|
|
2188
|
-
}
|
|
2189
|
-
console.log("");
|
|
2190
|
-
}
|
|
2191
|
-
function rankProviders(results, providers, scorerName) {
|
|
2192
|
-
const ranked = providers.map((id) => {
|
|
2193
|
-
const runs = results.filter((r) => r.providerId === id);
|
|
2194
|
-
const scores = runs.flatMap((r) => r.scores.filter((s) => s.name === scorerName && s.value >= 0)).map((s) => s.value);
|
|
2195
|
-
const avg = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
|
|
2196
|
-
return { id, avg };
|
|
2197
|
-
}).filter((p) => p.avg !== void 0).sort((a, b) => b.avg - a.avg);
|
|
2198
|
-
return ranked[0] ? { id: ranked[0].id, avg: ranked[0].avg } : void 0;
|
|
2199
|
-
}
|
|
2200
|
-
function averageScores(results) {
|
|
2201
|
-
const sums = {};
|
|
2202
|
-
const counts = {};
|
|
2203
|
-
for (const result of results) {
|
|
2204
|
-
for (const score of result.scores) {
|
|
2205
|
-
if (score.value < 0) continue;
|
|
2206
|
-
sums[score.name] = (sums[score.name] ?? 0) + score.value;
|
|
2207
|
-
counts[score.name] = (counts[score.name] ?? 0) + 1;
|
|
2208
|
-
}
|
|
2209
|
-
}
|
|
2210
|
-
const avgs = {};
|
|
2211
|
-
for (const name of Object.keys(sums)) {
|
|
2212
|
-
avgs[name] = sums[name] / counts[name];
|
|
2213
|
-
}
|
|
2214
|
-
return avgs;
|
|
2215
|
-
}
|
|
2216
|
-
function averageDetails(results) {
|
|
2217
|
-
let costSum = 0;
|
|
2218
|
-
let costCount = 0;
|
|
2219
|
-
let tokenSum = 0;
|
|
2220
|
-
let tokenCount = 0;
|
|
2221
|
-
for (const result of results) {
|
|
2222
|
-
const costScore = result.scores.find((s) => s.name === "cost");
|
|
2223
|
-
const details = costScore?.details;
|
|
2224
|
-
if (details?.estimatedUsd != null) {
|
|
2225
|
-
costSum += details.estimatedUsd;
|
|
2226
|
-
costCount++;
|
|
2455
|
+
const successByProvider = /* @__PURE__ */ new Map();
|
|
2456
|
+
for (const id of providers) {
|
|
2457
|
+
successByProvider.set(id, (byProvider.get(id) ?? []).filter((r) => !r.error));
|
|
2458
|
+
}
|
|
2459
|
+
console.log(` ${bold("Summary")}`);
|
|
2460
|
+
console.log(` ${dim("\u2501".repeat(72))}`);
|
|
2461
|
+
console.log("");
|
|
2462
|
+
const single = providers.length === 1;
|
|
2463
|
+
const correctnessKey = successResults.some((r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)) ? "llm-judge-correctness" : "correctness";
|
|
2464
|
+
const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);
|
|
2465
|
+
if (byCorrectness) {
|
|
2466
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2467
|
+
const pctStr = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
2468
|
+
if (single) {
|
|
2469
|
+
console.log(` ${medal} Avg correctness: ${brightGreen}${boldCode}${pctStr}${reset}`);
|
|
2470
|
+
} else {
|
|
2471
|
+
console.log(` ${medal} Most correct: ${bold(byCorrectness.id)} ${dim(providerLabel(byCorrectness.id))} ${brightGreen}${boldCode}${pctStr}${reset}`);
|
|
2227
2472
|
}
|
|
2228
|
-
|
|
2229
|
-
|
|
2230
|
-
|
|
2473
|
+
}
|
|
2474
|
+
const byLatency = providers.map((id) => {
|
|
2475
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2476
|
+
const avg = average(runs.map((r) => r.raw.latencyMs));
|
|
2477
|
+
return { id, avg: avg ?? Infinity };
|
|
2478
|
+
}).sort((a, b) => a.avg - b.avg)[0];
|
|
2479
|
+
if (byLatency && byLatency.avg !== Infinity) {
|
|
2480
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2481
|
+
const msStr = `${Math.round(byLatency.avg)}ms`;
|
|
2482
|
+
if (single) {
|
|
2483
|
+
console.log(` ${medal} Avg latency: ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2484
|
+
} else {
|
|
2485
|
+
console.log(` ${medal} Fastest: ${bold(byLatency.id)} ${dim(providerLabel(byLatency.id))} ${brightGreen}${boldCode}${msStr}${reset}`);
|
|
2231
2486
|
}
|
|
2232
2487
|
}
|
|
2233
|
-
|
|
2234
|
-
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
|
|
2241
|
-
}
|
|
2242
|
-
|
|
2243
|
-
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
}
|
|
2249
|
-
|
|
2250
|
-
const lower = error.toLowerCase();
|
|
2251
|
-
const isAuthError = lower.includes("api key") || lower.includes("401") || lower.includes("unauthorized") || lower.includes("authentication") || lower.includes("incorrect api key") || lower.includes("apikey");
|
|
2252
|
-
if (!isAuthError) return void 0;
|
|
2253
|
-
const prefix = providerId.split("/")[0];
|
|
2254
|
-
switch (prefix) {
|
|
2255
|
-
case "openai":
|
|
2256
|
-
return "Set: export OPENAI_API_KEY=sk-...";
|
|
2257
|
-
case "azure":
|
|
2258
|
-
return "Set: export AZURE_OPENAI_API_KEY=... and AZURE_OPENAI_ENDPOINT=...";
|
|
2259
|
-
case "anthropic":
|
|
2260
|
-
return "Set: export ANTHROPIC_API_KEY=sk-ant-...";
|
|
2261
|
-
case "google":
|
|
2262
|
-
return "Set: export GOOGLE_API_KEY=...";
|
|
2263
|
-
default:
|
|
2264
|
-
return `Check the API key for ${providerId}`;
|
|
2488
|
+
const byCost = providers.map((id) => {
|
|
2489
|
+
const runs = successByProvider.get(id) ?? [];
|
|
2490
|
+
const costs = runs.map((r) => {
|
|
2491
|
+
const s = r.scores.find((s2) => s2.name === "cost");
|
|
2492
|
+
return s && s.value >= 0 ? s.value : void 0;
|
|
2493
|
+
}).filter((c) => c !== void 0);
|
|
2494
|
+
const avg = costs.length > 0 ? costs.reduce((a, b) => a + b, 0) / costs.length : void 0;
|
|
2495
|
+
return { id, avg };
|
|
2496
|
+
}).filter((p) => p.avg !== void 0).sort((a, b) => a.avg - b.avg)[0];
|
|
2497
|
+
if (byCost?.avg !== void 0) {
|
|
2498
|
+
const medal = single ? `${cyan}\u25C6${reset}` : "\u{1F947}";
|
|
2499
|
+
const costStr = formatCost(byCost.avg);
|
|
2500
|
+
if (single) {
|
|
2501
|
+
console.log(` ${medal} Avg cost: ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2502
|
+
} else {
|
|
2503
|
+
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
2504
|
+
}
|
|
2265
2505
|
}
|
|
2266
|
-
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
case "azure":
|
|
2271
|
-
return "(OpenAI via Azure)";
|
|
2272
|
-
case "openai":
|
|
2273
|
-
return "(OpenAI)";
|
|
2274
|
-
case "anthropic":
|
|
2275
|
-
return "(Anthropic)";
|
|
2276
|
-
case "google":
|
|
2277
|
-
return "(Google)";
|
|
2278
|
-
case "mistral":
|
|
2279
|
-
return "(Mistral)";
|
|
2280
|
-
case "meta":
|
|
2281
|
-
return "(Meta)";
|
|
2282
|
-
case "deepseek":
|
|
2283
|
-
return "(DeepSeek)";
|
|
2284
|
-
case "cohere":
|
|
2285
|
-
return "(Cohere)";
|
|
2286
|
-
case "qwen":
|
|
2287
|
-
return "(Qwen)";
|
|
2288
|
-
case "xai":
|
|
2289
|
-
return "(xAI)";
|
|
2290
|
-
case "minimax":
|
|
2291
|
-
return "(MiniMax)";
|
|
2292
|
-
case "moonshot":
|
|
2293
|
-
return "(Moonshot / Kimi)";
|
|
2294
|
-
case "perplexity":
|
|
2295
|
-
return "(Perplexity)";
|
|
2296
|
-
case "amazon":
|
|
2297
|
-
return "(Amazon)";
|
|
2298
|
-
case "nvidia":
|
|
2299
|
-
return "(NVIDIA)";
|
|
2300
|
-
case "microsoft":
|
|
2301
|
-
return "(Microsoft)";
|
|
2302
|
-
case "ai21":
|
|
2303
|
-
return "(AI21 Labs)";
|
|
2304
|
-
case "bytedance":
|
|
2305
|
-
return "(ByteDance)";
|
|
2306
|
-
case "together":
|
|
2307
|
-
return "(Together AI)";
|
|
2308
|
-
case "fireworks":
|
|
2309
|
-
return "(Fireworks AI)";
|
|
2310
|
-
case "groq":
|
|
2311
|
-
return "(Groq)";
|
|
2312
|
-
case "cerebras":
|
|
2313
|
-
return "(Cerebras)";
|
|
2314
|
-
default:
|
|
2315
|
-
return `(${prefix})`;
|
|
2506
|
+
if (!single && byCorrectness && byCorrectness.avg > 0) {
|
|
2507
|
+
console.log("");
|
|
2508
|
+
const pct = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
2509
|
+
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${byCorrectness.id}${reset} ${dim(providerLabel(byCorrectness.id))} ${dim(`(${pct} avg correctness)`)}`);
|
|
2316
2510
|
}
|
|
2511
|
+
console.log("");
|
|
2317
2512
|
}
|
|
2318
2513
|
|
|
2319
2514
|
// src/reporter/json.ts
|
|
@@ -2345,15 +2540,15 @@ function defineArena(config) {
|
|
|
2345
2540
|
if (config.providers.length === 0) {
|
|
2346
2541
|
throw new Error("At least one provider is required");
|
|
2347
2542
|
}
|
|
2348
|
-
if (config.tasks.length === 0) {
|
|
2349
|
-
throw new Error("At least one task is required");
|
|
2350
|
-
}
|
|
2351
2543
|
const scorerNames = config.scorers ?? ["latency", "cost", "correctness"];
|
|
2352
2544
|
const scorerFns = resolveScorers(scorerNames, config.judgeModel, config.timeout);
|
|
2353
2545
|
const runs = config.runs ?? 1;
|
|
2354
2546
|
return {
|
|
2355
2547
|
config,
|
|
2356
2548
|
async run(options) {
|
|
2549
|
+
if (config.tasks.length === 0) {
|
|
2550
|
+
throw new Error("At least one task is required");
|
|
2551
|
+
}
|
|
2357
2552
|
return runBenchmarks({
|
|
2358
2553
|
providers: config.providers,
|
|
2359
2554
|
tasks: config.tasks,
|
|
@@ -2379,23 +2574,19 @@ function anthropic(model, options) {
|
|
|
2379
2574
|
model,
|
|
2380
2575
|
async run(input) {
|
|
2381
2576
|
const start = Date.now();
|
|
2382
|
-
const systemMessage = input.schema ?
|
|
2577
|
+
const systemMessage = input.schema ? buildSchemaSystemMessage(input.schema) : void 0;
|
|
2578
|
+
const reqOpts = { signal: input.signal };
|
|
2579
|
+
if (input.timeout) reqOpts.timeout = input.timeout;
|
|
2383
2580
|
const response = await client.messages.create({
|
|
2384
2581
|
model,
|
|
2385
2582
|
max_tokens: maxTokens,
|
|
2386
2583
|
system: systemMessage,
|
|
2387
2584
|
messages: [{ role: "user", content: input.prompt }]
|
|
2388
|
-
},
|
|
2585
|
+
}, reqOpts);
|
|
2389
2586
|
const latencyMs = Date.now() - start;
|
|
2390
2587
|
const textBlock = response.content.find((b) => b.type === "text");
|
|
2391
2588
|
const rawContent = textBlock?.type === "text" ? textBlock.text : "";
|
|
2392
|
-
|
|
2393
|
-
if (input.schema) {
|
|
2394
|
-
try {
|
|
2395
|
-
output = JSON.parse(rawContent);
|
|
2396
|
-
} catch {
|
|
2397
|
-
}
|
|
2398
|
-
}
|
|
2589
|
+
const output = parseSchemaOutput(rawContent, !!input.schema);
|
|
2399
2590
|
return {
|
|
2400
2591
|
output,
|
|
2401
2592
|
usage: {
|
|
@@ -2409,23 +2600,6 @@ function anthropic(model, options) {
|
|
|
2409
2600
|
};
|
|
2410
2601
|
}
|
|
2411
2602
|
|
|
2412
|
-
// src/providers/gemini.ts
|
|
2413
|
-
import OpenAI3 from "openai";
|
|
2414
|
-
function gemini(model, options) {
|
|
2415
|
-
const apiKey = options?.apiKey ?? process.env.GOOGLE_API_KEY;
|
|
2416
|
-
if (!apiKey) {
|
|
2417
|
-
throw new Error(
|
|
2418
|
-
`Missing API key for google/${model}. Set GOOGLE_API_KEY or pass apiKey option.`
|
|
2419
|
-
);
|
|
2420
|
-
}
|
|
2421
|
-
const client = new OpenAI3({
|
|
2422
|
-
apiKey,
|
|
2423
|
-
baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
|
|
2424
|
-
timeout: options?.timeoutMs ?? REQUEST_TIMEOUT_MS
|
|
2425
|
-
});
|
|
2426
|
-
return makeProvider(`google/${model}`, "Google AI", model, client, model);
|
|
2427
|
-
}
|
|
2428
|
-
|
|
2429
2603
|
// src/reporter/markdown.ts
|
|
2430
2604
|
var COMMENT_MARKER = "<!-- duelist-ci-report -->";
|
|
2431
2605
|
function markdownReporter(report, _current) {
|
|
@@ -2470,7 +2644,7 @@ function markdownComparisonTable(comparisons) {
|
|
|
2470
2644
|
for (const c of comparisons) {
|
|
2471
2645
|
const baselineStr = c.baseline ? formatStats(c.baseline) : "\u2014";
|
|
2472
2646
|
const currentStr = formatStats(c.current);
|
|
2473
|
-
const deltaStr = c.delta !== null ? formatDelta(c.delta) : "\u2014";
|
|
2647
|
+
const deltaStr = c.delta !== null ? formatDelta(c.delta, 3) : "\u2014";
|
|
2474
2648
|
const status = statusIndicator(c);
|
|
2475
2649
|
lines.push(`| ${c.providerId} | ${c.taskName} | ${c.scorerName} | ${baselineStr} | ${currentStr} | ${deltaStr} | ${status} |`);
|
|
2476
2650
|
}
|
|
@@ -2503,10 +2677,6 @@ function formatStats(stats) {
|
|
|
2503
2677
|
}
|
|
2504
2678
|
return stats.mean.toFixed(3);
|
|
2505
2679
|
}
|
|
2506
|
-
function formatDelta(delta) {
|
|
2507
|
-
const sign = delta >= 0 ? "+" : "";
|
|
2508
|
-
return `${sign}${delta.toFixed(3)}`;
|
|
2509
|
-
}
|
|
2510
2680
|
function statusIndicator(c) {
|
|
2511
2681
|
if (c.regressed) return "\u{1F534} regressed";
|
|
2512
2682
|
if (c.improved) return "\u{1F7E2} improved";
|
|
@@ -2514,6 +2684,778 @@ function statusIndicator(c) {
|
|
|
2514
2684
|
return "\u26AA unchanged";
|
|
2515
2685
|
}
|
|
2516
2686
|
|
|
2687
|
+
// src/reporter/html.ts
|
|
2688
|
+
/**
 * Escapes a string for safe inclusion in HTML text and quoted attribute values.
 *
 * Replaces `&`, `<`, `>`, `"` and `'` with their HTML entities. All five are
 * handled in a single pass, so an already-escaped `&amp;` becomes
 * `&amp;amp;` exactly once and entities produced here are never re-escaped.
 *
 * @param {string} s - Raw text (e.g. a provider id or an error message).
 * @returns {string} The HTML-escaped text.
 */
function esc(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  return s.replace(/[&<>"']/g, (ch) => entities[ch]);
}
|
|
2691
|
+
/**
 * Renders benchmark results as a complete, standalone HTML document.
 *
 * @param {Array<object>} results - Benchmark results; each carries a
 *   `providerId`, a `scores` array, `raw.latencyMs`, and optionally `error`.
 * @returns {string} The HTML page. An empty `results` array yields the
 *   placeholder page from `emptyReport()`.
 */
function htmlReporter(results) {
  if (results.length === 0) {
    return emptyReport();
  }
  const { tasks, providers, scorerNames, grouped, byProvider, maxRun } = groupResults(results);
  const hasCost = scorerNames.includes("cost");
  const multi = providers.length >= 2;
  const runsLabel = maxRun > 1 ? `${maxRun} runs each` : "1 run";

  // Per-task table data; a winner is only named when providers are compared.
  const taskSections = [];
  for (const task of tasks) {
    const providerData = providers.map((id) => aggregateProviderTask(id, grouped, task));
    const columnStats = computeColumnStats(providerData, scorerNames);
    const medals = computeMedals(columnStats, providers);
    let winnerId;
    if (multi) {
      const goldEntry = [...medals.entries()].find(([, medal]) => medal === "gold");
      winnerId = goldEntry?.[0];
    }
    taskSections.push({ task, providerData, columnStats, medals, winnerId });
  }

  // Summary figures are computed from successful runs only.
  const isSuccess = (r) => !r.error;
  const successResults = results.filter(isSuccess);
  const successByProvider = new Map();
  providers.forEach((id) => {
    successByProvider.set(id, (byProvider.get(id) ?? []).filter(isSuccess));
  });

  // Use the LLM-judge score for correctness whenever any run produced a valid one.
  const judgeAvailable = successResults.some(
    (r) => r.scores.some((s) => s.name === "llm-judge-correctness" && s.value >= 0)
  );
  const correctnessKey = judgeAvailable ? "llm-judge-correctness" : "correctness";
  const byCorrectness = rankProviders(successByProvider, providers, correctnessKey);

  // Lowest average latency first; providers without data sort last via Infinity.
  const latencyRanking = providers.map((id) => {
    const runs = successByProvider.get(id) ?? [];
    return { id, avg: average(runs.map((r) => r.raw.latencyMs)) ?? Infinity };
  });
  latencyRanking.sort((a, b) => a.avg - b.avg);
  const byLatency = latencyRanking[0];

  // Lowest average cost first; providers without a valid cost score are left out.
  const costRanking = [];
  for (const id of providers) {
    const costs = [];
    for (const run of successByProvider.get(id) ?? []) {
      const costScore = run.scores.find((s) => s.name === "cost");
      if (costScore && costScore.value >= 0) {
        costs.push(costScore.value);
      }
    }
    if (costs.length > 0) {
      costRanking.push({ id, avg: costs.reduce((a, b) => a + b, 0) / costs.length });
    }
  }
  costRanking.sort((a, b) => a.avg - b.avg);
  const byCost = costRanking[0];

  const overallWinner = multi && byCorrectness && byCorrectness.avg > 0 ? byCorrectness.id : void 0;
  const deduped = dedupeErrors(results.filter((r) => r.error));

  const providerPlural = providers.length !== 1 ? "s" : "";
  const taskPlural = tasks.length !== 1 ? "s" : "";
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Agent Duelist Report</title>
<meta name="description" content="LLM provider benchmark results \u2014 ${providers.length} provider${providerPlural}, ${tasks.length} task${taskPlural}">
<meta property="og:title" content="Agent Duelist Report">
<meta property="og:description" content="LLM provider benchmark: ${providers.map(esc).join(" vs ")}">
<meta property="og:type" content="website">
${renderStyle()}
</head>
<body>
<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>
<div class="report">

${renderHeader(runsLabel, providers.length, tasks.length)}

${tasks.length > 1 ? renderTabs(tasks) : ""}

<main>
${taskSections.map((section, index) => renderTaskSection(
    section.task,
    section.providerData,
    section.columnStats,
    section.medals,
    section.winnerId,
    scorerNames,
    hasCost,
    multi,
    index
  )).join("\n")}
</main>

${renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi)}

${deduped.length > 0 ? renderErrors(deduped) : ""}

${renderFooter()}

</div>
${renderScript(tasks.length)}
</body>
</html>`;
}
|
|
2780
|
+
/**
 * Builds the HTML page shown when there are no results: the usual style,
 * header and footer around a "No results to display." message.
 *
 * @returns {string} The complete HTML document.
 */
function emptyReport() {
  // One entry per output line; joined with "\n" to form the document.
  const lines = [
    "<!DOCTYPE html>",
    '<html lang="en">',
    "<head>",
    '<meta charset="UTF-8">',
    '<meta name="viewport" content="width=device-width, initial-scale=1">',
    "<title>Agent Duelist Report</title>",
    String(renderStyle()),
    "</head>",
    "<body>",
    '<div class="bg-mesh"><div class="bg-mesh-extra"></div></div>',
    '<div class="report">',
    String(renderHeader("0 runs", 0, 0)),
    '<main><p class="empty-msg">No results to display.</p></main>',
    String(renderFooter()),
    "</div>",
    "</body>",
    "</html>"
  ];
  return lines.join("\n");
}
|
|
2799
|
+
/**
 * Collapses repeated errors into one entry per (provider, message) pair.
 *
 * @param {Array<{providerId: string, error?: string}>} errorResults - Failed results.
 * @returns {Array<{providerId: string, error: string, count: number, hint: *}>}
 *   Unique errors in first-seen order, each with its occurrence count and the
 *   value returned by `apiKeyHint` for its first occurrence.
 */
function dedupeErrors(errorResults) {
  const byKey = new Map();
  for (const { providerId, error } of errorResults) {
    const key = `${providerId}::${error}`;
    const entry = byKey.get(key);
    if (entry) {
      entry.count++;
      continue;
    }
    byKey.set(key, {
      providerId,
      error: error ?? "Unknown error",
      count: 1,
      hint: apiKeyHint(providerId, error ?? "")
    });
  }
  return Array.from(byKey.values());
}
|
|
2817
|
+
function renderStyle() {
|
|
2818
|
+
return `<style>
|
|
2819
|
+
:root {
|
|
2820
|
+
--bg: #0f172a;
|
|
2821
|
+
--bg-deep: #020617;
|
|
2822
|
+
--panel: rgba(15, 23, 42, 0.85);
|
|
2823
|
+
--accent: #f59e0b;
|
|
2824
|
+
--accent-soft: rgba(245, 158, 11, 0.15);
|
|
2825
|
+
--text: #e2e8f0;
|
|
2826
|
+
--muted: #94a3b8;
|
|
2827
|
+
--border: rgba(148, 163, 184, 0.15);
|
|
2828
|
+
--green: #22c55e;
|
|
2829
|
+
--red: #ef4444;
|
|
2830
|
+
--yellow: #eab308;
|
|
2831
|
+
--radius: 12px;
|
|
2832
|
+
--mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Monaco, Consolas, monospace;
|
|
2833
|
+
--sans: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
|
2834
|
+
}
|
|
2835
|
+
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
|
2836
|
+
html, body {
|
|
2837
|
+
font-family: var(--sans);
|
|
2838
|
+
background: var(--bg);
|
|
2839
|
+
color: var(--text);
|
|
2840
|
+
min-height: 100vh;
|
|
2841
|
+
}
|
|
2842
|
+
body { padding: 24px; display: flex; justify-content: center; }
|
|
2843
|
+
|
|
2844
|
+
/* Animated gradient mesh */
|
|
2845
|
+
.bg-mesh {
|
|
2846
|
+
position: fixed; inset: 0; z-index: 0;
|
|
2847
|
+
overflow: hidden; pointer-events: none;
|
|
2848
|
+
}
|
|
2849
|
+
.bg-mesh::before, .bg-mesh::after {
|
|
2850
|
+
content: ""; position: absolute; border-radius: 50%;
|
|
2851
|
+
filter: blur(120px); opacity: 0.4;
|
|
2852
|
+
}
|
|
2853
|
+
.bg-mesh::before {
|
|
2854
|
+
width: 600px; height: 600px;
|
|
2855
|
+
background: radial-gradient(circle, rgba(245,158,11,0.18), transparent 70%);
|
|
2856
|
+
top: -10%; left: -5%;
|
|
2857
|
+
animation: meshDrift1 18s ease-in-out infinite alternate;
|
|
2858
|
+
}
|
|
2859
|
+
.bg-mesh::after {
|
|
2860
|
+
width: 500px; height: 500px;
|
|
2861
|
+
background: radial-gradient(circle, rgba(139,92,246,0.12), transparent 70%);
|
|
2862
|
+
bottom: -10%; right: -5%;
|
|
2863
|
+
animation: meshDrift2 22s ease-in-out infinite alternate;
|
|
2864
|
+
}
|
|
2865
|
+
.bg-mesh-extra {
|
|
2866
|
+
position: absolute; width: 400px; height: 400px;
|
|
2867
|
+
border-radius: 50%; filter: blur(100px); opacity: 0.3;
|
|
2868
|
+
background: radial-gradient(circle, rgba(56,189,248,0.12), transparent 70%);
|
|
2869
|
+
top: 50%; left: 60%;
|
|
2870
|
+
animation: meshDrift3 15s ease-in-out infinite alternate;
|
|
2871
|
+
}
|
|
2872
|
+
@keyframes meshDrift1 { from { transform: translate(0,0) scale(1); } to { transform: translate(80px,60px) scale(1.15); } }
|
|
2873
|
+
@keyframes meshDrift2 { from { transform: translate(0,0) scale(1); } to { transform: translate(-60px,-50px) scale(1.1); } }
|
|
2874
|
+
@keyframes meshDrift3 { from { transform: translate(0,0) scale(1); } to { transform: translate(-40px,40px) scale(1.2); } }
|
|
2875
|
+
|
|
2876
|
+
/* Report container */
|
|
2877
|
+
.report {
|
|
2878
|
+
position: relative; z-index: 1;
|
|
2879
|
+
width: 100%; max-width: 960px;
|
|
2880
|
+
}
|
|
2881
|
+
|
|
2882
|
+
/* Header */
|
|
2883
|
+
.report-header {
|
|
2884
|
+
display: flex; justify-content: space-between; align-items: center;
|
|
2885
|
+
padding: 20px 0; margin-bottom: 8px;
|
|
2886
|
+
}
|
|
2887
|
+
.report-brand {
|
|
2888
|
+
display: flex; align-items: center; gap: 10px;
|
|
2889
|
+
text-decoration: none; color: var(--muted);
|
|
2890
|
+
font-weight: 600; font-size: 14px;
|
|
2891
|
+
letter-spacing: 0.04em; text-transform: uppercase;
|
|
2892
|
+
}
|
|
2893
|
+
.report-brand:hover { color: var(--text); }
|
|
2894
|
+
.brand-icon {
|
|
2895
|
+
width: 32px; height: 32px; border-radius: 8px;
|
|
2896
|
+
background: linear-gradient(135deg, var(--accent-soft), rgba(245,158,11,0.05));
|
|
2897
|
+
border: 1px solid rgba(245,158,11,0.3);
|
|
2898
|
+
display: flex; align-items: center; justify-content: center;
|
|
2899
|
+
font-size: 16px;
|
|
2900
|
+
}
|
|
2901
|
+
.report-meta {
|
|
2902
|
+
font-size: 12px; color: var(--muted);
|
|
2903
|
+
text-align: right; line-height: 1.6;
|
|
2904
|
+
}
|
|
2905
|
+
|
|
2906
|
+
/* Task tabs */
|
|
2907
|
+
.task-tabs {
|
|
2908
|
+
display: flex; gap: 6px; margin-bottom: 16px; flex-wrap: wrap;
|
|
2909
|
+
}
|
|
2910
|
+
.task-tab {
|
|
2911
|
+
padding: 6px 16px; border-radius: 999px;
|
|
2912
|
+
border: 1px solid var(--border);
|
|
2913
|
+
background: transparent; color: var(--muted);
|
|
2914
|
+
font-size: 13px; font-weight: 500; cursor: pointer;
|
|
2915
|
+
transition: all 150ms ease;
|
|
2916
|
+
}
|
|
2917
|
+
.task-tab:hover { border-color: rgba(245,158,11,0.3); color: var(--text); }
|
|
2918
|
+
.task-tab.active {
|
|
2919
|
+
background: var(--accent-soft);
|
|
2920
|
+
border-color: rgba(245,158,11,0.4);
|
|
2921
|
+
color: var(--accent);
|
|
2922
|
+
}
|
|
2923
|
+
|
|
2924
|
+
/* Task sections */
|
|
2925
|
+
.task-section { display: none; }
|
|
2926
|
+
.task-section.active { display: block; }
|
|
2927
|
+
.task-name {
|
|
2928
|
+
font-size: 18px; font-weight: 600;
|
|
2929
|
+
margin-bottom: 12px; letter-spacing: -0.01em;
|
|
2930
|
+
}
|
|
2931
|
+
|
|
2932
|
+
/* Results table */
|
|
2933
|
+
.results-table {
|
|
2934
|
+
width: 100%; border-collapse: collapse;
|
|
2935
|
+
font-size: 13px; margin-bottom: 16px;
|
|
2936
|
+
border-radius: var(--radius); overflow: hidden;
|
|
2937
|
+
border: 1px solid var(--border);
|
|
2938
|
+
}
|
|
2939
|
+
.results-table th, .results-table td {
|
|
2940
|
+
padding: 10px 14px;
|
|
2941
|
+
text-align: left;
|
|
2942
|
+
border-bottom: 1px solid var(--border);
|
|
2943
|
+
}
|
|
2944
|
+
.results-table th {
|
|
2945
|
+
background: rgba(0,0,0,0.3);
|
|
2946
|
+
font-size: 11px; font-weight: 600;
|
|
2947
|
+
text-transform: uppercase; letter-spacing: 0.05em;
|
|
2948
|
+
color: var(--muted); cursor: pointer;
|
|
2949
|
+
user-select: none; white-space: nowrap;
|
|
2950
|
+
}
|
|
2951
|
+
.results-table th:hover { color: var(--text); }
|
|
2952
|
+
.results-table th .sort-arrow { margin-left: 4px; font-size: 10px; }
|
|
2953
|
+
.results-table tbody tr {
|
|
2954
|
+
background: var(--panel);
|
|
2955
|
+
transition: background 120ms ease;
|
|
2956
|
+
}
|
|
2957
|
+
.results-table tbody tr:hover { background: rgba(15,23,42,0.95); }
|
|
2958
|
+
.results-table tbody tr:last-child td { border-bottom: none; }
|
|
2959
|
+
|
|
2960
|
+
/* Score cell with progress bar */
|
|
2961
|
+
.score-cell { position: relative; min-width: 90px; }
|
|
2962
|
+
.score-bar {
|
|
2963
|
+
position: absolute; left: 0; bottom: 0;
|
|
2964
|
+
height: 3px; border-radius: 2px;
|
|
2965
|
+
transition: width 300ms ease;
|
|
2966
|
+
}
|
|
2967
|
+
.score-val { position: relative; z-index: 1; font-family: var(--mono); font-size: 12px; }
|
|
2968
|
+
|
|
2969
|
+
/* Color ranking */
|
|
2970
|
+
.rank-best { color: var(--green); font-weight: 600; }
|
|
2971
|
+
.rank-worst { color: var(--red); }
|
|
2972
|
+
.rank-mid { color: var(--yellow); }
|
|
2973
|
+
.rank-neutral { color: var(--text); }
|
|
2974
|
+
.rank-error { color: var(--muted); }
|
|
2975
|
+
|
|
2976
|
+
/* Winner banner */
|
|
2977
|
+
.task-winner {
|
|
2978
|
+
display: flex; align-items: center; gap: 10px;
|
|
2979
|
+
padding: 12px 18px; margin-bottom: 20px;
|
|
2980
|
+
border-radius: var(--radius);
|
|
2981
|
+
background: linear-gradient(135deg, rgba(34,197,94,0.08), rgba(34,197,94,0.02));
|
|
2982
|
+
border: 1px solid rgba(34,197,94,0.2);
|
|
2983
|
+
font-size: 14px; font-weight: 500;
|
|
2984
|
+
}
|
|
2985
|
+
.task-winner .trophy { font-size: 20px; }
|
|
2986
|
+
.task-winner .winner-name { color: var(--green); font-weight: 600; }
|
|
2987
|
+
.task-winner .winner-label { color: var(--muted); font-size: 12px; margin-left: 4px; }
|
|
2988
|
+
|
|
2989
|
+
/* Summary cards */
|
|
2990
|
+
.summary-section { margin-top: 32px; }
|
|
2991
|
+
.summary-title {
|
|
2992
|
+
font-size: 16px; font-weight: 600;
|
|
2993
|
+
margin-bottom: 12px; color: var(--text);
|
|
2994
|
+
}
|
|
2995
|
+
.summary-cards {
|
|
2996
|
+
display: grid;
|
|
2997
|
+
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
|
|
2998
|
+
gap: 12px;
|
|
2999
|
+
}
|
|
3000
|
+
.summary-card {
|
|
3001
|
+
padding: 16px; border-radius: var(--radius);
|
|
3002
|
+
border: 1px solid var(--border);
|
|
3003
|
+
background: var(--panel);
|
|
3004
|
+
}
|
|
3005
|
+
.summary-card .card-label {
|
|
3006
|
+
font-size: 11px; font-weight: 600;
|
|
3007
|
+
text-transform: uppercase; letter-spacing: 0.05em;
|
|
3008
|
+
color: var(--muted); margin-bottom: 6px;
|
|
3009
|
+
}
|
|
3010
|
+
.summary-card .card-value {
|
|
3011
|
+
font-size: 20px; font-weight: 700;
|
|
3012
|
+
color: var(--green); font-family: var(--mono);
|
|
3013
|
+
}
|
|
3014
|
+
.summary-card .card-provider {
|
|
3015
|
+
font-size: 12px; color: var(--muted); margin-top: 4px;
|
|
3016
|
+
}
|
|
3017
|
+
|
|
3018
|
+
/* Errors */
|
|
3019
|
+
.errors-section { margin-top: 24px; }
|
|
3020
|
+
.errors-title {
|
|
3021
|
+
font-size: 16px; font-weight: 600;
|
|
3022
|
+
margin-bottom: 8px; color: var(--red);
|
|
3023
|
+
cursor: pointer;
|
|
3024
|
+
}
|
|
3025
|
+
.errors-list {
|
|
3026
|
+
border-radius: var(--radius);
|
|
3027
|
+
border: 1px solid rgba(239,68,68,0.2);
|
|
3028
|
+
background: rgba(239,68,68,0.04);
|
|
3029
|
+
overflow: hidden;
|
|
3030
|
+
}
|
|
3031
|
+
.error-item {
|
|
3032
|
+
padding: 10px 16px;
|
|
3033
|
+
border-bottom: 1px solid rgba(239,68,68,0.1);
|
|
3034
|
+
font-size: 13px;
|
|
3035
|
+
}
|
|
3036
|
+
.error-item:last-child { border-bottom: none; }
|
|
3037
|
+
.error-provider { font-weight: 600; color: var(--text); }
|
|
3038
|
+
.error-msg { color: var(--muted); margin-left: 8px; }
|
|
3039
|
+
.error-count { color: var(--muted); font-size: 11px; }
|
|
3040
|
+
.error-hint { color: var(--muted); font-size: 12px; margin-top: 4px; font-style: italic; }
|
|
3041
|
+
|
|
3042
|
+
/* Footer */
|
|
3043
|
+
.report-footer {
|
|
3044
|
+
margin-top: 40px; padding: 20px 0;
|
|
3045
|
+
border-top: 1px solid var(--border);
|
|
3046
|
+
display: flex; justify-content: space-between; align-items: center;
|
|
3047
|
+
flex-wrap: wrap; gap: 12px;
|
|
3048
|
+
}
|
|
3049
|
+
.footer-brand {
|
|
3050
|
+
font-size: 13px; color: var(--muted);
|
|
3051
|
+
}
|
|
3052
|
+
.footer-brand a {
|
|
3053
|
+
color: var(--accent); text-decoration: none; font-weight: 500;
|
|
3054
|
+
}
|
|
3055
|
+
.footer-brand a:hover { text-decoration: underline; }
|
|
3056
|
+
.footer-cta {
|
|
3057
|
+
display: inline-flex; align-items: center; gap: 6px;
|
|
3058
|
+
padding: 6px 14px; border-radius: 8px;
|
|
3059
|
+
background: var(--accent-soft);
|
|
3060
|
+
border: 1px solid rgba(245,158,11,0.3);
|
|
3061
|
+
color: var(--accent); font-size: 12px; font-weight: 500;
|
|
3062
|
+
text-decoration: none;
|
|
3063
|
+
transition: transform 120ms ease, box-shadow 120ms ease;
|
|
3064
|
+
}
|
|
3065
|
+
.footer-cta:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(245,158,11,0.2); }
|
|
3066
|
+
|
|
3067
|
+
/* Empty state */
|
|
3068
|
+
.empty-msg {
|
|
3069
|
+
text-align: center; color: var(--muted);
|
|
3070
|
+
padding: 60px 20px; font-size: 16px;
|
|
3071
|
+
}
|
|
3072
|
+
|
|
3073
|
+
/* Responsive */
|
|
3074
|
+
@media (max-width: 640px) {
|
|
3075
|
+
body { padding: 12px; }
|
|
3076
|
+
.report-header { flex-direction: column; align-items: flex-start; gap: 8px; }
|
|
3077
|
+
.report-meta { text-align: left; }
|
|
3078
|
+
.summary-cards { grid-template-columns: 1fr; }
|
|
3079
|
+
.results-table { font-size: 12px; }
|
|
3080
|
+
.results-table th, .results-table td { padding: 8px 10px; }
|
|
3081
|
+
.report-footer { flex-direction: column; align-items: flex-start; }
|
|
3082
|
+
}
|
|
3083
|
+
</style>`;
|
|
3084
|
+
}
|
|
3085
|
+
/**
 * Build the report header: brand link on the left, run metadata on the right.
 *
 * @param {string} runsLabel - Human-readable description of the run count (HTML-escaped here).
 * @param {number} providerCount - Number of providers; pluralizes "provider".
 * @param {number} taskCount - Number of tasks; pluralizes "task".
 * @returns {string} HTML for the `<header>` element.
 */
function renderHeader(runsLabel, providerCount, taskCount) {
  const plural = (n) => (n !== 1 ? "s" : "");
  // ISO timestamp trimmed to seconds, e.g. "2026-01-15 14:32:00 UTC".
  const stamp = new Date().toISOString().replace("T", " ").slice(0, 19) + " UTC";
  const metaLines = [
    `${providerCount} provider${plural(providerCount)} &middot;`,
    `${taskCount} task${plural(taskCount)} &middot;`,
    `${esc(runsLabel)}<br>`,
    `${esc(stamp)}`
  ];
  return [
    `<header class="report-header">`,
    `<a class="report-brand" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">`,
    `<div class="brand-icon">&#x2B21;</div>`,
    `<span>Agent Duelist</span>`,
    `</a>`,
    `<div class="report-meta">`,
    ...metaLines,
    `</div>`,
    `</header>`
  ].join("\n");
}
|
|
3100
|
+
/**
 * Build the row of task tab buttons. The first tab starts out active and each
 * button carries its task index in `data-task`.
 *
 * @param {string[]} tasks - Task names, in display order (HTML-escaped here).
 * @returns {string} HTML for the `<nav>` element.
 */
function renderTabs(tasks) {
  const tabButtons = [];
  tasks.forEach((taskName, position) => {
    const activeClass = position === 0 ? " active" : "";
    tabButtons.push(
      `<button class="task-tab${activeClass}" data-task="${position}">${esc(taskName)}</button>`
    );
  });
  return [`<nav class="task-tabs">`, tabButtons.join("\n "), `</nav>`].join("\n");
}
|
|
3108
|
+
/**
 * Build one task's section: heading, results table and optional winner banner.
 *
 * @param {string} task - Task name shown as the section heading.
 * @param {Array<object>} providerData - One entry per provider row; reads
 *   `providerId` and `allErrors` here, the rest is consumed by renderDataCell.
 * @param {Map<string, object>} columnStats - Per-column stats, forwarded to renderDataCell.
 * @param {Map<string, string>} medals - Medal kind per provider id ("none" when absent).
 * @param {string|undefined} winnerId - Provider id of the task winner, if any.
 * @param {string[]} scorerNames - Scorer names that decide which columns appear.
 * @param {boolean} _hasCost - Unused.
 * @param {boolean} multi - Forwarded to renderDataCell.
 * @param {number} index - Section index; section 0 is rendered active.
 * @returns {string} HTML for the `<section>` element.
 */
function renderTaskSection(task, providerData, columnStats, medals, winnerId, scorerNames, _hasCost, multi, index) {
  const cols = [{ label: "Provider", key: "provider", isScore: false }];
  scorerNames.forEach((name) => {
    switch (name) {
      case "latency":
        cols.push({ label: "Latency", key: "latency", isScore: false });
        break;
      case "cost":
        // The cost scorer contributes two columns: cost and token count.
        cols.push({ label: "Cost", key: "cost", isScore: false });
        cols.push({ label: "Tokens", key: "tokens", isScore: false });
        break;
      default:
        cols.push({ label: scorerLabel(name), key: name, isScore: true });
    }
  });
  const dataCols = cols.slice(1);

  let headerCells = "";
  for (const c of cols) {
    headerCells += `<th data-col="${esc(c.key)}">${esc(c.label)}<span class="sort-arrow"></span></th>`;
  }

  const bodyRows = providerData.map((pd) => {
    const medal = medalEmoji(medals.get(pd.providerId) ?? "none");
    const prefix = medal ? `${medal} ` : "";
    const firstCell = `<td>${prefix}${esc(pd.providerId)}</td>`;
    // A provider whose runs all failed gets a dash in every data column.
    const otherCells = pd.allErrors
      ? dataCols.map(() => `<td class="rank-error">—</td>`)
      : dataCols.map((col) => renderDataCell(col.key, col.isScore, pd, columnStats, multi));
    return `<tr>${[firstCell, ...otherCells].join("")}</tr>`;
  }).join("\n");

  let winnerHtml = "";
  if (winnerId) {
    winnerHtml = [
      `<div class="task-winner">`,
      `<span class="trophy">&#x1F3C6;</span>`,
      `<span>Winner: <span class="winner-name">${esc(winnerId)}</span>`,
      `<span class="winner-label">${esc(providerLabel(winnerId))}</span></span>`,
      `</div>`
    ].join("\n");
  }

  const activeClass = index === 0 ? " active" : "";
  return [
    `<section class="task-section${activeClass}" data-task-idx="${index}">`,
    `<h2 class="task-name">${esc(task)}</h2>`,
    `<table class="results-table">`,
    `<thead><tr>${headerCells}</tr></thead>`,
    `<tbody>${bodyRows}</tbody>`,
    `</table>`,
    winnerHtml,
    `</section>`
  ].join("\n");
}
|
|
3155
|
+
/**
 * Render a single `<td>` for one provider in one column.
 *
 * Latency, cost and tokens are rendered as plain values; any other key is
 * treated as a score read from `pd.avgScores[key]` and rendered as a
 * percentage with a coloured progress bar. A missing value renders a dash.
 *
 * @param {string} key - Column key ("latency", "cost", "tokens" or a scorer name).
 * @param {boolean} _isScore - Unused; the key alone decides the rendering.
 * @param {object} pd - Provider row data; reads `latencyMs`, `avgDetails.costUsd`,
 *   `avgDetails.totalTokens` and `avgScores[key]`.
 * @param {Map<string, object>} columnStats - Per-column stats passed to `rankClass_`.
 * @param {boolean} multi - When true (and stats exist) cells are ranked against
 *   the column; otherwise score cells use fixed 0.8 / 0.5 thresholds.
 * @returns {string} HTML for the cell.
 */
function renderDataCell(key, _isScore, pd, columnStats, multi) {
  const colStats = columnStats.get(key);
  const missingCell = `<td class="rank-error">—</td>`;
  const relativeRank = (v) => (multi && colStats ? rankClass_(v, colStats) : "rank-neutral");

  if (key === "latency") {
    const ms = pd.latencyMs;
    if (ms === void 0) return missingCell;
    return `<td class="${relativeRank(ms)}" data-sort-val="${ms}">${Math.round(ms)}ms</td>`;
  }
  if (key === "cost") {
    const cost = pd.avgDetails.costUsd;
    if (cost === void 0) return missingCell;
    return `<td class="${relativeRank(cost)}" data-sort-val="${cost}">${esc(formatCost(cost))}</td>`;
  }
  if (key === "tokens") {
    const tokens = pd.avgDetails.totalTokens;
    if (tokens === void 0) return missingCell;
    return `<td class="${relativeRank(tokens)}" data-sort-val="${tokens}">${tokens}</td>`;
  }

  const val = pd.avgScores[key];
  if (val === void 0) return missingCell;
  const pct = Math.round(val * 100);
  // The bar is absolutely positioned inside the cell, so its width must stay
  // within 0-100% even if a scorer reports a value outside [0, 1]. The printed
  // percentage and sort value still show the raw score.
  const barWidth = Math.min(100, Math.max(0, pct));
  let rankCls;
  if (multi && colStats) {
    rankCls = rankClass_(val, colStats);
  } else {
    rankCls = val >= 0.8 ? "rank-best" : val >= 0.5 ? "rank-mid" : "rank-worst";
  }
  const barColor = val >= 0.8 ? "var(--green)" : val >= 0.5 ? "var(--yellow)" : "var(--red)";
  return `<td class="score-cell ${rankCls}" data-sort-val="${val}">
<span class="score-val">${pct}%</span>
<div class="score-bar" style="width:${barWidth}%;background:${barColor}"></div>
</td>`;
}
|
|
3190
|
+
/**
 * Map a value to a ranking CSS class relative to its column's best/worst.
 *
 * @param {number} value - The cell value.
 * @param {{best?: number, worst?: number}} colStats - Column extremes.
 * @returns {string} "rank-neutral" when the extremes are missing or equal,
 *   otherwise "rank-best", "rank-worst" or "rank-mid".
 */
function rankClass_(value, colStats) {
  const { best, worst } = colStats;
  const cannotRank = best === void 0 || worst === void 0 || best === worst;
  if (cannotRank) {
    return "rank-neutral";
  }
  switch (value) {
    case best:
      return "rank-best";
    case worst:
      return "rank-worst";
    default:
      return "rank-mid";
  }
}
|
|
3197
|
+
/**
 * Build the summary card grid (correctness, latency, cost, overall winner).
 *
 * Each of the first three arguments is a `{ id, avg }` pair or a falsy value;
 * a card is emitted only for the ones that are present. With `multi` set the
 * cards are labelled as rankings and name the provider; otherwise they are
 * labelled as averages.
 *
 * @param {{id: string, avg: number}|undefined} byCorrectness
 * @param {{id: string, avg: number}|undefined} byLatency - Skipped when `avg` is Infinity.
 * @param {{id: string, avg: number}|undefined} byCost - Skipped when `avg` is undefined.
 * @param {string|undefined} overallWinner - Provider id of the overall winner.
 * @param {boolean} multi - Whether more than one provider is being compared.
 * @returns {string} HTML for the summary section, or "" when there are no cards.
 */
function renderSummary(byCorrectness, byLatency, byCost, overallWinner, multi) {
  const providerLine = (id) =>
    `<div class="card-provider">${esc(id)} ${esc(providerLabel(id))}</div>`;
  const buildCard = (label, value, providerHtml) =>
    [
      `<div class="summary-card">`,
      `<div class="card-label">${label}</div>`,
      `<div class="card-value">${value}</div>`,
      providerHtml,
      `</div>`
    ].join("\n");

  const cards = [];
  if (byCorrectness) {
    cards.push(
      buildCard(
        multi ? "Most Correct" : "Avg Correctness",
        `${Math.round(byCorrectness.avg * 100)}%`,
        multi ? providerLine(byCorrectness.id) : ""
      )
    );
  }
  if (byLatency && byLatency.avg !== Infinity) {
    cards.push(
      buildCard(
        multi ? "Fastest" : "Avg Latency",
        `${Math.round(byLatency.avg)}ms`,
        multi ? providerLine(byLatency.id) : ""
      )
    );
  }
  if (byCost?.avg !== void 0) {
    cards.push(
      buildCard(
        multi ? "Cheapest" : "Avg Cost",
        esc(formatCost(byCost.avg)),
        multi ? providerLine(byCost.id) : ""
      )
    );
  }
  if (overallWinner) {
    cards.push(buildCard("Overall Winner", "&#x1F3C6;", providerLine(overallWinner)));
  }

  if (cards.length === 0) {
    return "";
  }
  return [
    `<section class="summary-section">`,
    `<h2 class="summary-title">Summary</h2>`,
    `<div class="summary-cards">`,
    cards.join("\n "),
    `</div>`,
    `</section>`
  ].join("\n");
}
|
|
3241
|
+
/**
 * Build the collapsible "Errors" section. Clicking the title toggles the list
 * through an inline handler.
 *
 * @param {Array<{providerId: string, error: string, count: number, hint?: string}>} errors
 *   Error entries; a repeat count is shown when `count > 1`, and a hint line
 *   when `hint` is set.
 * @returns {string} HTML for the errors `<section>`.
 */
function renderErrors(errors) {
  const renderItem = (entry) => {
    const repeatBadge = entry.count > 1 ? ` <span class="error-count">(&times;${entry.count})</span>` : "";
    const hintLine = entry.hint ? `<div class="error-hint">${esc(entry.hint)}</div>` : "";
    return [
      `<div class="error-item">`,
      `<span class="error-provider">${esc(entry.providerId)}:</span>`,
      `<span class="error-msg">${esc(entry.error)}</span>${repeatBadge}`,
      hintLine,
      `</div>`
    ].join("\n");
  };
  const items = errors.map((e) => renderItem(e)).join("\n");
  return [
    `<section class="errors-section">`,
    `<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>`,
    `<div class="errors-list">`,
    items,
    `</div>`,
    `</section>`
  ].join("\n");
}
|
|
3258
|
+
/**
 * Build the static report footer: a "Powered by" credit and a link to the
 * project's GitHub repository.
 *
 * @returns {string} HTML for the `<footer>` element.
 */
function renderFooter() {
  const footerLines = [
    `<footer class="report-footer">`,
    `<div class="footer-brand">`,
    `Powered by <a href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">Agent Duelist</a>`,
    `</div>`,
    `<a class="footer-cta" href="https://github.com/DataGobes/agent-duelist" target="_blank" rel="noopener">`,
    `&#x2B50; Star on GitHub`,
    `</a>`,
    `</footer>`
  ];
  return footerLines.join("\n");
}
|
|
3268
|
+
/**
 * Build the inline `<script>` that makes the HTML report interactive:
 * task-tab switching (emitted only when there is more than one task) and
 * click-to-sort on every results-table header.
 *
 * @param {number} taskCount - Number of task sections in the report.
 * @returns {string} A complete `<script>…</script>` element.
 */
function renderScript(taskCount) {
  // Tab switching is only needed when several task sections exist.
  const tabSwitching = taskCount > 1 ? `
  var tabs = document.querySelectorAll('.task-tab');
  var sections = document.querySelectorAll('.task-section');
  tabs.forEach(function(tab) {
    tab.addEventListener('click', function() {
      var idx = parseInt(tab.getAttribute('data-task'), 10);
      tabs.forEach(function(t) { t.classList.remove('active'); });
      sections.forEach(function(s) { s.classList.remove('active'); });
      tab.classList.add('active');
      sections[idx].classList.add('active');
    });
  });` : "";
  return `<script>
(function() {
  /* Tab switching */
  ${tabSwitching}

  /* Column sorting */
  document.querySelectorAll('.results-table th').forEach(function(th) {
    var table = th.closest('table');
    /* Use the header's position within its own row. The forEach index counts
       headers across every table in the document, so for any table after the
       first it would point past the row's cells and break sorting. */
    var colIdx = th.cellIndex;
    var asc = true;
    th.addEventListener('click', function() {
      var tbody = table.querySelector('tbody');
      var rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort(function(a, b) {
        var aCell = a.children[colIdx];
        var bCell = b.children[colIdx];
        var aVal = aCell.getAttribute('data-sort-val');
        var bVal = bCell.getAttribute('data-sort-val');
        if (aVal !== null && bVal !== null) {
          return asc ? parseFloat(aVal) - parseFloat(bVal) : parseFloat(bVal) - parseFloat(aVal);
        }
        var aText = aCell.textContent || '';
        var bText = bCell.textContent || '';
        return asc ? aText.localeCompare(bText) : bText.localeCompare(aText);
      });
      rows.forEach(function(row) { tbody.appendChild(row); });

      /* Update sort arrows */
      table.querySelectorAll('th .sort-arrow').forEach(function(a) { a.textContent = ''; });
      th.querySelector('.sort-arrow').textContent = asc ? ' \\u25B2' : ' \\u25BC';
      asc = !asc;
    });
  });
})();
</script>`;
}
|
|
3315
|
+
|
|
3316
|
+
// src/packs/structured-output.ts
|
|
3317
|
+
import { z } from "zod";
|
|
3318
|
+
/**
 * Built-in "structured-output" task pack.
 *
 * Each task pairs a prompt with the exact expected JSON value and a Zod schema
 * describing its shape. The pack is scored with the correctness,
 * schema-correctness, latency and cost scorers.
 */
var structuredOutputPack = {
  name: "structured-output",
  label: "Structured Output",
  description: "Zod schema stress test \u2014 flat objects, nesting, arrays, enums, empty arrays, and adversarial input",
  tasks: [
    // Flat object with string and number fields.
    {
      name: "so:flat-entity",
      prompt: "Extract the person's details from this text: 'Maria Garcia, age 34, works as a software architect in Barcelona, Spain. Her employee ID is EMP-2847.' Return as JSON.",
      expected: {
        name: "Maria Garcia",
        age: 34,
        role: "software architect",
        city: "Barcelona",
        country: "Spain",
        employeeId: "EMP-2847"
      },
      schema: z.object({
        name: z.string(),
        age: z.number(),
        role: z.string(),
        city: z.string(),
        country: z.string(),
        employeeId: z.string()
      })
    },
    // Nested objects, including an enum-constrained field.
    {
      name: "so:nested-object",
      prompt: "Parse this shipping label into structured JSON: 'Ship to: Acme Corp, Attn: John Lee, 4th Floor, 742 Evergreen Terrace, Springfield, IL 62704, USA. Order #ORD-9912, 3 items, 2.4kg, express shipping.' Use shippingMethod values: standard, express, or overnight. Return as JSON.",
      expected: {
        recipient: {
          company: "Acme Corp",
          contact: "John Lee",
          floor: "4th Floor"
        },
        address: {
          street: "742 Evergreen Terrace",
          city: "Springfield",
          state: "IL",
          zip: "62704",
          country: "USA"
        },
        order: {
          id: "ORD-9912",
          itemCount: 3,
          weightKg: 2.4,
          shippingMethod: "express"
        }
      },
      schema: z.object({
        recipient: z.object({
          company: z.string(),
          contact: z.string(),
          floor: z.string()
        }),
        address: z.object({
          street: z.string(),
          city: z.string(),
          state: z.string(),
          zip: z.string(),
          country: z.string()
        }),
        order: z.object({
          id: z.string(),
          itemCount: z.number(),
          weightKg: z.number(),
          shippingMethod: z.enum(["standard", "express", "overnight"])
        })
      })
    },
    // Top-level array of homogeneous objects.
    {
      name: "so:array-of-objects",
      prompt: "Extract all mentioned products with their prices and categories from this text: 'Our summer sale includes the UltraWidget Pro ($49.99, Electronics), ComfortMax Chair ($199.00, Furniture), and AquaPure Filter ($24.50, Home & Kitchen). The SmartLamp Mini is also available at $34.99 in the Electronics category.' Return as a JSON array.",
      expected: [
        { name: "UltraWidget Pro", price: 49.99, category: "Electronics" },
        { name: "ComfortMax Chair", price: 199, category: "Furniture" },
        { name: "AquaPure Filter", price: 24.5, category: "Home & Kitchen" },
        { name: "SmartLamp Mini", price: 34.99, category: "Electronics" }
      ],
      schema: z.array(
        z.object({
          name: z.string(),
          price: z.number(),
          category: z.string()
        })
      )
    },
    // The input contains no errors or warnings, so both arrays must be empty.
    {
      name: "so:empty-arrays",
      prompt: "Extract all error codes and their severity levels from this log message: 'System health check completed at 14:32 UTC. All services operational. No warnings or errors detected. Uptime: 99.97%.' Classify status as one of: healthy, degraded, or down. Return as JSON.",
      expected: {
        errors: [],
        warnings: [],
        status: "healthy",
        uptimePercent: 99.97
      },
      schema: z.object({
        errors: z.array(z.object({ code: z.string(), severity: z.string() })),
        warnings: z.array(z.string()),
        status: z.enum(["healthy", "degraded", "down"]),
        uptimePercent: z.number()
      })
    },
    // Array of objects whose fields are restricted to enum values.
    {
      name: "so:enum-classification",
      prompt: "Classify each of these support tickets by priority (low/medium/high/critical) and category (billing/technical/account/general). Use just the letter (A, B, C, D) as the id.\nTicket A: 'My account was charged twice for the same subscription.'\nTicket B: 'The API returns 500 errors intermittently.'\nTicket C: 'How do I update my display name?'\nTicket D: 'Production database is completely unresponsive, all services down.'\nReturn as a JSON array.",
      expected: [
        { id: "A", priority: "high", category: "billing" },
        { id: "B", priority: "high", category: "technical" },
        { id: "C", priority: "low", category: "account" },
        { id: "D", priority: "critical", category: "technical" }
      ],
      schema: z.array(
        z.object({
          id: z.string(),
          priority: z.enum(["low", "medium", "high", "critical"]),
          category: z.enum(["billing", "technical", "account", "general"])
        })
      )
    },
    // Free text sprinkled with JSON-looking fragments that must be ignored.
    {
      name: "so:adversarial-input",
      prompt: `Extract the actual product review data from this messy input. Ignore any JSON-like noise in the text.

User said: 'I bought the {product: "fake"} headphones for $59.99 and they're great! Rating: 5/5. The "noise-cancelling" feature works well even in {"noisy": true} environments. Would recommend to friend=true. Purchased on 01/15/2026.'
Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
      expected: {
        product: "headphones",
        price: 59.99,
        rating: 5,
        maxRating: 5,
        features: ["noise-cancelling"],
        recommended: true,
        purchaseDate: "2026-01-15"
      },
      schema: z.object({
        product: z.string(),
        price: z.number(),
        rating: z.number(),
        maxRating: z.number(),
        features: z.array(z.string()),
        recommended: z.boolean(),
        purchaseDate: z.string()
      })
    }
  ],
  scorers: ["correctness", "schema-correctness", "latency", "cost"]
};
|
|
3435
|
+
|
|
3436
|
+
// src/packs/index.ts
|
|
3437
|
+
var registry = /* @__PURE__ */ new Map();
|
|
3438
|
+
/**
 * Add a pack to the module-level registry, keyed by its `name`.
 * Registering a second pack with the same name replaces the first.
 *
 * @param {{name: string}} pack - The pack to register.
 */
function register(pack) {
  const packName = pack.name;
  registry.set(packName, pack);
}
|
|
3441
|
+
register(structuredOutputPack);
|
|
3442
|
+
/**
 * Look up a registered pack by name.
 *
 * @param {string} name - Pack name as registered.
 * @returns {object} The registered pack.
 * @throws {Error} When no pack has that name; the message lists the available names.
 */
function loadPack(name) {
  const found = registry.get(name);
  if (found) {
    return found;
  }
  const available = Array.from(registry.keys()).join(", ");
  throw new Error(`Unknown pack "${name}". Available packs: ${available}`);
}
|
|
3450
|
+
/**
 * Summarize every registered pack.
 *
 * @returns {Array<{name: string, label: string, description: string, taskCount: number}>}
 *   One summary per pack, in registration order.
 */
function listPacks() {
  const summaries = [];
  for (const pack of registry.values()) {
    summaries.push({
      name: pack.name,
      label: pack.label,
      description: pack.description,
      taskCount: pack.tasks.length
    });
  }
  return summaries;
}
|
|
3458
|
+
|
|
2517
3459
|
// src/ci.ts
|
|
2518
3460
|
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
2519
3461
|
import { dirname } from "path";
|
|
@@ -2535,10 +3477,11 @@ var T_CRITICAL_95 = {
|
|
|
2535
3477
|
25: 2.06,
|
|
2536
3478
|
30: 2.042
|
|
2537
3479
|
};
|
|
3480
|
+
var T_CRITICAL_KEYS = Object.keys(T_CRITICAL_95).map(Number).sort((a, b) => a - b);
|
|
2538
3481
|
function tCritical(df) {
|
|
2539
3482
|
if (df <= 0) return 1.96;
|
|
2540
3483
|
if (T_CRITICAL_95[df] !== void 0) return T_CRITICAL_95[df];
|
|
2541
|
-
const keys =
|
|
3484
|
+
const keys = T_CRITICAL_KEYS;
|
|
2542
3485
|
if (df > keys[keys.length - 1]) return 1.96;
|
|
2543
3486
|
for (let i = 0; i < keys.length - 1; i++) {
|
|
2544
3487
|
if (df > keys[i] && df < keys[i + 1]) {
|
|
@@ -2648,7 +3591,7 @@ function compareResults(baselineStats, currentStats, thresholds, budget, current
|
|
|
2648
3591
|
if (regressions.length > 0) {
|
|
2649
3592
|
for (const r of regressions) {
|
|
2650
3593
|
failureReasons.push(
|
|
2651
|
-
`${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${
|
|
3594
|
+
`${r.providerId} \xD7 ${r.taskName}: ${r.scorerName} regressed by ${formatDelta(r.delta)}`
|
|
2652
3595
|
);
|
|
2653
3596
|
}
|
|
2654
3597
|
}
|
|
@@ -2683,10 +3626,6 @@ function detectImprovement(baseline, current, threshold, lowerIsBetter) {
|
|
|
2683
3626
|
}
|
|
2684
3627
|
return current.ci95Lower - baseline.ci95Upper > threshold;
|
|
2685
3628
|
}
|
|
2686
|
-
function formatDelta2(delta) {
|
|
2687
|
-
const sign = delta >= 0 ? "+" : "";
|
|
2688
|
-
return `${sign}${delta.toFixed(4)}`;
|
|
2689
|
-
}
|
|
2690
3629
|
function loadBaseline(path) {
|
|
2691
3630
|
try {
|
|
2692
3631
|
const raw = readFileSync(path, "utf-8");
|
|
@@ -2743,18 +3682,20 @@ function detectGitHubContext() {
|
|
|
2743
3682
|
return { token, owner, repo, prNumber };
|
|
2744
3683
|
}
|
|
2745
3684
|
var API_BASE = "https://api.github.com";
|
|
3685
|
+
/**
 * Build the request headers used for GitHub REST API calls.
 *
 * @param {string} token - Token sent as a Bearer credential.
 * @param {Record<string, string>} [extra] - Additional headers; these win over
 *   the defaults when a key collides.
 * @returns {Record<string, string>} A new headers object.
 */
function ghHeaders(token, extra) {
  const defaults = {
    Authorization: `Bearer ${token}`,
    Accept: "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
  };
  // Object.assign ignores an undefined source, matching a spread of `undefined`.
  return Object.assign(defaults, extra);
}
|
|
2746
3693
|
async function findExistingComment(ctx, marker) {
|
|
2747
3694
|
let page = 1;
|
|
2748
3695
|
const perPage = 50;
|
|
2749
3696
|
while (true) {
|
|
2750
3697
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments?per_page=${perPage}&page=${page}`;
|
|
2751
|
-
const res = await fetch(url, {
|
|
2752
|
-
headers: {
|
|
2753
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2754
|
-
Accept: "application/vnd.github+json",
|
|
2755
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2756
|
-
}
|
|
2757
|
-
});
|
|
3698
|
+
const res = await fetch(url, { headers: ghHeaders(ctx.token) });
|
|
2758
3699
|
if (!res.ok) return null;
|
|
2759
3700
|
const comments = await res.json();
|
|
2760
3701
|
if (comments.length === 0) break;
|
|
@@ -2774,12 +3715,7 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
2774
3715
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/comments/${existingId}`;
|
|
2775
3716
|
const res = await fetch(url, {
|
|
2776
3717
|
method: "PATCH",
|
|
2777
|
-
headers: {
|
|
2778
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2779
|
-
Accept: "application/vnd.github+json",
|
|
2780
|
-
"Content-Type": "application/json",
|
|
2781
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2782
|
-
},
|
|
3718
|
+
headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
|
|
2783
3719
|
body: JSON.stringify({ body })
|
|
2784
3720
|
});
|
|
2785
3721
|
if (!res.ok) {
|
|
@@ -2790,12 +3726,7 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
2790
3726
|
const url = `${API_BASE}/repos/${ctx.owner}/${ctx.repo}/issues/${ctx.prNumber}/comments`;
|
|
2791
3727
|
const res = await fetch(url, {
|
|
2792
3728
|
method: "POST",
|
|
2793
|
-
headers: {
|
|
2794
|
-
Authorization: `Bearer ${ctx.token}`,
|
|
2795
|
-
Accept: "application/vnd.github+json",
|
|
2796
|
-
"Content-Type": "application/json",
|
|
2797
|
-
"X-GitHub-Api-Version": "2022-11-28"
|
|
2798
|
-
},
|
|
3729
|
+
headers: ghHeaders(ctx.token, { "Content-Type": "application/json" }),
|
|
2799
3730
|
body: JSON.stringify({ body })
|
|
2800
3731
|
});
|
|
2801
3732
|
if (!res.ok) {
|
|
@@ -2813,8 +3744,11 @@ export {
|
|
|
2813
3744
|
defineArena,
|
|
2814
3745
|
detectGitHubContext,
|
|
2815
3746
|
gemini,
|
|
3747
|
+
htmlReporter,
|
|
2816
3748
|
jsonReporter,
|
|
3749
|
+
listPacks,
|
|
2817
3750
|
loadBaseline,
|
|
3751
|
+
loadPack,
|
|
2818
3752
|
markdownReporter,
|
|
2819
3753
|
openai,
|
|
2820
3754
|
openaiCompatible,
|