agent-duelist 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -6
- package/dist/cli.js +320 -40
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +299 -23
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +299 -23
- package/dist/index.js.map +1 -1
- package/package.json +9 -3
package/dist/index.js
CHANGED
|
@@ -1344,6 +1344,27 @@ var costScorer = ({ result }, providerId) => {
|
|
|
1344
1344
|
};
|
|
1345
1345
|
};
|
|
1346
1346
|
|
|
1347
|
+
// src/utils/deep-equal.ts
|
|
1348
|
+
/**
 * Lenient structural comparison of an expected value against an actual value.
 *
 * Rules, applied in order:
 * - Identical values (`===`) match.
 * - Two strings match when they are equal after trimming and lower-casing.
 * - Values with different `typeof`, or where exactly one side is `null`, do not match.
 * - An array only matches another array of the same length whose elements
 *   match pairwise; an array never matches a non-array object.
 * - Objects match when every own enumerable key of `expected` is present in
 *   `actual` with a matching value. Extra keys on `actual` are ignored.
 * - Anything else does not match.
 *
 * @param {unknown} expected - Reference value; for objects it decides which keys are checked.
 * @param {unknown} actual - Value under test.
 * @returns {boolean} `true` when `actual` satisfies `expected` under the rules above.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Both being null was already handled by the identity check above.
  if (expected === null || actual === null) return false;
  // `typeof` reports "object" for arrays as well, so an array/non-array pair
  // must be rejected here; otherwise `[]` would "match" `{}` through the
  // key-subset comparison below.
  const expectedIsArray = Array.isArray(expected);
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((val, i) => deepEqual(val, actual[i]));
  }
  if (typeof expected === "object") {
    // Subset match: only the keys listed on `expected` are compared.
    return Object.keys(expected).every((key) => key in actual && deepEqual(expected[key], actual[key]));
  }
  // Same-typed primitives that were not identical.
  return false;
}
|
|
1367
|
+
|
|
1347
1368
|
// src/scorers/correctness.ts
|
|
1348
1369
|
var correctnessScorer = ({ task, result }) => {
|
|
1349
1370
|
if (task.expected === void 0) {
|
|
@@ -1367,25 +1388,6 @@ function normalizeOutput(expected, actual) {
|
|
|
1367
1388
|
}
|
|
1368
1389
|
return actual;
|
|
1369
1390
|
}
|
|
1370
|
-
function deepEqual(expected, actual) {
|
|
1371
|
-
if (expected === actual) return true;
|
|
1372
|
-
if (typeof expected === "string" && typeof actual === "string") {
|
|
1373
|
-
return expected.trim().toLowerCase() === actual.trim().toLowerCase();
|
|
1374
|
-
}
|
|
1375
|
-
if (typeof expected !== typeof actual) return false;
|
|
1376
|
-
if (expected === null || actual === null) return expected === actual;
|
|
1377
|
-
if (Array.isArray(expected) && Array.isArray(actual)) {
|
|
1378
|
-
if (expected.length !== actual.length) return false;
|
|
1379
|
-
return expected.every((val, i) => deepEqual(val, actual[i]));
|
|
1380
|
-
}
|
|
1381
|
-
if (typeof expected === "object" && typeof actual === "object") {
|
|
1382
|
-
const objExpected = expected;
|
|
1383
|
-
const objActual = actual;
|
|
1384
|
-
const keysExpected = Object.keys(objExpected);
|
|
1385
|
-
return keysExpected.every((key) => key in objActual && deepEqual(objExpected[key], objActual[key]));
|
|
1386
|
-
}
|
|
1387
|
-
return expected === actual;
|
|
1388
|
-
}
|
|
1389
1391
|
|
|
1390
1392
|
// src/scorers/schema-correctness.ts
|
|
1391
1393
|
var schemaCorrectnessScorer = ({ task, result }) => {
|
|
@@ -1758,15 +1760,54 @@ function parseJudgeResponse(response, model) {
|
|
|
1758
1760
|
|
|
1759
1761
|
// src/scorers/tool-usage.ts
|
|
1760
1762
|
/**
 * Scores whether the model invoked the tools configured on a task.
 *
 * Outcomes:
 * - The task has no tools: value -1 (scorer not applicable).
 * - `task.expected` is a non-null, non-array object: it is treated as the
 *   expected tool arguments. Only calls whose `name` is one of the task's
 *   configured tools are considered:
 *     - 1   when such a call's arguments match `task.expected` according to
 *           this file's `deepEqual` helper,
 *     - 0.5 when such a call's arguments share at least one key with
 *           `task.expected`,
 *     - 0   otherwise.
 * - Any other `task.expected` (including none): value 1 when the first
 *   configured tool was called at least once, else 0.
 *
 * @param {{ task: object, result: object }} input - `task.tools`, `task.expected`
 *   and `result.toolCalls` are the fields read here.
 * @returns {{ name: string, value: number, details: object }} The score record.
 */
var toolUsageScorer = ({ task, result }) => {
  if (!task.tools?.length) {
    return { name: "tool-usage", value: -1, details: { reason: "no tools configured on task" } };
  }
  const calls = result.toolCalls ?? [];
  const expectedIsObject = typeof task.expected === "object" && task.expected !== null && !Array.isArray(task.expected);
  if (expectedIsObject) {
    // Credit (full or partial) is only given for calls to tools that were
    // actually offered on the task; a call to an unknown tool name cannot be
    // "the correct tool", even if its argument keys happen to overlap.
    const configuredNames = new Set(task.tools.map((t) => t.name));
    const knownCalls = calls.filter((c) => configuredNames.has(c.name));
    const matchingCall = knownCalls.find((c) => deepEqual(task.expected, c.arguments));
    if (matchingCall) {
      return {
        name: "tool-usage",
        value: 1,
        details: { matchedTool: matchingCall.name, arguments: matchingCall.arguments, toolCalls: calls }
      };
    }
    const expectedKeys = Object.keys(task.expected);
    const partialMatch = knownCalls.find((c) => {
      if (typeof c.arguments !== "object" || c.arguments === null) return false;
      const argKeys = Object.keys(c.arguments);
      return expectedKeys.some((k) => argKeys.includes(k));
    });
    if (partialMatch) {
      return {
        name: "tool-usage",
        value: 0.5,
        details: {
          reason: "correct tool but wrong arguments",
          expected: task.expected,
          actual: partialMatch.arguments,
          toolCalls: calls
        }
      };
    }
    return {
      name: "tool-usage",
      value: 0,
      details: { reason: "no matching tool call", expected: task.expected, toolCalls: calls }
    };
  }
  // Without expected arguments, fall back to checking that the first
  // configured tool was invoked at all.
  const expectedToolName = task.tools[0].name;
  const usedTool = calls.some((c) => c.name === expectedToolName);
  return {
    name: "tool-usage",
    value: usedTool ? 1 : 0,
    details: { expectedToolName, usedTool, toolCalls: calls }
  };
};
|
|
1772
1813
|
|
|
@@ -3433,12 +3474,247 @@ Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
|
|
|
3433
3474
|
scorers: ["correctness", "schema-correctness", "latency", "cost"]
|
|
3434
3475
|
};
|
|
3435
3476
|
|
|
3477
|
+
// src/packs/tool-calling.ts
|
|
3478
|
+
import { z as z2 } from "zod";
|
|
3479
|
+
/**
 * Built-in "tool-calling" task pack.
 *
 * Each task offers one or more tools with canned handlers. Where `expected`
 * is an object it lists the arguments the model is expected to pass to a tool;
 * where it is a string it is a free-text description.
 */
var toolCallingPack = {
  name: "tool-calling",
  label: "Tool Calling",
  description: "Function invocation accuracy \u2014 single calls, complex params, tool selection, parallel calls, and relevance detection",
  tasks: [
    // One tool, one required argument.
    {
      name: "tc:simple-single-tool",
      prompt: "What's the current weather in Tokyo?",
      tools: [
        {
          name: "getWeather",
          description: "Get current weather for a city",
          parameters: z2.object({
            city: z2.string(),
            units: z2.enum(["celsius", "fahrenheit"]).optional()
          }),
          // Canned reading; echoes the city and falls back to celsius when
          // no units were supplied.
          handler: async (args) => {
            const { city, units } = args;
            const reportedUnits = units ?? "celsius";
            return { city, tempC: 8, condition: "cloudy", units: reportedUnits };
          }
        }
      ],
      expected: { city: "Tokyo" }
    },

    // One tool, five arguments of mixed types extracted from the prompt.
    {
      name: "tc:complex-params",
      prompt: "Search for Italian restaurants within 2 miles of downtown Portland that are open now and have at least a 4-star rating.",
      tools: [
        {
          name: "searchRestaurants",
          description: "Search for restaurants matching criteria",
          parameters: z2.object({
            cuisine: z2.string(),
            location: z2.string(),
            radiusMiles: z2.number(),
            minRating: z2.number(),
            openNow: z2.boolean()
          }),
          // Always returns the same single result, whatever the arguments.
          handler: async (_args) => {
            return {
              results: [{ name: "Trattoria Roma", rating: 4.5, distance: 1.2 }]
            };
          }
        }
      ],
      expected: {
        cuisine: "Italian",
        location: "downtown Portland",
        radiusMiles: 2,
        minRating: 4,
        openNow: true
      }
    },

    // Four tools offered; the expected arguments belong to convertCurrency.
    {
      name: "tc:select-from-many",
      prompt: "Convert 150 USD to Euros.",
      tools: [
        {
          name: "getWeather",
          description: "Get current weather for a city",
          parameters: z2.object({ city: z2.string() }),
          handler: async () => ({ tempC: 20 })
        },
        {
          name: "convertCurrency",
          description: "Convert an amount between currencies",
          parameters: z2.object({
            amount: z2.number(),
            from: z2.string(),
            to: z2.string()
          }),
          // Echoes the request alongside a fixed result and rate.
          handler: async (args) => {
            const { amount, from, to } = args;
            return { amount, from, to, result: 138.75, rate: 0.925 };
          }
        },
        {
          name: "translateText",
          description: "Translate text between languages",
          parameters: z2.object({ text: z2.string(), targetLang: z2.string() }),
          handler: async () => ({ translated: "" })
        },
        {
          name: "calculateTip",
          description: "Calculate tip amount for a bill",
          parameters: z2.object({ billAmount: z2.number(), tipPercent: z2.number() }),
          handler: async () => ({ tip: 0 })
        }
      ],
      expected: { amount: 150, from: "USD", to: "EUR" }
    },

    // One tool; the prompt asks about two cities.
    {
      name: "tc:parallel-calls",
      prompt: "I'm planning a trip. What's the weather like in both Paris and London right now?",
      tools: [
        {
          name: "getWeather",
          description: "Get current weather for a city",
          parameters: z2.object({ city: z2.string() }),
          // Per-city canned readings with a generic fallback for other cities.
          handler: async (args) => {
            const { city } = args;
            const readingsByCity = {
              Paris: { tempC: 12, condition: "partly cloudy" },
              London: { tempC: 9, condition: "rainy" }
            };
            const reading = readingsByCity[city];
            return reading ?? { tempC: 15, condition: "unknown" };
          }
        }
      ],
      expected: "weather data for Paris and London"
    }
  ],
  scorers: ["tool-usage", "latency", "cost"]
};
|
|
3589
|
+
|
|
3590
|
+
// src/packs/reasoning.ts
|
|
3591
|
+
import { z as z3 } from "zod";
|
|
3592
|
+
/**
 * Built-in "reasoning" task pack: five prompts with fixed expected answers
 * and a zod schema describing the shape of each answer.
 */
var reasoningPack = {
  name: "reasoning",
  label: "Reasoning",
  description: "Logic, math, and multi-step thinking \u2014 arithmetic, deduction, data interpretation, critical path, and business rules",
  tasks: [
    // 15% of 200 basic = 30 upgrades -> 210 basic (incl. 40 new) and 115 pro:
    // 210 * 49 + 115 * 149 = 27425.
    {
      name: "rs:saas-mrr-calc",
      prompt: `A SaaS company charges $49/month for the basic plan and $149/month for pro.
In Q1 they had 200 basic subscribers and 85 pro subscribers.
In Q2, 15% of basic users upgraded to pro and they gained 40 new basic subscribers.
No one churned. What is the Q2 monthly recurring revenue (MRR)?
Return as JSON with your reasoning and the final MRR number.`,
      expected: { mrr: 27425 },
      schema: z3.object({
        reasoning: z3.string().optional(),
        mrr: z3.number()
      })
    },

    // Bob takes TypeScript, which leaves Rust for Alice and Python for Eve;
    // Carol is then left with Java and Dave with Go.
    {
      name: "rs:logical-deduction",
      prompt: `Five developers \u2014 Alice, Bob, Carol, Dave, and Eve \u2014 each use a different
primary language: Rust, TypeScript, Python, Go, and Java. Given:
1. Alice does not use Python, Java, or Go.
2. Bob uses TypeScript.
3. Carol uses neither Rust nor Go.
4. Dave does not use Java.
5. Eve uses neither Rust, Go, nor Java.
What language does each developer use? Return as JSON.`,
      expected: {
        Alice: "Rust",
        Bob: "TypeScript",
        Carol: "Java",
        Dave: "Go",
        Eve: "Python"
      },
      schema: z3.object({
        Alice: z3.string(),
        Bob: z3.string(),
        Carol: z3.string(),
        Dave: z3.string(),
        Eve: z3.string()
      })
    },

    // Quarter-over-quarter changes are +0.3, -0.2 and +0.6; the four quarters
    // sum to 2.1 + 2.4 + 2.2 + 2.8 = 9.5.
    {
      name: "rs:data-interpretation",
      prompt: `Given this quarterly revenue data:
| Quarter | Revenue | Growth |
|---------|---------|--------|
| Q1 2025 | $2.1M | - |
| Q2 2025 | $2.4M | 14.3% |
| Q3 2025 | $2.2M | -8.3% |
| Q4 2025 | $2.8M | 27.3% |

Which quarter had the highest absolute revenue increase compared to the previous
quarter? What was the full-year total revenue in millions? Return as JSON.`,
      expected: {
        highestGrowthQuarter: "Q4 2025",
        absoluteIncrease: 0.6,
        fullYearRevenue: 9.5
      },
      schema: z3.object({
        highestGrowthQuarter: z3.string(),
        absoluteIncrease: z3.number(),
        fullYearRevenue: z3.number()
      })
    },

    // Longest chain: Build (3) + Integration tests (8) + Staging deploy (2)
    // + Smoke tests (3) = 16.
    {
      name: "rs:critical-path",
      prompt: `A deployment pipeline has these stages with dependencies:
- Build (3 min, no dependency)
- Unit tests (5 min, depends on Build)
- Integration tests (8 min, depends on Build)
- Security scan (4 min, depends on Build)
- Staging deploy (2 min, depends on Unit tests AND Integration tests AND Security scan)
- Smoke tests (3 min, depends on Staging deploy)

Assuming stages run in parallel where possible, what is the total pipeline
duration in minutes? Which stages are on the critical path? Return as JSON.`,
      expected: {
        totalMinutes: 16,
        criticalPath: [
          "Build",
          "Integration tests",
          "Staging deploy",
          "Smoke tests"
        ]
      },
      schema: z3.object({
        totalMinutes: z3.number(),
        criticalPath: z3.array(z3.string())
      })
    },

    // A: no rule applies (100). B: 100 * 0.7 * 0.85 = 59.5.
    // C: non-profit flat price (50). D: 100 * 0.7 = 70.
    {
      name: "rs:pricing-rules",
      prompt: `Apply these pricing rules to each customer and return the final price:
Rules:
- Base price: $100
- Enterprise customers (>100 seats): 30% discount
- Annual billing: additional 15% off the discounted price
- Non-profit organizations: flat $50 regardless of other rules

Customers:
A: 50 seats, monthly billing, for-profit
B: 200 seats, annual billing, for-profit
C: 75 seats, annual billing, non-profit
D: 150 seats, monthly billing, for-profit

Return as a JSON array with customer id and finalPrice.`,
      expected: [
        { id: "A", finalPrice: 100 },
        { id: "B", finalPrice: 59.5 },
        { id: "C", finalPrice: 50 },
        { id: "D", finalPrice: 70 }
      ],
      schema: z3.array(
        z3.object({
          id: z3.string(),
          finalPrice: z3.number()
        })
      )
    }
  ],
  scorers: ["correctness", "latency", "cost"]
};
|
|
3709
|
+
|
|
3436
3710
|
// src/packs/index.ts
|
|
3437
3711
|
var registry = /* @__PURE__ */ new Map();
|
|
3438
3712
|
/**
 * Stores a pack in the module-level `registry`, keyed by the pack's `name`.
 *
 * @param {{ name: string }} pack - Pack object to store.
 */
function register(pack) {
  const { name: packName } = pack;
  registry.set(packName, pack);
}
|
|
3441
3715
|
register(structuredOutputPack);
|
|
3716
|
+
register(toolCallingPack);
|
|
3717
|
+
register(reasoningPack);
|
|
3442
3718
|
function loadPack(name) {
|
|
3443
3719
|
const pack = registry.get(name);
|
|
3444
3720
|
if (!pack) {
|