agent-duelist 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -6
- package/dist/cli.js +320 -40
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +299 -23
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +299 -23
- package/dist/index.js.map +1 -1
- package/package.json +9 -3
package/dist/index.cjs
CHANGED
|
@@ -1398,6 +1398,27 @@ var costScorer = ({ result }, providerId) => {
|
|
|
1398
1398
|
};
|
|
1399
1399
|
};
|
|
1400
1400
|
|
|
1401
|
+
// src/utils/deep-equal.ts
|
|
1402
|
+
/**
 * Lenient structural comparison of an expected value against an actual one.
 *
 * Rules, in the order they are applied:
 * - identical values (`===`) match;
 * - two strings match after trimming and lower-casing both sides;
 * - values whose `typeof` differs never match;
 * - `null` only matches `null`;
 * - an array only matches another array of the same length whose elements
 *   match pairwise;
 * - a non-array object matches when every own enumerable key of `expected`
 *   is present in `actual` with a matching value (extra keys on `actual`
 *   are ignored);
 * - anything else falls back to `===`.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value under test.
 * @returns {boolean} `true` when `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Identical values already returned above, so a null on either side is a mismatch.
  if (expected === null || actual === null) return expected === actual;
  const expectedIsArray = Array.isArray(expected);
  // Arrays and plain objects both report typeof "object". Without this guard a
  // mixed pair would reach the key-subset branch, where [] "equals" {} and
  // [1] "equals" { 0: 1 }.
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((val, i) => deepEqual(val, actual[i]));
  }
  if (typeof expected === "object") {
    // Subset match: only the keys named by `expected` are checked.
    return Object.keys(expected).every((key) => key in actual && deepEqual(expected[key], actual[key]));
  }
  return expected === actual;
}
|
|
1421
|
+
|
|
1401
1422
|
// src/scorers/correctness.ts
|
|
1402
1423
|
var correctnessScorer = ({ task, result }) => {
|
|
1403
1424
|
if (task.expected === void 0) {
|
|
@@ -1421,25 +1442,6 @@ function normalizeOutput(expected, actual) {
|
|
|
1421
1442
|
}
|
|
1422
1443
|
return actual;
|
|
1423
1444
|
}
|
|
1424
|
-
/**
 * Lenient structural comparison of an expected value against an actual one.
 *
 * Rules, in the order they are applied:
 * - identical values (`===`) match;
 * - two strings match after trimming and lower-casing both sides;
 * - values whose `typeof` differs never match;
 * - `null` only matches `null`;
 * - an array only matches another array of the same length whose elements
 *   match pairwise;
 * - a non-array object matches when every own enumerable key of `expected`
 *   is present in `actual` with a matching value (extra keys on `actual`
 *   are ignored);
 * - anything else falls back to `===`.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value under test.
 * @returns {boolean} `true` when `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Identical values already returned above, so a null on either side is a mismatch.
  if (expected === null || actual === null) return expected === actual;
  const expectedIsArray = Array.isArray(expected);
  // Arrays and plain objects both report typeof "object". Without this guard a
  // mixed pair would reach the key-subset branch, where [] "equals" {} and
  // [1] "equals" { 0: 1 }.
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((val, i) => deepEqual(val, actual[i]));
  }
  if (typeof expected === "object") {
    // Subset match: only the keys named by `expected` are checked.
    return Object.keys(expected).every((key) => key in actual && deepEqual(expected[key], actual[key]));
  }
  return expected === actual;
}
|
|
1443
1445
|
|
|
1444
1446
|
// src/scorers/schema-correctness.ts
|
|
1445
1447
|
var schemaCorrectnessScorer = ({ task, result }) => {
|
|
@@ -1812,15 +1814,54 @@ function parseJudgeResponse(response, model) {
|
|
|
1812
1814
|
|
|
1813
1815
|
// src/scorers/tool-usage.ts
|
|
1814
1816
|
/**
 * Scores whether the model called the tools configured on a task.
 *
 * Returned `value`:
 * - `-1`  the task has no tools configured (scorer not applicable);
 * - when `task.expected` is a non-null, non-array object it is treated as the
 *   expected tool-call arguments:
 *     - `1`    a call to a configured tool has arguments matching `expected`
 *              (per `deepEqual`);
 *     - `0.5`  a call to a configured tool shares at least one argument key
 *              with `expected`, but the arguments do not match;
 *     - `0`    otherwise;
 * - for any other `expected` (including `undefined`): `1` if the first
 *   configured tool was called by name, else `0`.
 *
 * @param {{ task: { tools?: Array<{ name: string }>, expected?: unknown },
 *           result: { toolCalls?: Array<{ name: string, arguments?: unknown }> } }} input
 * @returns {{ name: "tool-usage", value: number, details: object }}
 */
var toolUsageScorer = ({ task, result }) => {
  if (!task.tools?.length) {
    return { name: "tool-usage", value: -1, details: { reason: "no tools configured on task" } };
  }
  const calls = result.toolCalls ?? [];
  const isConfiguredTool = (call) => task.tools.some((t) => t.name === call.name);
  const expectedIsObject = typeof task.expected === "object" && task.expected !== null && !Array.isArray(task.expected);
  if (expectedIsObject) {
    const matchingCall = calls.find((c) => isConfiguredTool(c) && deepEqual(task.expected, c.arguments));
    if (matchingCall) {
      return {
        name: "tool-usage",
        value: 1,
        details: { matchedTool: matchingCall.name, arguments: matchingCall.arguments, toolCalls: calls }
      };
    }
    const expectedKeys = Object.keys(task.expected);
    const partialMatch = calls.find((c) => {
      // Partial credit is reported as "correct tool but wrong arguments", so
      // the call must target a tool that is actually configured on the task.
      if (!isConfiguredTool(c)) return false;
      if (typeof c.arguments !== "object" || c.arguments === null) return false;
      const argKeys = Object.keys(c.arguments);
      return expectedKeys.some((k) => argKeys.includes(k));
    });
    if (partialMatch) {
      return {
        name: "tool-usage",
        value: 0.5,
        details: {
          reason: "correct tool but wrong arguments",
          expected: task.expected,
          actual: partialMatch.arguments,
          toolCalls: calls
        }
      };
    }
    return {
      name: "tool-usage",
      value: 0,
      details: { reason: "no matching tool call", expected: task.expected, toolCalls: calls }
    };
  }
  // No argument expectation to check: fall back to "was the first configured tool called?".
  const expectedToolName = task.tools[0].name;
  const usedTool = calls.some((c) => c.name === expectedToolName);
  return {
    name: "tool-usage",
    value: usedTool ? 1 : 0,
    details: { expectedToolName, usedTool, toolCalls: calls }
  };
};
|
|
1826
1867
|
|
|
@@ -3487,12 +3528,247 @@ Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
|
|
|
3487
3528
|
scorers: ["correctness", "schema-correctness", "latency", "cost"]
|
|
3488
3529
|
};
|
|
3489
3530
|
|
|
3531
|
+
// src/packs/tool-calling.ts
|
|
3532
|
+
var import_zod2 = require("zod");
|
|
3533
|
+
/**
 * Built-in "tool-calling" benchmark pack.
 *
 * Each task supplies a prompt, the tools offered to the model (zod parameter
 * schema plus a canned async handler), and an `expected` value. The pack is
 * scored with the "tool-usage", "latency" and "cost" scorers.
 */
var toolCallingPack = {
  name: "tool-calling",
  label: "Tool Calling",
  description: "Function invocation accuracy \u2014 single calls, complex params, tool selection, parallel calls, and relevance detection",
  tasks: [
    // One tool with a single required argument.
    {
      name: "tc:simple-single-tool",
      prompt: "What's the current weather in Tokyo?",
      tools: [
        {
          name: "getWeather",
          description: "Get current weather for a city",
          parameters: import_zod2.z.object({
            city: import_zod2.z.string(),
            units: import_zod2.z.enum(["celsius", "fahrenheit"]).optional()
          }),
          handler: async (args) => {
            const { city, units } = args;
            return {
              city,
              tempC: 8,
              condition: "cloudy",
              units: units ?? "celsius"
            };
          }
        }
      ],
      expected: { city: "Tokyo" }
    },
    // One tool with five typed arguments that must all be extracted from the prompt.
    {
      name: "tc:complex-params",
      prompt: "Search for Italian restaurants within 2 miles of downtown Portland that are open now and have at least a 4-star rating.",
      tools: [
        {
          name: "searchRestaurants",
          description: "Search for restaurants matching criteria",
          parameters: import_zod2.z.object({
            cuisine: import_zod2.z.string(),
            location: import_zod2.z.string(),
            radiusMiles: import_zod2.z.number(),
            minRating: import_zod2.z.number(),
            openNow: import_zod2.z.boolean()
          }),
          handler: async (_query) => {
            return {
              results: [{ name: "Trattoria Roma", rating: 4.5, distance: 1.2 }]
            };
          }
        }
      ],
      expected: {
        cuisine: "Italian",
        location: "downtown Portland",
        radiusMiles: 2,
        minRating: 4,
        openNow: true
      }
    },
    // Four tools offered; `expected` lists the currency-conversion arguments.
    {
      name: "tc:select-from-many",
      prompt: "Convert 150 USD to Euros.",
      tools: [
        {
          name: "getWeather",
          description: "Get current weather for a city",
          parameters: import_zod2.z.object({ city: import_zod2.z.string() }),
          handler: async () => {
            return { tempC: 20 };
          }
        },
        {
          name: "convertCurrency",
          description: "Convert an amount between currencies",
          parameters: import_zod2.z.object({
            amount: import_zod2.z.number(),
            from: import_zod2.z.string(),
            to: import_zod2.z.string()
          }),
          handler: async (conversion) => {
            const { amount, from, to } = conversion;
            return {
              amount,
              from,
              to,
              result: 138.75,
              rate: 0.925
            };
          }
        },
        {
          name: "translateText",
          description: "Translate text between languages",
          parameters: import_zod2.z.object({ text: import_zod2.z.string(), targetLang: import_zod2.z.string() }),
          handler: async () => {
            return { translated: "" };
          }
        },
        {
          name: "calculateTip",
          description: "Calculate tip amount for a bill",
          parameters: import_zod2.z.object({ billAmount: import_zod2.z.number(), tipPercent: import_zod2.z.number() }),
          handler: async () => {
            return { tip: 0 };
          }
        }
      ],
      expected: { amount: 150, from: "USD", to: "EUR" }
    },
    // Two cities in one prompt; `expected` is a free-text string here, not an argument object.
    {
      name: "tc:parallel-calls",
      prompt: "I'm planning a trip. What's the weather like in both Paris and London right now?",
      tools: [
        {
          name: "getWeather",
          description: "Get current weather for a city",
          parameters: import_zod2.z.object({ city: import_zod2.z.string() }),
          handler: async (args) => {
            const { city } = args;
            const cannedWeather = {
              Paris: { tempC: 12, condition: "partly cloudy" },
              London: { tempC: 9, condition: "rainy" }
            };
            // Any other city gets a generic fallback reading.
            return cannedWeather[city] ?? { tempC: 15, condition: "unknown" };
          }
        }
      ],
      expected: "weather data for Paris and London"
    }
  ],
  scorers: ["tool-usage", "latency", "cost"]
};
|
|
3643
|
+
|
|
3644
|
+
// src/packs/reasoning.ts
|
|
3645
|
+
var import_zod3 = require("zod");
|
|
3646
|
+
/**
 * Built-in "reasoning" benchmark pack.
 *
 * Each task supplies a multi-line prompt, the `expected` answer, and a zod
 * `schema` describing the answer's shape. The pack is scored with the
 * "correctness", "latency" and "cost" scorers.
 */
var reasoningPack = {
  name: "reasoning",
  label: "Reasoning",
  description: "Logic, math, and multi-step thinking \u2014 arithmetic, deduction, data interpretation, critical path, and business rules",
  tasks: [
    // Arithmetic: 210 basic * $49 + 115 pro * $149 = $27,425.
    {
      name: "rs:saas-mrr-calc",
      prompt: [
        "A SaaS company charges $49/month for the basic plan and $149/month for pro.",
        "In Q1 they had 200 basic subscribers and 85 pro subscribers.",
        "In Q2, 15% of basic users upgraded to pro and they gained 40 new basic subscribers.",
        "No one churned. What is the Q2 monthly recurring revenue (MRR)?",
        "Return as JSON with your reasoning and the final MRR number."
      ].join("\n"),
      expected: { mrr: 27425 },
      schema: import_zod3.z.object({
        reasoning: import_zod3.z.string().optional(),
        mrr: import_zod3.z.number()
      })
    },
    // Constraint puzzle with a single consistent assignment.
    {
      name: "rs:logical-deduction",
      prompt: [
        "Five developers \u2014 Alice, Bob, Carol, Dave, and Eve \u2014 each use a different",
        "primary language: Rust, TypeScript, Python, Go, and Java. Given:",
        "1. Alice does not use Python, Java, or Go.",
        "2. Bob uses TypeScript.",
        "3. Carol uses neither Rust nor Go.",
        "4. Dave does not use Java.",
        "5. Eve uses neither Rust, Go, nor Java.",
        "What language does each developer use? Return as JSON."
      ].join("\n"),
      expected: {
        Alice: "Rust",
        Bob: "TypeScript",
        Carol: "Java",
        Dave: "Go",
        Eve: "Python"
      },
      schema: import_zod3.z.object({
        Alice: import_zod3.z.string(),
        Bob: import_zod3.z.string(),
        Carol: import_zod3.z.string(),
        Dave: import_zod3.z.string(),
        Eve: import_zod3.z.string()
      })
    },
    // Table reading: largest quarter-over-quarter increase and the yearly total.
    {
      name: "rs:data-interpretation",
      prompt: [
        "Given this quarterly revenue data:",
        "| Quarter | Revenue | Growth |",
        "|---------|---------|--------|",
        "| Q1 2025 | $2.1M | - |",
        "| Q2 2025 | $2.4M | 14.3% |",
        "| Q3 2025 | $2.2M | -8.3% |",
        "| Q4 2025 | $2.8M | 27.3% |",
        "",
        "Which quarter had the highest absolute revenue increase compared to the previous",
        "quarter? What was the full-year total revenue in millions? Return as JSON."
      ].join("\n"),
      expected: {
        highestGrowthQuarter: "Q4 2025",
        absoluteIncrease: 0.6,
        fullYearRevenue: 9.5
      },
      schema: import_zod3.z.object({
        highestGrowthQuarter: import_zod3.z.string(),
        absoluteIncrease: import_zod3.z.number(),
        fullYearRevenue: import_zod3.z.number()
      })
    },
    // Longest dependency chain: 3 + 8 + 2 + 3 = 16 minutes.
    {
      name: "rs:critical-path",
      prompt: [
        "A deployment pipeline has these stages with dependencies:",
        "- Build (3 min, no dependency)",
        "- Unit tests (5 min, depends on Build)",
        "- Integration tests (8 min, depends on Build)",
        "- Security scan (4 min, depends on Build)",
        "- Staging deploy (2 min, depends on Unit tests AND Integration tests AND Security scan)",
        "- Smoke tests (3 min, depends on Staging deploy)",
        "",
        "Assuming stages run in parallel where possible, what is the total pipeline",
        "duration in minutes? Which stages are on the critical path? Return as JSON."
      ].join("\n"),
      expected: {
        totalMinutes: 16,
        criticalPath: ["Build", "Integration tests", "Staging deploy", "Smoke tests"]
      },
      schema: import_zod3.z.object({
        totalMinutes: import_zod3.z.number(),
        criticalPath: import_zod3.z.array(import_zod3.z.string())
      })
    },
    // Rule application with an override (non-profit flat price) and stacked discounts.
    {
      name: "rs:pricing-rules",
      prompt: [
        "Apply these pricing rules to each customer and return the final price:",
        "Rules:",
        "- Base price: $100",
        "- Enterprise customers (>100 seats): 30% discount",
        "- Annual billing: additional 15% off the discounted price",
        "- Non-profit organizations: flat $50 regardless of other rules",
        "",
        "Customers:",
        "A: 50 seats, monthly billing, for-profit",
        "B: 200 seats, annual billing, for-profit",
        "C: 75 seats, annual billing, non-profit",
        "D: 150 seats, monthly billing, for-profit",
        "",
        "Return as a JSON array with customer id and finalPrice."
      ].join("\n"),
      expected: [
        { id: "A", finalPrice: 100 },
        { id: "B", finalPrice: 59.5 },
        { id: "C", finalPrice: 50 },
        { id: "D", finalPrice: 70 }
      ],
      schema: import_zod3.z.array(
        import_zod3.z.object({
          id: import_zod3.z.string(),
          finalPrice: import_zod3.z.number()
        })
      )
    }
  ],
  scorers: ["correctness", "latency", "cost"]
};
|
|
3763
|
+
|
|
3490
3764
|
// src/packs/index.ts
|
|
3491
3765
|
// Lookup table of benchmark packs, keyed by pack name.
var registry = /* @__PURE__ */ new Map();
/**
 * Stores `pack` in the registry under `pack.name`.
 * A pack registered later under the same name replaces the earlier one.
 *
 * @param {{ name: string }} pack - Pack definition to register.
 */
function register(pack) {
  const { name } = pack;
  registry.set(name, pack);
}
|
|
3495
3769
|
// Register the built-in packs, in this order.
for (const builtinPack of [structuredOutputPack, toolCallingPack, reasoningPack]) {
  register(builtinPack);
}
|
|
3496
3772
|
function loadPack(name) {
|
|
3497
3773
|
const pack = registry.get(name);
|
|
3498
3774
|
if (!pack) {
|