agent-duelist 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1344,6 +1344,27 @@ var costScorer = ({ result }, providerId) => {
1344
1344
  };
1345
1345
  };
1346
1346
 
1347
+ // src/utils/deep-equal.ts
1348
/**
 * Lenient structural comparison of an expected value against an actual one.
 *
 * Rules, in order:
 *  - identical values (`===`) match;
 *  - two strings match when equal after trimming and lower-casing;
 *  - values of different `typeof`, or a `null` against a non-null, never match;
 *  - an array only matches another array of the same length whose elements
 *    match pairwise;
 *  - a plain object matches when every own key of `expected` is an own key of
 *    `actual` with a matching value (extra keys on `actual` are ignored).
 *
 * @param {*} expected - Reference value; for objects, the required subset.
 * @param {*} actual - Value under test.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Two nulls were already accepted by the identity check above.
  if (expected === null || actual === null) return false;
  // Same-typed primitives (or functions) that are not identical cannot match.
  if (typeof expected !== "object") return false;

  // An array must never be compared key-by-key against a plain object:
  // without this guard `[]` would match `{}` via the subset rule below.
  const expectedIsArray = Array.isArray(expected);
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    return expected.length === actual.length && expected.every((val, i) => deepEqual(val, actual[i]));
  }

  // Own-property check so inherited members such as `constructor` or
  // `toString` on `actual` do not count as supplied keys.
  return Object.keys(expected).every(
    (key) => Object.prototype.hasOwnProperty.call(actual, key) && deepEqual(expected[key], actual[key])
  );
}
1367
+
1347
1368
  // src/scorers/correctness.ts
1348
1369
  var correctnessScorer = ({ task, result }) => {
1349
1370
  if (task.expected === void 0) {
@@ -1367,25 +1388,6 @@ function normalizeOutput(expected, actual) {
1367
1388
  }
1368
1389
  return actual;
1369
1390
  }
1370
/**
 * Lenient structural comparison of an expected value against an actual one.
 *
 * Rules, in order:
 *  - identical values (`===`) match;
 *  - two strings match when equal after trimming and lower-casing;
 *  - values of different `typeof`, or a `null` against a non-null, never match;
 *  - an array only matches another array of the same length whose elements
 *    match pairwise;
 *  - a plain object matches when every own key of `expected` is an own key of
 *    `actual` with a matching value (extra keys on `actual` are ignored).
 *
 * @param {*} expected - Reference value; for objects, the required subset.
 * @param {*} actual - Value under test.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Two nulls were already accepted by the identity check above.
  if (expected === null || actual === null) return false;
  // Same-typed primitives (or functions) that are not identical cannot match.
  if (typeof expected !== "object") return false;

  // An array must never be compared key-by-key against a plain object:
  // without this guard `[]` would match `{}` via the subset rule below.
  const expectedIsArray = Array.isArray(expected);
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    return expected.length === actual.length && expected.every((val, i) => deepEqual(val, actual[i]));
  }

  // Own-property check so inherited members such as `constructor` or
  // `toString` on `actual` do not count as supplied keys.
  return Object.keys(expected).every(
    (key) => Object.prototype.hasOwnProperty.call(actual, key) && deepEqual(expected[key], actual[key])
  );
}
1389
1391
 
1390
1392
  // src/scorers/schema-correctness.ts
1391
1393
  var schemaCorrectnessScorer = ({ task, result }) => {
@@ -1758,15 +1760,54 @@ function parseJudgeResponse(response, model) {
1758
1760
 
1759
1761
  // src/scorers/tool-usage.ts
1760
1762
  var toolUsageScorer = ({ task, result }) => {
1761
- const expectedToolName = task.tools?.[0]?.name;
1762
- if (!expectedToolName) {
1763
+ if (!task.tools?.length) {
1763
1764
  return { name: "tool-usage", value: -1, details: { reason: "no tools configured on task" } };
1764
1765
  }
1765
- const usedTool = result.toolCalls?.some((c) => c.name === expectedToolName) ?? false;
1766
+ const calls = result.toolCalls ?? [];
1767
+ const expectedIsObject = task.expected !== void 0 && typeof task.expected === "object" && task.expected !== null && !Array.isArray(task.expected);
1768
+ if (expectedIsObject) {
1769
+ const matchingCall = calls.find((c) => {
1770
+ const toolDef = task.tools.find((t) => t.name === c.name);
1771
+ if (!toolDef) return false;
1772
+ return deepEqual(task.expected, c.arguments);
1773
+ });
1774
+ if (matchingCall) {
1775
+ return {
1776
+ name: "tool-usage",
1777
+ value: 1,
1778
+ details: { matchedTool: matchingCall.name, arguments: matchingCall.arguments, toolCalls: calls }
1779
+ };
1780
+ }
1781
+ const expectedKeys = Object.keys(task.expected);
1782
+ const partialMatch = calls.find((c) => {
1783
+ if (typeof c.arguments !== "object" || c.arguments === null) return false;
1784
+ const argKeys = Object.keys(c.arguments);
1785
+ return expectedKeys.some((k) => argKeys.includes(k));
1786
+ });
1787
+ if (partialMatch) {
1788
+ return {
1789
+ name: "tool-usage",
1790
+ value: 0.5,
1791
+ details: {
1792
+ reason: "correct tool but wrong arguments",
1793
+ expected: task.expected,
1794
+ actual: partialMatch.arguments,
1795
+ toolCalls: calls
1796
+ }
1797
+ };
1798
+ }
1799
+ return {
1800
+ name: "tool-usage",
1801
+ value: 0,
1802
+ details: { reason: "no matching tool call", expected: task.expected, toolCalls: calls }
1803
+ };
1804
+ }
1805
+ const expectedToolName = task.tools[0].name;
1806
+ const usedTool = calls.some((c) => c.name === expectedToolName);
1766
1807
  return {
1767
1808
  name: "tool-usage",
1768
1809
  value: usedTool ? 1 : 0,
1769
- details: { expectedToolName, usedTool, toolCalls: result.toolCalls ?? [] }
1810
+ details: { expectedToolName, usedTool, toolCalls: calls }
1770
1811
  };
1771
1812
  };
1772
1813
 
@@ -3433,12 +3474,247 @@ Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
3433
3474
  scorers: ["correctness", "schema-correctness", "latency", "cost"]
3434
3475
  };
3435
3476
 
3477
+ // src/packs/tool-calling.ts
3478
+ import { z as z2 } from "zod";
3479
/**
 * Built-in benchmark pack exercising tool/function calling.
 *
 * Each task lists the tools offered to the model (zod parameter schema plus a
 * canned mock handler) and an `expected` value. When `expected` is a plain
 * object it holds the arguments the model should pass to a tool; the
 * "tool-usage" scorer compares it against the recorded tool calls.
 */
var toolCallingPack = {
  name: "tool-calling",
  label: "Tool Calling",
  description: "Function invocation accuracy \u2014 single calls, complex params, tool selection, parallel calls, and relevance detection",
  tasks: [
    // One tool, one required argument.
    {
      name: "tc:simple-single-tool",
      prompt: "What's the current weather in Tokyo?",
      tools: [{
        name: "getWeather",
        description: "Get current weather for a city",
        parameters: z2.object({
          city: z2.string(),
          units: z2.enum(["celsius", "fahrenheit"]).optional()
        }),
        handler: async ({ city, units }) => ({
          city,
          tempC: 8,
          condition: "cloudy",
          units: units ?? "celsius"
        })
      }],
      expected: { city: "Tokyo" }
    },
    // One tool, several typed arguments extracted from a single sentence.
    {
      name: "tc:complex-params",
      prompt: "Search for Italian restaurants within 2 miles of downtown Portland that are open now and have at least a 4-star rating.",
      tools: [{
        name: "searchRestaurants",
        description: "Search for restaurants matching criteria",
        parameters: z2.object({
          cuisine: z2.string(),
          location: z2.string(),
          radiusMiles: z2.number(),
          minRating: z2.number(),
          openNow: z2.boolean()
        }),
        handler: async (_args) => ({
          results: [{ name: "Trattoria Roma", rating: 4.5, distance: 1.2 }]
        })
      }],
      expected: {
        cuisine: "Italian",
        location: "downtown Portland",
        radiusMiles: 2,
        minRating: 4,
        openNow: true
      }
    },
    // Four tools offered; only convertCurrency fits the prompt.
    {
      name: "tc:select-from-many",
      prompt: "Convert 150 USD to Euros.",
      tools: [
        {
          name: "getWeather",
          description: "Get current weather for a city",
          parameters: z2.object({ city: z2.string() }),
          handler: async () => ({ tempC: 20 })
        },
        {
          name: "convertCurrency",
          description: "Convert an amount between currencies",
          parameters: z2.object({
            amount: z2.number(),
            from: z2.string(),
            to: z2.string()
          }),
          handler: async ({ amount, from, to }) => ({
            amount,
            from,
            to,
            result: 138.75,
            rate: 0.925
          })
        },
        {
          name: "translateText",
          description: "Translate text between languages",
          parameters: z2.object({ text: z2.string(), targetLang: z2.string() }),
          handler: async () => ({ translated: "" })
        },
        {
          name: "calculateTip",
          description: "Calculate tip amount for a bill",
          parameters: z2.object({ billAmount: z2.number(), tipPercent: z2.number() }),
          handler: async () => ({ tip: 0 })
        }
      ],
      expected: { amount: 150, from: "USD", to: "EUR" }
    },
    // Same tool needed for two cities. `expected` is a string here, so the
    // tool-usage scorer only checks that getWeather was called.
    {
      name: "tc:parallel-calls",
      prompt: "I'm planning a trip. What's the weather like in both Paris and London right now?",
      tools: [{
        name: "getWeather",
        description: "Get current weather for a city",
        parameters: z2.object({ city: z2.string() }),
        handler: async ({ city }) => {
          // A Map (not a plain object) so a model-supplied city such as
          // "constructor" cannot resolve to an inherited Object member.
          const data = new Map([
            ["Paris", { tempC: 12, condition: "partly cloudy" }],
            ["London", { tempC: 9, condition: "rainy" }]
          ]);
          return data.get(city) ?? { tempC: 15, condition: "unknown" };
        }
      }],
      expected: "weather data for Paris and London"
    }
  ],
  scorers: ["tool-usage", "latency", "cost"]
};
3589
+
3590
+ // src/packs/reasoning.ts
3591
+ import { z as z3 } from "zod";
3592
/**
 * Built-in benchmark pack for logic, arithmetic and multi-step reasoning.
 *
 * Every task pairs a prompt with the `expected` JSON answer and a zod
 * `schema` describing the shape of that answer. Multi-line prompts are
 * written as line arrays joined with "\n" so the bundle's indentation can
 * never leak into the prompt text.
 */
var reasoningPack = {
  name: "reasoning",
  label: "Reasoning",
  description: "Logic, math, and multi-step thinking \u2014 arithmetic, deduction, data interpretation, critical path, and business rules",
  tasks: [
    // Q2: 170 + 40 = 210 basic, 85 + 30 = 115 pro -> 210*49 + 115*149 = 27425.
    {
      name: "rs:saas-mrr-calc",
      prompt: [
        "A SaaS company charges $49/month for the basic plan and $149/month for pro.",
        "In Q1 they had 200 basic subscribers and 85 pro subscribers.",
        "In Q2, 15% of basic users upgraded to pro and they gained 40 new basic subscribers.",
        "No one churned. What is the Q2 monthly recurring revenue (MRR)?",
        "Return as JSON with your reasoning and the final MRR number."
      ].join("\n"),
      expected: { mrr: 27425 },
      schema: z3.object({
        reasoning: z3.string().optional(),
        mrr: z3.number()
      })
    },
    // Constraint puzzle with a unique assignment.
    {
      name: "rs:logical-deduction",
      prompt: [
        "Five developers \u2014 Alice, Bob, Carol, Dave, and Eve \u2014 each use a different",
        "primary language: Rust, TypeScript, Python, Go, and Java. Given:",
        "1. Alice does not use Python, Java, or Go.",
        "2. Bob uses TypeScript.",
        "3. Carol uses neither Rust nor Go.",
        "4. Dave does not use Java.",
        "5. Eve uses neither Rust, Go, nor Java.",
        "What language does each developer use? Return as JSON."
      ].join("\n"),
      expected: {
        Alice: "Rust",
        Bob: "TypeScript",
        Carol: "Java",
        Dave: "Go",
        Eve: "Python"
      },
      schema: z3.object({
        Alice: z3.string(),
        Bob: z3.string(),
        Carol: z3.string(),
        Dave: z3.string(),
        Eve: z3.string()
      })
    },
    // Largest quarter-over-quarter jump is Q3 -> Q4 (+0.6M); the year sums to 9.5M.
    {
      name: "rs:data-interpretation",
      prompt: [
        "Given this quarterly revenue data:",
        "| Quarter | Revenue | Growth |",
        "|---------|---------|--------|",
        "| Q1 2025 | $2.1M | - |",
        "| Q2 2025 | $2.4M | 14.3% |",
        "| Q3 2025 | $2.2M | -8.3% |",
        "| Q4 2025 | $2.8M | 27.3% |",
        "",
        "Which quarter had the highest absolute revenue increase compared to the previous",
        "quarter? What was the full-year total revenue in millions? Return as JSON."
      ].join("\n"),
      expected: {
        highestGrowthQuarter: "Q4 2025",
        absoluteIncrease: 0.6,
        fullYearRevenue: 9.5
      },
      schema: z3.object({
        highestGrowthQuarter: z3.string(),
        absoluteIncrease: z3.number(),
        fullYearRevenue: z3.number()
      })
    },
    // Longest chain: Build 3 + Integration tests 8 + Staging deploy 2 + Smoke tests 3 = 16.
    {
      name: "rs:critical-path",
      prompt: [
        "A deployment pipeline has these stages with dependencies:",
        "- Build (3 min, no dependency)",
        "- Unit tests (5 min, depends on Build)",
        "- Integration tests (8 min, depends on Build)",
        "- Security scan (4 min, depends on Build)",
        "- Staging deploy (2 min, depends on Unit tests AND Integration tests AND Security scan)",
        "- Smoke tests (3 min, depends on Staging deploy)",
        "",
        "Assuming stages run in parallel where possible, what is the total pipeline",
        "duration in minutes? Which stages are on the critical path? Return as JSON."
      ].join("\n"),
      expected: {
        totalMinutes: 16,
        criticalPath: ["Build", "Integration tests", "Staging deploy", "Smoke tests"]
      },
      schema: z3.object({
        totalMinutes: z3.number(),
        criticalPath: z3.array(z3.string())
      })
    },
    // A: 100; B: 100 * 0.7 * 0.85 = 59.5; C: flat 50; D: 100 * 0.7 = 70.
    {
      name: "rs:pricing-rules",
      prompt: [
        "Apply these pricing rules to each customer and return the final price:",
        "Rules:",
        "- Base price: $100",
        "- Enterprise customers (>100 seats): 30% discount",
        "- Annual billing: additional 15% off the discounted price",
        "- Non-profit organizations: flat $50 regardless of other rules",
        "",
        "Customers:",
        "A: 50 seats, monthly billing, for-profit",
        "B: 200 seats, annual billing, for-profit",
        "C: 75 seats, annual billing, non-profit",
        "D: 150 seats, monthly billing, for-profit",
        "",
        "Return as a JSON array with customer id and finalPrice."
      ].join("\n"),
      expected: [
        { id: "A", finalPrice: 100 },
        { id: "B", finalPrice: 59.5 },
        { id: "C", finalPrice: 50 },
        { id: "D", finalPrice: 70 }
      ],
      schema: z3.array(z3.object({
        id: z3.string(),
        finalPrice: z3.number()
      }))
    }
  ],
  scorers: ["correctness", "latency", "cost"]
};
3709
+
3436
3710
  // src/packs/index.ts
3437
3711
  var registry = /* @__PURE__ */ new Map();
3438
3712
  function register(pack) {
3439
3713
  registry.set(pack.name, pack);
3440
3714
  }
3441
3715
  register(structuredOutputPack);
3716
+ register(toolCallingPack);
3717
+ register(reasoningPack);
3442
3718
  function loadPack(name) {
3443
3719
  const pack = registry.get(name);
3444
3720
  if (!pack) {