npm - agent-duelist - Versions diffs - 0.3.0 → 0.3.1 - Mend

agent-duelist 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/README.md — CHANGED

@@ -161,9 +161,11 @@ export default defineArena({
 ### Available packs
-| Pack | Tasks | Description |
-|------|-------|-------------|
-| `structured-output` | 6 | Zod schema stress test — flat objects, nesting, arrays, enums, empty arrays, and adversarial input |
+| Pack | Tasks | Scorers | Description |
+|------|-------|---------|-------------|
+| `structured-output` | 6 | correctness, schema-correctness, latency, cost | Zod schema stress test — flat objects, nesting, arrays, enums, empty arrays, and adversarial input |
+| `tool-calling` | 4 | tool-usage, latency, cost | Function invocation accuracy — single calls, complex params, tool selection, and parallel calls |
+| `reasoning` | 5 | correctness, latency, cost | Logic, math, and multi-step thinking — arithmetic, deduction, data interpretation, critical path, and business rules |
 Packs work with both `run` and `ci` commands:
@@ -291,7 +293,7 @@ Scorers turn raw model outputs into **numeric scores** (0–1) with optional det
 | `correctness` | Exact match against `expected` (deep-equal, key-order independent for objects) |
 | `schema-correctness` | Validates output against the task's Zod `schema` via `safeParse()` |
 | `fuzzy-similarity` | Jaccard token-overlap similarity between output and `expected` |
-| `tool-usage` | Whether the model invoked the expected tool(s) during a tool-calling task |
+| `tool-usage` | Tool calling accuracy — checks tool selection and argument correctness (1.0 exact match, 0.5 right tool / wrong args, 0.0 wrong tool) |
 | `llm-judge-correctness` | LLM-as-judge — calls a judge model to score accuracy, completeness, and conciseness |
 Configure them in your arena:
@@ -637,7 +639,7 @@ With cost summary, flakiness warnings, and pass/fail verdict.
 - 5 provider types: OpenAI, Azure OpenAI, Anthropic, Google Gemini, and any OpenAI-compatible gateway
 - 7 built-in scorers including LLM-as-judge, tool-usage, schema validation, and fuzzy similarity
 - Tool-calling support with local handlers for agent task benchmarking
-- **Task packs**: built-in benchmark suites (`structured-output`) — run with `--pack`, no config writing needed
+- **Task packs**: built-in benchmark suites (`structured-output`, `tool-calling`, `reasoning`) — run with `--pack`, no config writing needed
 - Quality-first medal ranking: output quality decides medals, efficiency only breaks ties
 - Fair head-to-head benchmarking with parallel provider execution
 - 4 reporters: console (tables + medals + sparklines), JSON, HTML (sortable, self-contained), and Markdown (PR comments)
@@ -649,7 +651,7 @@ With cost summary, flakiness warnings, and pass/fail verdict.
 **Planned** (subject to community feedback):
-- **More task packs** — reasoning, summarization, tool-calling, and multi-turn conversation packs
+- **More task packs** — summarization, multi-turn conversation, and code generation packs
 - **Agent workflows** — multi-step tool chains, multi-hop reasoning, and agent traces
 - **More export formats** — CSV
 - **Plugin system** — first-class support for user-defined providers and scorers

package/dist/cli.js — CHANGED

@@ -498,7 +498,7 @@ function Pe(e5, t3, s5 = Q.DEFAULT) {
     return p5(), n3.getToken() !== 2 ? k3(7, [2], []) : b3(), true;
   }
   i2(M2, "parseObject");
-  function z3() {
+  function z5() {
     D3(), b3();
     let w4 = true, j2 = false;
     for (; n3.getToken() !== 4 && n3.getToken() !== 17; ) {
@@ -509,11 +509,11 @@ function Pe(e5, t3, s5 = Q.DEFAULT) {
     }
     return L3(), w4 || o7.pop(), n3.getToken() !== 4 ? k3(8, [4], []) : b3(), true;
   }
-  i2(z3, "parseArray");
+  i2(z5, "parseArray");
   function U2() {
     switch (n3.getToken()) {
       case 3:
-        return z3();
+        return z5();
       case 1:
         return M2();
       case 10:
@@ -746,7 +746,7 @@ var init_dist2 = __esm({
     }, "interpolateConfigDir");
     Me = ["outDir", "declarationDir", "outFile", "rootDir", "baseUrl", "tsBuildInfoFile"];
     ze = i2((e5) => {
-      var t3, s5, n3, o7, l3, u5, a7, r3, g2, v4, d5, _4, p5, D3, L3, T3, F4, x, c3, y3, A3, b3, k3, R5, W, V2, M2, z3, U2, w4, j2, S2, $2;
+      var t3, s5, n3, o7, l3, u5, a7, r3, g2, v4, d5, _4, p5, D3, L3, T3, F4, x, c3, y3, A3, b3, k3, R5, W, V2, M2, z5, U2, w4, j2, S2, $2;
       if (e5.strict) {
         const f6 = ["noImplicitAny", "noImplicitThis", "strictNullChecks", "strictFunctionTypes", "strictBindCallApply", "strictPropertyInitialization", "strictBuiltinIteratorReturn", "alwaysStrict", "useUnknownInCatchVariables"];
         for (const B2 of f6) e5[B2] === void 0 && (e5[B2] = true);
@@ -767,7 +767,7 @@ var init_dist2 = __esm({
         let f6 = e5.moduleResolution.toLowerCase();
         f6 === "node" && (f6 = "node10"), e5.moduleResolution = f6, (f6 === "node16" || f6 === "nodenext" || f6 === "bundler") && ((R5 = e5.resolvePackageJsonExports) != null || (e5.resolvePackageJsonExports = true), (W = e5.resolvePackageJsonImports) != null || (e5.resolvePackageJsonImports = true)), f6 === "bundler" && ((V2 = e5.allowSyntheticDefaultImports) != null || (e5.allowSyntheticDefaultImports = true), (M2 = e5.resolveJsonModule) != null || (e5.resolveJsonModule = true));
       }
-      e5.jsx && (e5.jsx = e5.jsx.toLowerCase()), e5.moduleDetection && (e5.moduleDetection = e5.moduleDetection.toLowerCase()), e5.importsNotUsedAsValues && (e5.importsNotUsedAsValues = e5.importsNotUsedAsValues.toLowerCase()), e5.newLine && (e5.newLine = e5.newLine.toLowerCase()), e5.esModuleInterop && ((z3 = e5.allowSyntheticDefaultImports) != null || (e5.allowSyntheticDefaultImports = true)), e5.verbatimModuleSyntax && ((U2 = e5.isolatedModules) != null || (e5.isolatedModules = true), (w4 = e5.preserveConstEnums) != null || (e5.preserveConstEnums = true)), e5.isolatedModules && ((j2 = e5.preserveConstEnums) != null || (e5.preserveConstEnums = true)), e5.rewriteRelativeImportExtensions && ((S2 = e5.allowImportingTsExtensions) != null || (e5.allowImportingTsExtensions = true)), e5.lib && (e5.lib = e5.lib.map((f6) => f6.toLowerCase())), e5.checkJs && (($2 = e5.allowJs) != null || (e5.allowJs = true));
+      e5.jsx && (e5.jsx = e5.jsx.toLowerCase()), e5.moduleDetection && (e5.moduleDetection = e5.moduleDetection.toLowerCase()), e5.importsNotUsedAsValues && (e5.importsNotUsedAsValues = e5.importsNotUsedAsValues.toLowerCase()), e5.newLine && (e5.newLine = e5.newLine.toLowerCase()), e5.esModuleInterop && ((z5 = e5.allowSyntheticDefaultImports) != null || (e5.allowSyntheticDefaultImports = true)), e5.verbatimModuleSyntax && ((U2 = e5.isolatedModules) != null || (e5.isolatedModules = true), (w4 = e5.preserveConstEnums) != null || (e5.preserveConstEnums = true)), e5.isolatedModules && ((j2 = e5.preserveConstEnums) != null || (e5.preserveConstEnums = true)), e5.rewriteRelativeImportExtensions && ((S2 = e5.allowImportingTsExtensions) != null || (e5.allowImportingTsExtensions = true)), e5.lib && (e5.lib = e5.lib.map((f6) => f6.toLowerCase())), e5.checkJs && (($2 = e5.allowJs) != null || (e5.allowJs = true));
     }, "normalizeCompilerOptions");
     pe = i2((e5, t3 = /* @__PURE__ */ new Map()) => {
       const s5 = m3.resolve(e5), n3 = ve(s5, t3), o7 = m3.dirname(s5), { compilerOptions: l3 } = n3;
@@ -3345,14 +3345,14 @@ function fn(s5, e5 = "@") {
               case 32:
                 break;
               case 101: {
-                if (!(d5[400] | 0) && z3(h4) | 0 && !(A3(t3 + 4 | 0, 16, 10) | 0) && ($2(), (b3[804] | 0) == 0)) {
+                if (!(d5[400] | 0) && z5(h4) | 0 && !(A3(t3 + 4 | 0, 16, 10) | 0) && ($2(), (b3[804] | 0) == 0)) {
                   l3 = 9;
                   break e;
                 } else l3 = 17;
                 break;
               }
               case 105: {
-                z3(h4) | 0 && !(A3(t3 + 4 | 0, 26, 10) | 0) && W(), l3 = 17;
+                z5(h4) | 0 && !(A3(t3 + 4 | 0, 26, 10) | 0) && W(), l3 = 17;
                 break;
               }
               case 59: {
@@ -3400,15 +3400,15 @@ function fn(s5, e5 = "@") {
                   case 32:
                     break;
                   case 101: {
-                    !(d5[400] | 0) && z3(a7) | 0 && !(A3(t3 + 4 | 0, 16, 10) | 0) && $2(), l3 = 91;
+                    !(d5[400] | 0) && z5(a7) | 0 && !(A3(t3 + 4 | 0, 16, 10) | 0) && $2(), l3 = 91;
                     break;
                   }
                   case 105: {
-                    z3(a7) | 0 && !(A3(t3 + 4 | 0, 26, 10) | 0) && W(), l3 = 91;
+                    z5(a7) | 0 && !(A3(t3 + 4 | 0, 26, 10) | 0) && W(), l3 = 91;
                     break;
                   }
                   case 99: {
-                    z3(a7) | 0 && !(A3(t3 + 4 | 0, 36, 8) | 0) && P3(d5[t3 + 12 >> 1] | 0) | 0 && (b3[806] = 1), l3 = 91;
+                    z5(a7) | 0 && !(A3(t3 + 4 | 0, 36, 8) | 0) && P3(d5[t3 + 12 >> 1] | 0) | 0 && (b3[806] = 1), l3 = 91;
                     break;
                   }
                   case 40: {
@@ -4437,10 +4437,10 @@ function fn(s5, e5 = "@") {
         return t3 = t3 | 0, (d5[t3 >> 1] | 0) == 46 && (d5[t3 + -2 >> 1] | 0) == 46 ? t3 = (d5[t3 + -4 >> 1] | 0) == 46 : t3 = 0, t3 | 0;
       }
       f2(de3, "G");
-      function z3(t3) {
+      function z5(t3) {
         return t3 = t3 | 0, (r3[3] | 0) == (t3 | 0) ? t3 = 1 : t3 = Oe2(t3 + -2 | 0) | 0, t3 | 0;
       }
-      f2(z3, "H");
+      f2(z5, "H");
       function vt() {
         var t3 = 0;
         return t3 = r3[(r3[62] | 0) + 12 >> 2] | 0, t3 ? t3 = t3 - (r3[3] | 0) >> 1 : t3 = -1, t3 | 0;
@@ -5821,7 +5821,7 @@ import p4 from "path";
 import { fileURLToPath as O4 } from "url";
 import se3, { writeSync as te2 } from "fs";
 import { inspect as oe3 } from "util";
-var K3, o4, R4, D2, me3, N2, j, pe2, y2, C3, de2, E4, ge2, Q4, M, _3, S, A2, T2, Pe3, I4, F3, v3, J3, P2, je3, be2, xe3, k2, $, ye3, Ee, B, G3, _e3, Se3, b2, X3, w2, ve3, z2, we2, Me3, Te3, Fe3, H2, $e3;
+var K3, o4, R4, D2, me3, N2, j, pe2, y2, C3, de2, E4, ge2, Q4, M, _3, S, A2, T2, Pe3, I4, F3, v3, J3, P2, je3, be2, xe3, k2, $, ye3, Ee, B, G3, _e3, Se3, b2, X3, w2, ve3, z4, we2, Me3, Te3, Fe3, H2, $e3;
 var init_register_CFH5oNdT = __esm({
   "node_modules/tsx/dist/register-CFH5oNdT.mjs"() {
     "use strict";
@@ -5995,11 +5995,11 @@ var init_register_CFH5oNdT = __esm({
         throw t3;
       }
     }, "createTsExtensionResolver");
-    z2 = "at cjsPreparseModuleExports (node:internal";
+    z4 = "at cjsPreparseModuleExports (node:internal";
     we2 = o4((s5) => {
       const e5 = s5.stack.split(`
 `).slice(1);
-      return e5[1].includes(z2) || e5[2].includes(z2);
+      return e5[1].includes(z4) || e5[2].includes(z4);
     }, "isFromCjsLexer");
     Me3 = o4((s5, e5) => {
       const a7 = s5.split("?"), n3 = new URLSearchParams(a7[1]);
@@ -7540,6 +7540,27 @@ var costScorer = ({ result }, providerId) => {
   };
 };
+// src/utils/deep-equal.ts
+function deepEqual(expected, actual) {
+  if (expected === actual) return true;
+  if (typeof expected === "string" && typeof actual === "string") {
+    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
+  }
+  if (typeof expected !== typeof actual) return false;
+  if (expected === null || actual === null) return expected === actual;
+  if (Array.isArray(expected) && Array.isArray(actual)) {
+    if (expected.length !== actual.length) return false;
+    return expected.every((val, i7) => deepEqual(val, actual[i7]));
+  }
+  if (typeof expected === "object" && typeof actual === "object") {
+    const objExpected = expected;
+    const objActual = actual;
+    const keysExpected = Object.keys(objExpected);
+    return keysExpected.every((key) => key in objActual && deepEqual(objExpected[key], objActual[key]));
+  }
+  return expected === actual;
+}
 // src/scorers/correctness.ts
 var correctnessScorer = ({ task, result }) => {
   if (task.expected === void 0) {
@@ -7563,25 +7584,6 @@ function normalizeOutput(expected, actual) {
   }
   return actual;
 }
-function deepEqual(expected, actual) {
-  if (expected === actual) return true;
-  if (typeof expected === "string" && typeof actual === "string") {
-    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
-  }
-  if (typeof expected !== typeof actual) return false;
-  if (expected === null || actual === null) return expected === actual;
-  if (Array.isArray(expected) && Array.isArray(actual)) {
-    if (expected.length !== actual.length) return false;
-    return expected.every((val, i7) => deepEqual(val, actual[i7]));
-  }
-  if (typeof expected === "object" && typeof actual === "object") {
-    const objExpected = expected;
-    const objActual = actual;
-    const keysExpected = Object.keys(objExpected);
-    return keysExpected.every((key) => key in objActual && deepEqual(objExpected[key], objActual[key]));
-  }
-  return expected === actual;
-}
 // src/scorers/schema-correctness.ts
 var schemaCorrectnessScorer = ({ task, result }) => {
@@ -7790,15 +7792,54 @@ function parseJudgeResponse(response, model) {
 // src/scorers/tool-usage.ts
 var toolUsageScorer = ({ task, result }) => {
-  const expectedToolName = task.tools?.[0]?.name;
-  if (!expectedToolName) {
+  if (!task.tools?.length) {
     return { name: "tool-usage", value: -1, details: { reason: "no tools configured on task" } };
   }
-  const usedTool = result.toolCalls?.some((c3) => c3.name === expectedToolName) ?? false;
+  const calls = result.toolCalls ?? [];
+  const expectedIsObject = task.expected !== void 0 && typeof task.expected === "object" && task.expected !== null && !Array.isArray(task.expected);
+  if (expectedIsObject) {
+    const matchingCall = calls.find((c3) => {
+      const toolDef = task.tools.find((t3) => t3.name === c3.name);
+      if (!toolDef) return false;
+      return deepEqual(task.expected, c3.arguments);
+    });
+    if (matchingCall) {
+      return {
+        name: "tool-usage",
+        value: 1,
+        details: { matchedTool: matchingCall.name, arguments: matchingCall.arguments, toolCalls: calls }
+      };
+    }
+    const expectedKeys = Object.keys(task.expected);
+    const partialMatch = calls.find((c3) => {
+      if (typeof c3.arguments !== "object" || c3.arguments === null) return false;
+      const argKeys = Object.keys(c3.arguments);
+      return expectedKeys.some((k3) => argKeys.includes(k3));
+    });
+    if (partialMatch) {
+      return {
+        name: "tool-usage",
+        value: 0.5,
+        details: {
+          reason: "correct tool but wrong arguments",
+          expected: task.expected,
+          actual: partialMatch.arguments,
+          toolCalls: calls
+        }
+      };
+    }
+    return {
+      name: "tool-usage",
+      value: 0,
+      details: { reason: "no matching tool call", expected: task.expected, toolCalls: calls }
+    };
+  }
+  const expectedToolName = task.tools[0].name;
+  const usedTool = calls.some((c3) => c3.name === expectedToolName);
   return {
     name: "tool-usage",
     value: usedTool ? 1 : 0,
-    details: { expectedToolName, usedTool, toolCalls: result.toolCalls ?? [] }
+    details: { expectedToolName, usedTool, toolCalls: calls }
   };
 };
@@ -9706,12 +9747,247 @@ Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
   scorers: ["correctness", "schema-correctness", "latency", "cost"]
 };
+// src/packs/tool-calling.ts
+import { z as z2 } from "zod";
+var toolCallingPack = {
+  name: "tool-calling",
+  label: "Tool Calling",
+  description: "Function invocation accuracy \u2014 single calls, complex params, tool selection, parallel calls, and relevance detection",
+  tasks: [
+    {
+      name: "tc:simple-single-tool",
+      prompt: "What's the current weather in Tokyo?",
+      tools: [{
+        name: "getWeather",
+        description: "Get current weather for a city",
+        parameters: z2.object({
+          city: z2.string(),
+          units: z2.enum(["celsius", "fahrenheit"]).optional()
+        }),
+        handler: async ({ city, units }) => ({
+          city,
+          tempC: 8,
+          condition: "cloudy",
+          units: units ?? "celsius"
+        })
+      }],
+      expected: { city: "Tokyo" }
+    },
+    {
+      name: "tc:complex-params",
+      prompt: "Search for Italian restaurants within 2 miles of downtown Portland that are open now and have at least a 4-star rating.",
+      tools: [{
+        name: "searchRestaurants",
+        description: "Search for restaurants matching criteria",
+        parameters: z2.object({
+          cuisine: z2.string(),
+          location: z2.string(),
+          radiusMiles: z2.number(),
+          minRating: z2.number(),
+          openNow: z2.boolean()
+        }),
+        handler: async (_args) => ({
+          results: [{ name: "Trattoria Roma", rating: 4.5, distance: 1.2 }]
+        })
+      }],
+      expected: {
+        cuisine: "Italian",
+        location: "downtown Portland",
+        radiusMiles: 2,
+        minRating: 4,
+        openNow: true
+      }
+    },
+    {
+      name: "tc:select-from-many",
+      prompt: "Convert 150 USD to Euros.",
+      tools: [
+        {
+          name: "getWeather",
+          description: "Get current weather for a city",
+          parameters: z2.object({ city: z2.string() }),
+          handler: async () => ({ tempC: 20 })
+        },
+        {
+          name: "convertCurrency",
+          description: "Convert an amount between currencies",
+          parameters: z2.object({
+            amount: z2.number(),
+            from: z2.string(),
+            to: z2.string()
+          }),
+          handler: async ({ amount, from, to }) => ({
+            amount,
+            from,
+            to,
+            result: 138.75,
+            rate: 0.925
+          })
+        },
+        {
+          name: "translateText",
+          description: "Translate text between languages",
+          parameters: z2.object({ text: z2.string(), targetLang: z2.string() }),
+          handler: async () => ({ translated: "" })
+        },
+        {
+          name: "calculateTip",
+          description: "Calculate tip amount for a bill",
+          parameters: z2.object({ billAmount: z2.number(), tipPercent: z2.number() }),
+          handler: async () => ({ tip: 0 })
+        }
+      ],
+      expected: { amount: 150, from: "USD", to: "EUR" }
+    },
+    {
+      name: "tc:parallel-calls",
+      prompt: "I'm planning a trip. What's the weather like in both Paris and London right now?",
+      tools: [{
+        name: "getWeather",
+        description: "Get current weather for a city",
+        parameters: z2.object({ city: z2.string() }),
+        handler: async ({ city }) => {
+          const data = {
+            Paris: { tempC: 12, condition: "partly cloudy" },
+            London: { tempC: 9, condition: "rainy" }
+          };
+          return data[city] ?? { tempC: 15, condition: "unknown" };
+        }
+      }],
+      expected: "weather data for Paris and London"
+    }
+  ],
+  scorers: ["tool-usage", "latency", "cost"]
+};
+// src/packs/reasoning.ts
+import { z as z3 } from "zod";
+var reasoningPack = {
+  name: "reasoning",
+  label: "Reasoning",
+  description: "Logic, math, and multi-step thinking \u2014 arithmetic, deduction, data interpretation, critical path, and business rules",
+  tasks: [
+    {
+      name: "rs:saas-mrr-calc",
+      prompt: `A SaaS company charges $49/month for the basic plan and $149/month for pro.
+In Q1 they had 200 basic subscribers and 85 pro subscribers.
+In Q2, 15% of basic users upgraded to pro and they gained 40 new basic subscribers.
+No one churned. What is the Q2 monthly recurring revenue (MRR)?
+Return as JSON with your reasoning and the final MRR number.`,
+      expected: { mrr: 27425 },
+      schema: z3.object({
+        reasoning: z3.string().optional(),
+        mrr: z3.number()
+      })
+    },
+    {
+      name: "rs:logical-deduction",
+      prompt: `Five developers \u2014 Alice, Bob, Carol, Dave, and Eve \u2014 each use a different
+primary language: Rust, TypeScript, Python, Go, and Java. Given:
+1. Alice does not use Python, Java, or Go.
+2. Bob uses TypeScript.
+3. Carol uses neither Rust nor Go.
+4. Dave does not use Java.
+5. Eve uses neither Rust, Go, nor Java.
+What language does each developer use? Return as JSON.`,
+      expected: {
+        Alice: "Rust",
+        Bob: "TypeScript",
+        Carol: "Java",
+        Dave: "Go",
+        Eve: "Python"
+      },
+      schema: z3.object({
+        Alice: z3.string(),
+        Bob: z3.string(),
+        Carol: z3.string(),
+        Dave: z3.string(),
+        Eve: z3.string()
+      })
+    },
+    {
+      name: "rs:data-interpretation",
+      prompt: `Given this quarterly revenue data:
+| Quarter | Revenue | Growth |
+|---------|---------|--------|
+| Q1 2025 | $2.1M   | -      |
+| Q2 2025 | $2.4M   | 14.3%  |
+| Q3 2025 | $2.2M   | -8.3%  |
+| Q4 2025 | $2.8M   | 27.3%  |
+Which quarter had the highest absolute revenue increase compared to the previous
+quarter? What was the full-year total revenue in millions? Return as JSON.`,
+      expected: {
+        highestGrowthQuarter: "Q4 2025",
+        absoluteIncrease: 0.6,
+        fullYearRevenue: 9.5
+      },
+      schema: z3.object({
+        highestGrowthQuarter: z3.string(),
+        absoluteIncrease: z3.number(),
+        fullYearRevenue: z3.number()
+      })
+    },
+    {
+      name: "rs:critical-path",
+      prompt: `A deployment pipeline has these stages with dependencies:
+- Build (3 min, no dependency)
+- Unit tests (5 min, depends on Build)
+- Integration tests (8 min, depends on Build)
+- Security scan (4 min, depends on Build)
+- Staging deploy (2 min, depends on Unit tests AND Integration tests AND Security scan)
+- Smoke tests (3 min, depends on Staging deploy)
+Assuming stages run in parallel where possible, what is the total pipeline
+duration in minutes? Which stages are on the critical path? Return as JSON.`,
+      expected: {
+        totalMinutes: 16,
+        criticalPath: ["Build", "Integration tests", "Staging deploy", "Smoke tests"]
+      },
+      schema: z3.object({
+        totalMinutes: z3.number(),
+        criticalPath: z3.array(z3.string())
+      })
+    },
+    {
+      name: "rs:pricing-rules",
+      prompt: `Apply these pricing rules to each customer and return the final price:
+Rules:
+- Base price: $100
+- Enterprise customers (>100 seats): 30% discount
+- Annual billing: additional 15% off the discounted price
+- Non-profit organizations: flat $50 regardless of other rules
+Customers:
+A: 50 seats, monthly billing, for-profit
+B: 200 seats, annual billing, for-profit
+C: 75 seats, annual billing, non-profit
+D: 150 seats, monthly billing, for-profit
+Return as a JSON array with customer id and finalPrice.`,
+      expected: [
+        { id: "A", finalPrice: 100 },
+        { id: "B", finalPrice: 59.5 },
+        { id: "C", finalPrice: 50 },
+        { id: "D", finalPrice: 70 }
+      ],
+      schema: z3.array(z3.object({
+        id: z3.string(),
+        finalPrice: z3.number()
+      }))
+    }
+  ],
+  scorers: ["correctness", "latency", "cost"]
+};
 // src/packs/index.ts
 var registry = /* @__PURE__ */ new Map();
 function register(pack) {
   registry.set(pack.name, pack);
 }
 register(structuredOutputPack);
+register(toolCallingPack);
+register(reasoningPack);
 function loadPack(name) {
   const pack = registry.get(name);
   if (!pack) {
@@ -9893,11 +10169,15 @@ function printPackList() {
     console.log("No packs available.");
     return;
   }
+  const nameWidth = Math.max(...packs.map((p5) => p5.name.length)) + 2;
   console.log("Available task packs:\n");
   for (const p5 of packs) {
-    console.log(`  ${p5.name.padEnd(24)} ${p5.description} (${p5.taskCount} tasks)`);
+    const tasks = `${p5.taskCount} tasks`;
+    console.log(`  ${p5.name.padEnd(nameWidth)} ${tasks.padEnd(9)} ${p5.description}`);
   }
-  console.log("\nUsage: npx duelist run --pack <name>");
+  console.log(`
+Run:     npx duelist run --pack <name>`);
+  console.log(`Combine: npx duelist run --pack structured-output,tool-calling`);
 }
 async function loadArenaWithPacks(packNames, configOpt) {
   const configPath = resolve(configOpt);