agent-duelist 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -6
- package/dist/cli.js +320 -40
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +299 -23
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +299 -23
- package/dist/index.js.map +1 -1
- package/package.json +9 -3
package/README.md
CHANGED
|
@@ -161,9 +161,11 @@ export default defineArena({
|
|
|
161
161
|
|
|
162
162
|
### Available packs
|
|
163
163
|
|
|
164
|
-
| Pack | Tasks | Description |
|
|
165
|
-
|
|
166
|
-
| `structured-output` | 6 | Zod schema stress test — flat objects, nesting, arrays, enums, empty arrays, and adversarial input |
|
|
164
|
+
| Pack | Tasks | Scorers | Description |
|
|
165
|
+
|------|-------|---------|-------------|
|
|
166
|
+
| `structured-output` | 6 | correctness, schema-correctness, latency, cost | Zod schema stress test — flat objects, nesting, arrays, enums, empty arrays, and adversarial input |
|
|
167
|
+
| `tool-calling` | 4 | tool-usage, latency, cost | Function invocation accuracy — single calls, complex params, tool selection, and parallel calls |
|
|
168
|
+
| `reasoning` | 5 | correctness, latency, cost | Logic, math, and multi-step thinking — arithmetic, deduction, data interpretation, critical path, and business rules |
|
|
167
169
|
|
|
168
170
|
Packs work with both `run` and `ci` commands:
|
|
169
171
|
|
|
@@ -291,7 +293,7 @@ Scorers turn raw model outputs into **numeric scores** (0–1) with optional det
|
|
|
291
293
|
| `correctness` | Exact match against `expected` (deep-equal, key-order independent for objects) |
|
|
292
294
|
| `schema-correctness` | Validates output against the task's Zod `schema` via `safeParse()` |
|
|
293
295
|
| `fuzzy-similarity` | Jaccard token-overlap similarity between output and `expected` |
|
|
294
|
-
| `tool-usage` |
|
|
296
|
+
| `tool-usage` | Tool calling accuracy — checks tool selection and argument correctness (1.0 exact match, 0.5 right tool / wrong args, 0.0 wrong tool) |
|
|
295
297
|
| `llm-judge-correctness` | LLM-as-judge — calls a judge model to score accuracy, completeness, and conciseness |
|
|
296
298
|
|
|
297
299
|
Configure them in your arena:
|
|
@@ -637,7 +639,7 @@ With cost summary, flakiness warnings, and pass/fail verdict.
|
|
|
637
639
|
- 5 provider types: OpenAI, Azure OpenAI, Anthropic, Google Gemini, and any OpenAI-compatible gateway
|
|
638
640
|
- 7 built-in scorers including LLM-as-judge, tool-usage, schema validation, and fuzzy similarity
|
|
639
641
|
- Tool-calling support with local handlers for agent task benchmarking
|
|
640
|
-
- **Task packs**: built-in benchmark suites (`structured-output`) — run with `--pack`, no config writing needed
|
|
642
|
+
- **Task packs**: built-in benchmark suites (`structured-output`, `tool-calling`, `reasoning`) — run with `--pack`, no config writing needed
|
|
641
643
|
- Quality-first medal ranking: output quality decides medals, efficiency only breaks ties
|
|
642
644
|
- Fair head-to-head benchmarking with parallel provider execution
|
|
643
645
|
- 4 reporters: console (tables + medals + sparklines), JSON, HTML (sortable, self-contained), and Markdown (PR comments)
|
|
@@ -649,7 +651,7 @@ With cost summary, flakiness warnings, and pass/fail verdict.
|
|
|
649
651
|
|
|
650
652
|
**Planned** (subject to community feedback):
|
|
651
653
|
|
|
652
|
-
- **More task packs** —
|
|
654
|
+
- **More task packs** — summarization, multi-turn conversation, and code generation packs
|
|
653
655
|
- **Agent workflows** — multi-step tool chains, multi-hop reasoning, and agent traces
|
|
654
656
|
- **More export formats** — CSV
|
|
655
657
|
- **Plugin system** — first-class support for user-defined providers and scorers
|
package/dist/cli.js
CHANGED
|
@@ -498,7 +498,7 @@ function Pe(e5, t3, s5 = Q.DEFAULT) {
|
|
|
498
498
|
return p5(), n3.getToken() !== 2 ? k3(7, [2], []) : b3(), true;
|
|
499
499
|
}
|
|
500
500
|
i2(M2, "parseObject");
|
|
501
|
-
function
|
|
501
|
+
function z5() {
|
|
502
502
|
D3(), b3();
|
|
503
503
|
let w4 = true, j2 = false;
|
|
504
504
|
for (; n3.getToken() !== 4 && n3.getToken() !== 17; ) {
|
|
@@ -509,11 +509,11 @@ function Pe(e5, t3, s5 = Q.DEFAULT) {
|
|
|
509
509
|
}
|
|
510
510
|
return L3(), w4 || o7.pop(), n3.getToken() !== 4 ? k3(8, [4], []) : b3(), true;
|
|
511
511
|
}
|
|
512
|
-
i2(
|
|
512
|
+
i2(z5, "parseArray");
|
|
513
513
|
function U2() {
|
|
514
514
|
switch (n3.getToken()) {
|
|
515
515
|
case 3:
|
|
516
|
-
return
|
|
516
|
+
return z5();
|
|
517
517
|
case 1:
|
|
518
518
|
return M2();
|
|
519
519
|
case 10:
|
|
@@ -746,7 +746,7 @@ var init_dist2 = __esm({
|
|
|
746
746
|
}, "interpolateConfigDir");
|
|
747
747
|
Me = ["outDir", "declarationDir", "outFile", "rootDir", "baseUrl", "tsBuildInfoFile"];
|
|
748
748
|
ze = i2((e5) => {
|
|
749
|
-
var t3, s5, n3, o7, l3, u5, a7, r3, g2, v4, d5, _4, p5, D3, L3, T3, F4, x, c3, y3, A3, b3, k3, R5, W, V2, M2,
|
|
749
|
+
var t3, s5, n3, o7, l3, u5, a7, r3, g2, v4, d5, _4, p5, D3, L3, T3, F4, x, c3, y3, A3, b3, k3, R5, W, V2, M2, z5, U2, w4, j2, S2, $2;
|
|
750
750
|
if (e5.strict) {
|
|
751
751
|
const f6 = ["noImplicitAny", "noImplicitThis", "strictNullChecks", "strictFunctionTypes", "strictBindCallApply", "strictPropertyInitialization", "strictBuiltinIteratorReturn", "alwaysStrict", "useUnknownInCatchVariables"];
|
|
752
752
|
for (const B2 of f6) e5[B2] === void 0 && (e5[B2] = true);
|
|
@@ -767,7 +767,7 @@ var init_dist2 = __esm({
|
|
|
767
767
|
let f6 = e5.moduleResolution.toLowerCase();
|
|
768
768
|
f6 === "node" && (f6 = "node10"), e5.moduleResolution = f6, (f6 === "node16" || f6 === "nodenext" || f6 === "bundler") && ((R5 = e5.resolvePackageJsonExports) != null || (e5.resolvePackageJsonExports = true), (W = e5.resolvePackageJsonImports) != null || (e5.resolvePackageJsonImports = true)), f6 === "bundler" && ((V2 = e5.allowSyntheticDefaultImports) != null || (e5.allowSyntheticDefaultImports = true), (M2 = e5.resolveJsonModule) != null || (e5.resolveJsonModule = true));
|
|
769
769
|
}
|
|
770
|
-
e5.jsx && (e5.jsx = e5.jsx.toLowerCase()), e5.moduleDetection && (e5.moduleDetection = e5.moduleDetection.toLowerCase()), e5.importsNotUsedAsValues && (e5.importsNotUsedAsValues = e5.importsNotUsedAsValues.toLowerCase()), e5.newLine && (e5.newLine = e5.newLine.toLowerCase()), e5.esModuleInterop && ((
|
|
770
|
+
e5.jsx && (e5.jsx = e5.jsx.toLowerCase()), e5.moduleDetection && (e5.moduleDetection = e5.moduleDetection.toLowerCase()), e5.importsNotUsedAsValues && (e5.importsNotUsedAsValues = e5.importsNotUsedAsValues.toLowerCase()), e5.newLine && (e5.newLine = e5.newLine.toLowerCase()), e5.esModuleInterop && ((z5 = e5.allowSyntheticDefaultImports) != null || (e5.allowSyntheticDefaultImports = true)), e5.verbatimModuleSyntax && ((U2 = e5.isolatedModules) != null || (e5.isolatedModules = true), (w4 = e5.preserveConstEnums) != null || (e5.preserveConstEnums = true)), e5.isolatedModules && ((j2 = e5.preserveConstEnums) != null || (e5.preserveConstEnums = true)), e5.rewriteRelativeImportExtensions && ((S2 = e5.allowImportingTsExtensions) != null || (e5.allowImportingTsExtensions = true)), e5.lib && (e5.lib = e5.lib.map((f6) => f6.toLowerCase())), e5.checkJs && (($2 = e5.allowJs) != null || (e5.allowJs = true));
|
|
771
771
|
}, "normalizeCompilerOptions");
|
|
772
772
|
pe = i2((e5, t3 = /* @__PURE__ */ new Map()) => {
|
|
773
773
|
const s5 = m3.resolve(e5), n3 = ve(s5, t3), o7 = m3.dirname(s5), { compilerOptions: l3 } = n3;
|
|
@@ -3345,14 +3345,14 @@ function fn(s5, e5 = "@") {
|
|
|
3345
3345
|
case 32:
|
|
3346
3346
|
break;
|
|
3347
3347
|
case 101: {
|
|
3348
|
-
if (!(d5[400] | 0) &&
|
|
3348
|
+
if (!(d5[400] | 0) && z5(h4) | 0 && !(A3(t3 + 4 | 0, 16, 10) | 0) && ($2(), (b3[804] | 0) == 0)) {
|
|
3349
3349
|
l3 = 9;
|
|
3350
3350
|
break e;
|
|
3351
3351
|
} else l3 = 17;
|
|
3352
3352
|
break;
|
|
3353
3353
|
}
|
|
3354
3354
|
case 105: {
|
|
3355
|
-
|
|
3355
|
+
z5(h4) | 0 && !(A3(t3 + 4 | 0, 26, 10) | 0) && W(), l3 = 17;
|
|
3356
3356
|
break;
|
|
3357
3357
|
}
|
|
3358
3358
|
case 59: {
|
|
@@ -3400,15 +3400,15 @@ function fn(s5, e5 = "@") {
|
|
|
3400
3400
|
case 32:
|
|
3401
3401
|
break;
|
|
3402
3402
|
case 101: {
|
|
3403
|
-
!(d5[400] | 0) &&
|
|
3403
|
+
!(d5[400] | 0) && z5(a7) | 0 && !(A3(t3 + 4 | 0, 16, 10) | 0) && $2(), l3 = 91;
|
|
3404
3404
|
break;
|
|
3405
3405
|
}
|
|
3406
3406
|
case 105: {
|
|
3407
|
-
|
|
3407
|
+
z5(a7) | 0 && !(A3(t3 + 4 | 0, 26, 10) | 0) && W(), l3 = 91;
|
|
3408
3408
|
break;
|
|
3409
3409
|
}
|
|
3410
3410
|
case 99: {
|
|
3411
|
-
|
|
3411
|
+
z5(a7) | 0 && !(A3(t3 + 4 | 0, 36, 8) | 0) && P3(d5[t3 + 12 >> 1] | 0) | 0 && (b3[806] = 1), l3 = 91;
|
|
3412
3412
|
break;
|
|
3413
3413
|
}
|
|
3414
3414
|
case 40: {
|
|
@@ -4437,10 +4437,10 @@ function fn(s5, e5 = "@") {
|
|
|
4437
4437
|
return t3 = t3 | 0, (d5[t3 >> 1] | 0) == 46 && (d5[t3 + -2 >> 1] | 0) == 46 ? t3 = (d5[t3 + -4 >> 1] | 0) == 46 : t3 = 0, t3 | 0;
|
|
4438
4438
|
}
|
|
4439
4439
|
f2(de3, "G");
|
|
4440
|
-
function
|
|
4440
|
+
function z5(t3) {
|
|
4441
4441
|
return t3 = t3 | 0, (r3[3] | 0) == (t3 | 0) ? t3 = 1 : t3 = Oe2(t3 + -2 | 0) | 0, t3 | 0;
|
|
4442
4442
|
}
|
|
4443
|
-
f2(
|
|
4443
|
+
f2(z5, "H");
|
|
4444
4444
|
function vt() {
|
|
4445
4445
|
var t3 = 0;
|
|
4446
4446
|
return t3 = r3[(r3[62] | 0) + 12 >> 2] | 0, t3 ? t3 = t3 - (r3[3] | 0) >> 1 : t3 = -1, t3 | 0;
|
|
@@ -5821,7 +5821,7 @@ import p4 from "path";
|
|
|
5821
5821
|
import { fileURLToPath as O4 } from "url";
|
|
5822
5822
|
import se3, { writeSync as te2 } from "fs";
|
|
5823
5823
|
import { inspect as oe3 } from "util";
|
|
5824
|
-
var K3, o4, R4, D2, me3, N2, j, pe2, y2, C3, de2, E4, ge2, Q4, M, _3, S, A2, T2, Pe3, I4, F3, v3, J3, P2, je3, be2, xe3, k2, $, ye3, Ee, B, G3, _e3, Se3, b2, X3, w2, ve3,
|
|
5824
|
+
var K3, o4, R4, D2, me3, N2, j, pe2, y2, C3, de2, E4, ge2, Q4, M, _3, S, A2, T2, Pe3, I4, F3, v3, J3, P2, je3, be2, xe3, k2, $, ye3, Ee, B, G3, _e3, Se3, b2, X3, w2, ve3, z4, we2, Me3, Te3, Fe3, H2, $e3;
|
|
5825
5825
|
var init_register_CFH5oNdT = __esm({
|
|
5826
5826
|
"node_modules/tsx/dist/register-CFH5oNdT.mjs"() {
|
|
5827
5827
|
"use strict";
|
|
@@ -5995,11 +5995,11 @@ var init_register_CFH5oNdT = __esm({
|
|
|
5995
5995
|
throw t3;
|
|
5996
5996
|
}
|
|
5997
5997
|
}, "createTsExtensionResolver");
|
|
5998
|
-
|
|
5998
|
+
z4 = "at cjsPreparseModuleExports (node:internal";
|
|
5999
5999
|
we2 = o4((s5) => {
|
|
6000
6000
|
const e5 = s5.stack.split(`
|
|
6001
6001
|
`).slice(1);
|
|
6002
|
-
return e5[1].includes(
|
|
6002
|
+
return e5[1].includes(z4) || e5[2].includes(z4);
|
|
6003
6003
|
}, "isFromCjsLexer");
|
|
6004
6004
|
Me3 = o4((s5, e5) => {
|
|
6005
6005
|
const a7 = s5.split("?"), n3 = new URLSearchParams(a7[1]);
|
|
@@ -7540,6 +7540,27 @@ var costScorer = ({ result }, providerId) => {
|
|
|
7540
7540
|
};
|
|
7541
7541
|
};
|
|
7542
7542
|
|
|
7543
|
+
// src/utils/deep-equal.ts
|
|
7544
|
+
/**
 * Lenient structural comparison used by the scorers to match a model's
 * output against a task's `expected` value.
 *
 * Rules, applied in order:
 * - identical values (`===`) match;
 * - two strings match after trimming and lower-casing both sides;
 * - values whose `typeof` differs never match;
 * - `null` only matches `null`;
 * - an array only matches an array of the same length, element by element;
 * - a plain object matches when every own key of `expected` is an own key
 *   of `actual` with a matching value. Extra keys on `actual` are ignored,
 *   so key order is irrelevant and outputs may carry additional fields.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value under test.
 * @returns {boolean} `true` when `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // The identical case (including null/null) was handled by the first check.
  if (expected === null || actual === null) return false;

  // An array must only ever be compared with another array; otherwise
  // `[]` would be satisfied by any object and `["a"]` by `{ 0: "a" }`.
  const expectedIsArray = Array.isArray(expected);
  if (expectedIsArray !== Array.isArray(actual)) return false;
  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((val, index) => deepEqual(val, actual[index]));
  }

  if (typeof expected === "object") {
    // Own-property check: `in` would also accept inherited members such as
    // `constructor` or `toString`.
    const hasOwn = Object.prototype.hasOwnProperty;
    return Object.keys(expected).every(
      (key) => hasOwn.call(actual, key) && deepEqual(expected[key], actual[key])
    );
  }

  // Same-typed primitives that were not strictly equal.
  return false;
}
|
|
7563
|
+
|
|
7543
7564
|
// src/scorers/correctness.ts
|
|
7544
7565
|
var correctnessScorer = ({ task, result }) => {
|
|
7545
7566
|
if (task.expected === void 0) {
|
|
@@ -7563,25 +7584,6 @@ function normalizeOutput(expected, actual) {
|
|
|
7563
7584
|
}
|
|
7564
7585
|
return actual;
|
|
7565
7586
|
}
|
|
7566
|
-
function deepEqual(expected, actual) {
|
|
7567
|
-
if (expected === actual) return true;
|
|
7568
|
-
if (typeof expected === "string" && typeof actual === "string") {
|
|
7569
|
-
return expected.trim().toLowerCase() === actual.trim().toLowerCase();
|
|
7570
|
-
}
|
|
7571
|
-
if (typeof expected !== typeof actual) return false;
|
|
7572
|
-
if (expected === null || actual === null) return expected === actual;
|
|
7573
|
-
if (Array.isArray(expected) && Array.isArray(actual)) {
|
|
7574
|
-
if (expected.length !== actual.length) return false;
|
|
7575
|
-
return expected.every((val, i7) => deepEqual(val, actual[i7]));
|
|
7576
|
-
}
|
|
7577
|
-
if (typeof expected === "object" && typeof actual === "object") {
|
|
7578
|
-
const objExpected = expected;
|
|
7579
|
-
const objActual = actual;
|
|
7580
|
-
const keysExpected = Object.keys(objExpected);
|
|
7581
|
-
return keysExpected.every((key) => key in objActual && deepEqual(objExpected[key], objActual[key]));
|
|
7582
|
-
}
|
|
7583
|
-
return expected === actual;
|
|
7584
|
-
}
|
|
7585
7587
|
|
|
7586
7588
|
// src/scorers/schema-correctness.ts
|
|
7587
7589
|
var schemaCorrectnessScorer = ({ task, result }) => {
|
|
@@ -7790,15 +7792,54 @@ function parseJudgeResponse(response, model) {
|
|
|
7790
7792
|
|
|
7791
7793
|
// src/scorers/tool-usage.ts
|
|
7792
7794
|
/**
 * Scores how accurately a run used the tools configured on its task.
 *
 * Scoring:
 * - `-1` — the task defines no tools, so the scorer does not apply.
 * - When `task.expected` is a plain object it is treated as the expected
 *   tool-call arguments. Only calls to tools defined on the task count:
 *   - `1`   — such a call's arguments satisfy `expected` (via `deepEqual`);
 *   - `0.5` — such a call shares at least one argument key with `expected`
 *             but its arguments do not match;
 *   - `0`   — neither of the above.
 * - Otherwise (no `expected`, or a string/array): `1` when the first tool
 *   listed on the task was called at least once, else `0`.
 *
 * @param {{ task: object, result: object }} ctx - `task.tools`, `task.expected`
 *   and `result.toolCalls` (entries with `name` and `arguments`) are read.
 * @returns {{ name: "tool-usage", value: number, details: object }}
 */
var toolUsageScorer = ({ task, result }) => {
  const tools = task.tools ?? [];
  if (tools.length === 0) {
    return { name: "tool-usage", value: -1, details: { reason: "no tools configured on task" } };
  }

  const calls = result.toolCalls ?? [];
  const expected = task.expected;
  const expectedIsObject = typeof expected === "object" && expected !== null && !Array.isArray(expected);

  if (!expectedIsObject) {
    // Without expected arguments only tool selection can be judged; the
    // first configured tool is taken to be the intended one.
    const expectedToolName = tools[0].name;
    const usedTool = calls.some((call) => call.name === expectedToolName);
    return {
      name: "tool-usage",
      value: usedTool ? 1 : 0,
      details: { expectedToolName, usedTool, toolCalls: calls }
    };
  }

  // A call to a tool the task never offered is a wrong-tool call and must
  // not earn full or partial credit, whatever its arguments look like.
  const knownToolNames = new Set(tools.map((tool) => tool.name));
  const knownCalls = calls.filter((call) => knownToolNames.has(call.name));

  const matchingCall = knownCalls.find((call) => deepEqual(expected, call.arguments));
  if (matchingCall) {
    return {
      name: "tool-usage",
      value: 1,
      details: { matchedTool: matchingCall.name, arguments: matchingCall.arguments, toolCalls: calls }
    };
  }

  // Argument-key overlap is the heuristic for "right tool, wrong arguments".
  const expectedKeys = Object.keys(expected);
  const partialMatch = knownCalls.find((call) => {
    if (typeof call.arguments !== "object" || call.arguments === null) return false;
    const argKeys = Object.keys(call.arguments);
    return expectedKeys.some((key) => argKeys.includes(key));
  });
  if (partialMatch) {
    return {
      name: "tool-usage",
      value: 0.5,
      details: {
        reason: "correct tool but wrong arguments",
        expected,
        actual: partialMatch.arguments,
        toolCalls: calls
      }
    };
  }

  return {
    name: "tool-usage",
    value: 0,
    details: { reason: "no matching tool call", expected, toolCalls: calls }
  };
};
|
|
7804
7845
|
|
|
@@ -9706,12 +9747,247 @@ Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
|
|
|
9706
9747
|
scorers: ["correctness", "schema-correctness", "latency", "cost"]
|
|
9707
9748
|
};
|
|
9708
9749
|
|
|
9750
|
+
// src/packs/tool-calling.ts
|
|
9751
|
+
import { z as z2 } from "zod";
|
|
9752
|
+
// Built-in "tool-calling" task pack: four tasks that check whether a model
// invokes the right tool with the right arguments. Each tool's `handler`
// returns canned data. For tasks whose `expected` is an object, it holds
// the arguments the model is expected to pass to the tool.
var toolCallingPack = {
  name: "tool-calling",
  label: "Tool Calling",
  // Lists exactly the four task categories below; the pack has no
  // relevance-detection task, so the description must not advertise one.
  description: "Function invocation accuracy \u2014 single calls, complex params, tool selection, and parallel calls",
  tasks: [
    // One tool, one required argument.
    {
      name: "tc:simple-single-tool",
      prompt: "What's the current weather in Tokyo?",
      tools: [{
        name: "getWeather",
        description: "Get current weather for a city",
        parameters: z2.object({
          city: z2.string(),
          units: z2.enum(["celsius", "fahrenheit"]).optional()
        }),
        handler: async ({ city, units }) => ({
          city,
          tempC: 8,
          condition: "cloudy",
          units: units ?? "celsius"
        })
      }],
      expected: { city: "Tokyo" }
    },
    // One tool, five arguments that must all be extracted from the prompt.
    {
      name: "tc:complex-params",
      prompt: "Search for Italian restaurants within 2 miles of downtown Portland that are open now and have at least a 4-star rating.",
      tools: [{
        name: "searchRestaurants",
        description: "Search for restaurants matching criteria",
        parameters: z2.object({
          cuisine: z2.string(),
          location: z2.string(),
          radiusMiles: z2.number(),
          minRating: z2.number(),
          openNow: z2.boolean()
        }),
        handler: async (_args) => ({
          results: [{ name: "Trattoria Roma", rating: 4.5, distance: 1.2 }]
        })
      }],
      expected: {
        cuisine: "Italian",
        location: "downtown Portland",
        radiusMiles: 2,
        minRating: 4,
        openNow: true
      }
    },
    // Four tools offered; only convertCurrency fits the prompt.
    {
      name: "tc:select-from-many",
      prompt: "Convert 150 USD to Euros.",
      tools: [
        {
          name: "getWeather",
          description: "Get current weather for a city",
          parameters: z2.object({ city: z2.string() }),
          handler: async () => ({ tempC: 20 })
        },
        {
          name: "convertCurrency",
          description: "Convert an amount between currencies",
          parameters: z2.object({
            amount: z2.number(),
            from: z2.string(),
            to: z2.string()
          }),
          handler: async ({ amount, from, to }) => ({
            amount,
            from,
            to,
            result: 138.75,
            rate: 0.925
          })
        },
        {
          name: "translateText",
          description: "Translate text between languages",
          parameters: z2.object({ text: z2.string(), targetLang: z2.string() }),
          handler: async () => ({ translated: "" })
        },
        {
          name: "calculateTip",
          description: "Calculate tip amount for a bill",
          parameters: z2.object({ billAmount: z2.number(), tipPercent: z2.number() }),
          handler: async () => ({ tip: 0 })
        }
      ],
      expected: { amount: 150, from: "USD", to: "EUR" }
    },
    // Two cities in one prompt. `expected` is a string here, not an
    // arguments object, so the tool-usage scorer only checks that
    // getWeather was called at least once, not that both cities were queried.
    {
      name: "tc:parallel-calls",
      prompt: "I'm planning a trip. What's the weather like in both Paris and London right now?",
      tools: [{
        name: "getWeather",
        description: "Get current weather for a city",
        parameters: z2.object({ city: z2.string() }),
        handler: async ({ city }) => {
          const data = {
            Paris: { tempC: 12, condition: "partly cloudy" },
            London: { tempC: 9, condition: "rainy" }
          };
          // Any other city gets a generic placeholder reading.
          return data[city] ?? { tempC: 15, condition: "unknown" };
        }
      }],
      expected: "weather data for Paris and London"
    }
  ],
  scorers: ["tool-usage", "latency", "cost"]
};
|
|
9862
|
+
|
|
9863
|
+
// src/packs/reasoning.ts
|
|
9864
|
+
import { z as z3 } from "zod";
|
|
9865
|
+
// Built-in "reasoning" task pack: five tasks, each pairing a prompt with the
// answer it should produce (`expected`) and a Zod `schema` describing the
// shape of that answer.
var reasoningPack = {
  name: "reasoning",
  label: "Reasoning",
  description: "Logic, math, and multi-step thinking \u2014 arithmetic, deduction, data interpretation, critical path, and business rules",
  tasks: [
    // Arithmetic: 30 of 200 basic users upgrade and 40 join, giving
    // 210 basic and 115 pro: 210 * 49 + 115 * 149 = 27425.
    {
      name: "rs:saas-mrr-calc",
      prompt: `A SaaS company charges $49/month for the basic plan and $149/month for pro.
In Q1 they had 200 basic subscribers and 85 pro subscribers.
In Q2, 15% of basic users upgraded to pro and they gained 40 new basic subscribers.
No one churned. What is the Q2 monthly recurring revenue (MRR)?
Return as JSON with your reasoning and the final MRR number.`,
      expected: { mrr: 27425 },
      schema: z3.object({ reasoning: z3.string().optional(), mrr: z3.number() }),
    },

    // Deduction: the five clues admit exactly one assignment.
    {
      name: "rs:logical-deduction",
      prompt: `Five developers \u2014 Alice, Bob, Carol, Dave, and Eve \u2014 each use a different
primary language: Rust, TypeScript, Python, Go, and Java. Given:
1. Alice does not use Python, Java, or Go.
2. Bob uses TypeScript.
3. Carol uses neither Rust nor Go.
4. Dave does not use Java.
5. Eve uses neither Rust, Go, nor Java.
What language does each developer use? Return as JSON.`,
      expected: { Alice: "Rust", Bob: "TypeScript", Carol: "Java", Dave: "Go", Eve: "Python" },
      schema: z3.object({
        Alice: z3.string(),
        Bob: z3.string(),
        Carol: z3.string(),
        Dave: z3.string(),
        Eve: z3.string(),
      }),
    },

    // Table reading: the largest step is Q3 to Q4 (2.8 - 2.2 = 0.6);
    // the four quarters sum to 9.5.
    {
      name: "rs:data-interpretation",
      prompt: `Given this quarterly revenue data:
| Quarter | Revenue | Growth |
|---------|---------|--------|
| Q1 2025 | $2.1M | - |
| Q2 2025 | $2.4M | 14.3% |
| Q3 2025 | $2.2M | -8.3% |
| Q4 2025 | $2.8M | 27.3% |

Which quarter had the highest absolute revenue increase compared to the previous
quarter? What was the full-year total revenue in millions? Return as JSON.`,
      expected: { highestGrowthQuarter: "Q4 2025", absoluteIncrease: 0.6, fullYearRevenue: 9.5 },
      schema: z3.object({
        highestGrowthQuarter: z3.string(),
        absoluteIncrease: z3.number(),
        fullYearRevenue: z3.number(),
      }),
    },

    // Scheduling: the longest chain is 3 + 8 + 2 + 3 = 16 minutes.
    {
      name: "rs:critical-path",
      prompt: `A deployment pipeline has these stages with dependencies:
- Build (3 min, no dependency)
- Unit tests (5 min, depends on Build)
- Integration tests (8 min, depends on Build)
- Security scan (4 min, depends on Build)
- Staging deploy (2 min, depends on Unit tests AND Integration tests AND Security scan)
- Smoke tests (3 min, depends on Staging deploy)

Assuming stages run in parallel where possible, what is the total pipeline
duration in minutes? Which stages are on the critical path? Return as JSON.`,
      expected: {
        totalMinutes: 16,
        criticalPath: ["Build", "Integration tests", "Staging deploy", "Smoke tests"],
      },
      schema: z3.object({ totalMinutes: z3.number(), criticalPath: z3.array(z3.string()) }),
    },

    // Rule application: B is 100 * 0.70 * 0.85 = 59.5; C is the flat
    // non-profit price; D gets only the enterprise discount.
    // NOTE(review): this is the only task whose schema root is an array.
    // Some structured-output APIs accept only an object at the root, so
    // confirm every provider handles it.
    {
      name: "rs:pricing-rules",
      prompt: `Apply these pricing rules to each customer and return the final price:
Rules:
- Base price: $100
- Enterprise customers (>100 seats): 30% discount
- Annual billing: additional 15% off the discounted price
- Non-profit organizations: flat $50 regardless of other rules

Customers:
A: 50 seats, monthly billing, for-profit
B: 200 seats, annual billing, for-profit
C: 75 seats, annual billing, non-profit
D: 150 seats, monthly billing, for-profit

Return as a JSON array with customer id and finalPrice.`,
      expected: [
        { id: "A", finalPrice: 100 },
        { id: "B", finalPrice: 59.5 },
        { id: "C", finalPrice: 50 },
        { id: "D", finalPrice: 70 },
      ],
      schema: z3.array(z3.object({ id: z3.string(), finalPrice: z3.number() })),
    },
  ],
  scorers: ["correctness", "latency", "cost"],
};
|
|
9982
|
+
|
|
9709
9983
|
// src/packs/index.ts
|
|
9710
9984
|
// Pack registry: maps a pack's `name` to the pack object.
var registry = /* @__PURE__ */ new Map();

/**
 * Adds a pack to the registry under its own `name`; a later registration
 * with the same name replaces the earlier one.
 *
 * @param {{ name: string }} pack - Pack object to register.
 */
function register(pack) {
  registry.set(pack.name, pack);
}

// Built-in packs, registered in their original order.
for (const builtinPack of [structuredOutputPack, toolCallingPack, reasoningPack]) {
  register(builtinPack);
}
|
|
9715
9991
|
function loadPack(name) {
|
|
9716
9992
|
const pack = registry.get(name);
|
|
9717
9993
|
if (!pack) {
|
|
@@ -9893,11 +10169,15 @@ function printPackList() {
|
|
|
9893
10169
|
console.log("No packs available.");
|
|
9894
10170
|
return;
|
|
9895
10171
|
}
|
|
10172
|
+
const nameWidth = Math.max(...packs.map((p5) => p5.name.length)) + 2;
|
|
9896
10173
|
console.log("Available task packs:\n");
|
|
9897
10174
|
for (const p5 of packs) {
|
|
9898
|
-
|
|
10175
|
+
const tasks = `${p5.taskCount} tasks`;
|
|
10176
|
+
console.log(` ${p5.name.padEnd(nameWidth)} ${tasks.padEnd(9)} ${p5.description}`);
|
|
9899
10177
|
}
|
|
9900
|
-
console.log(
|
|
10178
|
+
console.log(`
|
|
10179
|
+
Run: npx duelist run --pack <name>`);
|
|
10180
|
+
console.log(`Combine: npx duelist run --pack structured-output,tool-calling`);
|
|
9901
10181
|
}
|
|
9902
10182
|
async function loadArenaWithPacks(packNames, configOpt) {
|
|
9903
10183
|
const configPath = resolve(configOpt);
|