agent-duelist 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +248 -142
- package/dist/cli.js +2284 -62
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +614 -109
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +28 -3
- package/dist/index.d.ts +28 -3
- package/dist/index.js +612 -109
- package/dist/index.js.map +1 -1
- package/package.json +9 -3
package/dist/cli.js
CHANGED
|
@@ -498,7 +498,7 @@ function Pe(e5, t3, s5 = Q.DEFAULT) {
|
|
|
498
498
|
return p5(), n3.getToken() !== 2 ? k3(7, [2], []) : b3(), true;
|
|
499
499
|
}
|
|
500
500
|
i2(M2, "parseObject");
|
|
501
|
-
function
|
|
501
|
+
function z5() {
|
|
502
502
|
D3(), b3();
|
|
503
503
|
let w4 = true, j2 = false;
|
|
504
504
|
for (; n3.getToken() !== 4 && n3.getToken() !== 17; ) {
|
|
@@ -509,11 +509,11 @@ function Pe(e5, t3, s5 = Q.DEFAULT) {
|
|
|
509
509
|
}
|
|
510
510
|
return L3(), w4 || o7.pop(), n3.getToken() !== 4 ? k3(8, [4], []) : b3(), true;
|
|
511
511
|
}
|
|
512
|
-
i2(
|
|
512
|
+
i2(z5, "parseArray");
|
|
513
513
|
function U2() {
|
|
514
514
|
switch (n3.getToken()) {
|
|
515
515
|
case 3:
|
|
516
|
-
return
|
|
516
|
+
return z5();
|
|
517
517
|
case 1:
|
|
518
518
|
return M2();
|
|
519
519
|
case 10:
|
|
@@ -746,7 +746,7 @@ var init_dist2 = __esm({
|
|
|
746
746
|
}, "interpolateConfigDir");
|
|
747
747
|
Me = ["outDir", "declarationDir", "outFile", "rootDir", "baseUrl", "tsBuildInfoFile"];
|
|
748
748
|
ze = i2((e5) => {
|
|
749
|
-
var t3, s5, n3, o7, l3, u5, a7, r3, g2, v4, d5, _4, p5, D3, L3, T3, F4, x, c3, y3, A3, b3, k3, R5, W, V2, M2,
|
|
749
|
+
var t3, s5, n3, o7, l3, u5, a7, r3, g2, v4, d5, _4, p5, D3, L3, T3, F4, x, c3, y3, A3, b3, k3, R5, W, V2, M2, z5, U2, w4, j2, S2, $2;
|
|
750
750
|
if (e5.strict) {
|
|
751
751
|
const f6 = ["noImplicitAny", "noImplicitThis", "strictNullChecks", "strictFunctionTypes", "strictBindCallApply", "strictPropertyInitialization", "strictBuiltinIteratorReturn", "alwaysStrict", "useUnknownInCatchVariables"];
|
|
752
752
|
for (const B2 of f6) e5[B2] === void 0 && (e5[B2] = true);
|
|
@@ -767,7 +767,7 @@ var init_dist2 = __esm({
|
|
|
767
767
|
let f6 = e5.moduleResolution.toLowerCase();
|
|
768
768
|
f6 === "node" && (f6 = "node10"), e5.moduleResolution = f6, (f6 === "node16" || f6 === "nodenext" || f6 === "bundler") && ((R5 = e5.resolvePackageJsonExports) != null || (e5.resolvePackageJsonExports = true), (W = e5.resolvePackageJsonImports) != null || (e5.resolvePackageJsonImports = true)), f6 === "bundler" && ((V2 = e5.allowSyntheticDefaultImports) != null || (e5.allowSyntheticDefaultImports = true), (M2 = e5.resolveJsonModule) != null || (e5.resolveJsonModule = true));
|
|
769
769
|
}
|
|
770
|
-
e5.jsx && (e5.jsx = e5.jsx.toLowerCase()), e5.moduleDetection && (e5.moduleDetection = e5.moduleDetection.toLowerCase()), e5.importsNotUsedAsValues && (e5.importsNotUsedAsValues = e5.importsNotUsedAsValues.toLowerCase()), e5.newLine && (e5.newLine = e5.newLine.toLowerCase()), e5.esModuleInterop && ((
|
|
770
|
+
e5.jsx && (e5.jsx = e5.jsx.toLowerCase()), e5.moduleDetection && (e5.moduleDetection = e5.moduleDetection.toLowerCase()), e5.importsNotUsedAsValues && (e5.importsNotUsedAsValues = e5.importsNotUsedAsValues.toLowerCase()), e5.newLine && (e5.newLine = e5.newLine.toLowerCase()), e5.esModuleInterop && ((z5 = e5.allowSyntheticDefaultImports) != null || (e5.allowSyntheticDefaultImports = true)), e5.verbatimModuleSyntax && ((U2 = e5.isolatedModules) != null || (e5.isolatedModules = true), (w4 = e5.preserveConstEnums) != null || (e5.preserveConstEnums = true)), e5.isolatedModules && ((j2 = e5.preserveConstEnums) != null || (e5.preserveConstEnums = true)), e5.rewriteRelativeImportExtensions && ((S2 = e5.allowImportingTsExtensions) != null || (e5.allowImportingTsExtensions = true)), e5.lib && (e5.lib = e5.lib.map((f6) => f6.toLowerCase())), e5.checkJs && (($2 = e5.allowJs) != null || (e5.allowJs = true));
|
|
771
771
|
}, "normalizeCompilerOptions");
|
|
772
772
|
pe = i2((e5, t3 = /* @__PURE__ */ new Map()) => {
|
|
773
773
|
const s5 = m3.resolve(e5), n3 = ve(s5, t3), o7 = m3.dirname(s5), { compilerOptions: l3 } = n3;
|
|
@@ -3345,14 +3345,14 @@ function fn(s5, e5 = "@") {
|
|
|
3345
3345
|
case 32:
|
|
3346
3346
|
break;
|
|
3347
3347
|
case 101: {
|
|
3348
|
-
if (!(d5[400] | 0) &&
|
|
3348
|
+
if (!(d5[400] | 0) && z5(h4) | 0 && !(A3(t3 + 4 | 0, 16, 10) | 0) && ($2(), (b3[804] | 0) == 0)) {
|
|
3349
3349
|
l3 = 9;
|
|
3350
3350
|
break e;
|
|
3351
3351
|
} else l3 = 17;
|
|
3352
3352
|
break;
|
|
3353
3353
|
}
|
|
3354
3354
|
case 105: {
|
|
3355
|
-
|
|
3355
|
+
z5(h4) | 0 && !(A3(t3 + 4 | 0, 26, 10) | 0) && W(), l3 = 17;
|
|
3356
3356
|
break;
|
|
3357
3357
|
}
|
|
3358
3358
|
case 59: {
|
|
@@ -3400,15 +3400,15 @@ function fn(s5, e5 = "@") {
|
|
|
3400
3400
|
case 32:
|
|
3401
3401
|
break;
|
|
3402
3402
|
case 101: {
|
|
3403
|
-
!(d5[400] | 0) &&
|
|
3403
|
+
!(d5[400] | 0) && z5(a7) | 0 && !(A3(t3 + 4 | 0, 16, 10) | 0) && $2(), l3 = 91;
|
|
3404
3404
|
break;
|
|
3405
3405
|
}
|
|
3406
3406
|
case 105: {
|
|
3407
|
-
|
|
3407
|
+
z5(a7) | 0 && !(A3(t3 + 4 | 0, 26, 10) | 0) && W(), l3 = 91;
|
|
3408
3408
|
break;
|
|
3409
3409
|
}
|
|
3410
3410
|
case 99: {
|
|
3411
|
-
|
|
3411
|
+
z5(a7) | 0 && !(A3(t3 + 4 | 0, 36, 8) | 0) && P3(d5[t3 + 12 >> 1] | 0) | 0 && (b3[806] = 1), l3 = 91;
|
|
3412
3412
|
break;
|
|
3413
3413
|
}
|
|
3414
3414
|
case 40: {
|
|
@@ -4437,10 +4437,10 @@ function fn(s5, e5 = "@") {
|
|
|
4437
4437
|
return t3 = t3 | 0, (d5[t3 >> 1] | 0) == 46 && (d5[t3 + -2 >> 1] | 0) == 46 ? t3 = (d5[t3 + -4 >> 1] | 0) == 46 : t3 = 0, t3 | 0;
|
|
4438
4438
|
}
|
|
4439
4439
|
f2(de3, "G");
|
|
4440
|
-
function
|
|
4440
|
+
function z5(t3) {
|
|
4441
4441
|
return t3 = t3 | 0, (r3[3] | 0) == (t3 | 0) ? t3 = 1 : t3 = Oe2(t3 + -2 | 0) | 0, t3 | 0;
|
|
4442
4442
|
}
|
|
4443
|
-
f2(
|
|
4443
|
+
f2(z5, "H");
|
|
4444
4444
|
function vt() {
|
|
4445
4445
|
var t3 = 0;
|
|
4446
4446
|
return t3 = r3[(r3[62] | 0) + 12 >> 2] | 0, t3 ? t3 = t3 - (r3[3] | 0) >> 1 : t3 = -1, t3 | 0;
|
|
@@ -5821,7 +5821,7 @@ import p4 from "path";
|
|
|
5821
5821
|
import { fileURLToPath as O4 } from "url";
|
|
5822
5822
|
import se3, { writeSync as te2 } from "fs";
|
|
5823
5823
|
import { inspect as oe3 } from "util";
|
|
5824
|
-
var K3, o4, R4, D2, me3, N2, j, pe2, y2, C3, de2, E4, ge2, Q4, M, _3, S, A2, T2, Pe3, I4, F3, v3, J3, P2, je3, be2, xe3, k2, $, ye3, Ee, B, G3, _e3, Se3, b2, X3, w2, ve3,
|
|
5824
|
+
var K3, o4, R4, D2, me3, N2, j, pe2, y2, C3, de2, E4, ge2, Q4, M, _3, S, A2, T2, Pe3, I4, F3, v3, J3, P2, je3, be2, xe3, k2, $, ye3, Ee, B, G3, _e3, Se3, b2, X3, w2, ve3, z4, we2, Me3, Te3, Fe3, H2, $e3;
|
|
5825
5825
|
var init_register_CFH5oNdT = __esm({
|
|
5826
5826
|
"node_modules/tsx/dist/register-CFH5oNdT.mjs"() {
|
|
5827
5827
|
"use strict";
|
|
@@ -5995,11 +5995,11 @@ var init_register_CFH5oNdT = __esm({
|
|
|
5995
5995
|
throw t3;
|
|
5996
5996
|
}
|
|
5997
5997
|
}, "createTsExtensionResolver");
|
|
5998
|
-
|
|
5998
|
+
z4 = "at cjsPreparseModuleExports (node:internal";
|
|
5999
5999
|
we2 = o4((s5) => {
|
|
6000
6000
|
const e5 = s5.stack.split(`
|
|
6001
6001
|
`).slice(1);
|
|
6002
|
-
return e5[1].includes(
|
|
6002
|
+
return e5[1].includes(z4) || e5[2].includes(z4);
|
|
6003
6003
|
}, "isFromCjsLexer");
|
|
6004
6004
|
Me3 = o4((s5, e5) => {
|
|
6005
6005
|
const a7 = s5.split("?"), n3 = new URLSearchParams(a7[1]);
|
|
@@ -6197,6 +6197,1748 @@ import { readFileSync as readFileSync3, writeFileSync as writeFileSync2, mkdirSy
|
|
|
6197
6197
|
import { resolve, join, dirname as dirname2 } from "path";
|
|
6198
6198
|
import { pathToFileURL, fileURLToPath } from "url";
|
|
6199
6199
|
|
|
6200
|
+
// src/scorers/latency.ts
|
|
6201
|
+
var MIN_MS = 500;
|
|
6202
|
+
var MAX_MS = 1e4;
|
|
6203
|
+
var latencyScorer = ({ result }) => {
|
|
6204
|
+
const clamped = Math.max(MIN_MS, Math.min(MAX_MS, result.latencyMs));
|
|
6205
|
+
const value = 1 - (clamped - MIN_MS) / (MAX_MS - MIN_MS);
|
|
6206
|
+
return {
|
|
6207
|
+
name: "latency",
|
|
6208
|
+
value: Math.round(value * 100) / 100,
|
|
6209
|
+
details: { ms: result.latencyMs }
|
|
6210
|
+
};
|
|
6211
|
+
};
|
|
6212
|
+
|
|
6213
|
+
// src/pricing/catalog.json
|
|
6214
|
+
var catalog_default = {
|
|
6215
|
+
_meta: {
|
|
6216
|
+
source: "OpenRouter API \u2014 all providers (https://openrouter.ai/api/v1/models)",
|
|
6217
|
+
updatedAt: "2026-02-28",
|
|
6218
|
+
unit: "USD per token"
|
|
6219
|
+
},
|
|
6220
|
+
models: {
|
|
6221
|
+
"ai21/jamba-large-1.7": {
|
|
6222
|
+
inputPerToken: 2e-6,
|
|
6223
|
+
outputPerToken: 8e-6
|
|
6224
|
+
},
|
|
6225
|
+
"aion-labs/aion-1.0": {
|
|
6226
|
+
inputPerToken: 4e-6,
|
|
6227
|
+
outputPerToken: 8e-6
|
|
6228
|
+
},
|
|
6229
|
+
"aion-labs/aion-1.0-mini": {
|
|
6230
|
+
inputPerToken: 7e-7,
|
|
6231
|
+
outputPerToken: 14e-7
|
|
6232
|
+
},
|
|
6233
|
+
"aion-labs/aion-2.0": {
|
|
6234
|
+
inputPerToken: 8e-7,
|
|
6235
|
+
outputPerToken: 16e-7
|
|
6236
|
+
},
|
|
6237
|
+
"aion-labs/aion-rp-llama-3.1-8b": {
|
|
6238
|
+
inputPerToken: 8e-7,
|
|
6239
|
+
outputPerToken: 16e-7
|
|
6240
|
+
},
|
|
6241
|
+
"alfredpros/codellama-7b-instruct-solidity": {
|
|
6242
|
+
inputPerToken: 8e-7,
|
|
6243
|
+
outputPerToken: 12e-7
|
|
6244
|
+
},
|
|
6245
|
+
"alibaba/tongyi-deepresearch-30b-a3b": {
|
|
6246
|
+
inputPerToken: 9e-8,
|
|
6247
|
+
outputPerToken: 45e-8
|
|
6248
|
+
},
|
|
6249
|
+
"allenai/molmo-2-8b": {
|
|
6250
|
+
inputPerToken: 2e-7,
|
|
6251
|
+
outputPerToken: 2e-7
|
|
6252
|
+
},
|
|
6253
|
+
"allenai/olmo-2-0325-32b-instruct": {
|
|
6254
|
+
inputPerToken: 5e-8,
|
|
6255
|
+
outputPerToken: 2e-7
|
|
6256
|
+
},
|
|
6257
|
+
"allenai/olmo-3-32b-think": {
|
|
6258
|
+
inputPerToken: 15e-8,
|
|
6259
|
+
outputPerToken: 5e-7
|
|
6260
|
+
},
|
|
6261
|
+
"allenai/olmo-3-7b-instruct": {
|
|
6262
|
+
inputPerToken: 1e-7,
|
|
6263
|
+
outputPerToken: 2e-7
|
|
6264
|
+
},
|
|
6265
|
+
"allenai/olmo-3-7b-think": {
|
|
6266
|
+
inputPerToken: 12e-8,
|
|
6267
|
+
outputPerToken: 2e-7
|
|
6268
|
+
},
|
|
6269
|
+
"allenai/olmo-3.1-32b-instruct": {
|
|
6270
|
+
inputPerToken: 2e-7,
|
|
6271
|
+
outputPerToken: 6e-7
|
|
6272
|
+
},
|
|
6273
|
+
"alpindale/goliath-120b": {
|
|
6274
|
+
inputPerToken: 375e-8,
|
|
6275
|
+
outputPerToken: 75e-7
|
|
6276
|
+
},
|
|
6277
|
+
"amazon/nova-2-lite-v1": {
|
|
6278
|
+
inputPerToken: 3e-7,
|
|
6279
|
+
outputPerToken: 25e-7
|
|
6280
|
+
},
|
|
6281
|
+
"amazon/nova-lite-v1": {
|
|
6282
|
+
inputPerToken: 6e-8,
|
|
6283
|
+
outputPerToken: 24e-8
|
|
6284
|
+
},
|
|
6285
|
+
"amazon/nova-micro-v1": {
|
|
6286
|
+
inputPerToken: 35e-9,
|
|
6287
|
+
outputPerToken: 14e-8
|
|
6288
|
+
},
|
|
6289
|
+
"amazon/nova-premier-v1": {
|
|
6290
|
+
inputPerToken: 25e-7,
|
|
6291
|
+
outputPerToken: 125e-7
|
|
6292
|
+
},
|
|
6293
|
+
"amazon/nova-pro-v1": {
|
|
6294
|
+
inputPerToken: 8e-7,
|
|
6295
|
+
outputPerToken: 32e-7
|
|
6296
|
+
},
|
|
6297
|
+
"anthracite-org/magnum-v4-72b": {
|
|
6298
|
+
inputPerToken: 3e-6,
|
|
6299
|
+
outputPerToken: 5e-6
|
|
6300
|
+
},
|
|
6301
|
+
"anthropic/claude-3-haiku": {
|
|
6302
|
+
inputPerToken: 25e-8,
|
|
6303
|
+
outputPerToken: 125e-8
|
|
6304
|
+
},
|
|
6305
|
+
"anthropic/claude-3.5-haiku": {
|
|
6306
|
+
inputPerToken: 8e-7,
|
|
6307
|
+
outputPerToken: 4e-6
|
|
6308
|
+
},
|
|
6309
|
+
"anthropic/claude-3.5-sonnet": {
|
|
6310
|
+
inputPerToken: 6e-6,
|
|
6311
|
+
outputPerToken: 3e-5
|
|
6312
|
+
},
|
|
6313
|
+
"anthropic/claude-3.7-sonnet": {
|
|
6314
|
+
inputPerToken: 3e-6,
|
|
6315
|
+
outputPerToken: 15e-6
|
|
6316
|
+
},
|
|
6317
|
+
"anthropic/claude-3.7-sonnet:thinking": {
|
|
6318
|
+
inputPerToken: 3e-6,
|
|
6319
|
+
outputPerToken: 15e-6
|
|
6320
|
+
},
|
|
6321
|
+
"anthropic/claude-haiku-4.5": {
|
|
6322
|
+
inputPerToken: 1e-6,
|
|
6323
|
+
outputPerToken: 5e-6
|
|
6324
|
+
},
|
|
6325
|
+
"anthropic/claude-opus-4": {
|
|
6326
|
+
inputPerToken: 15e-6,
|
|
6327
|
+
outputPerToken: 75e-6
|
|
6328
|
+
},
|
|
6329
|
+
"anthropic/claude-opus-4.1": {
|
|
6330
|
+
inputPerToken: 15e-6,
|
|
6331
|
+
outputPerToken: 75e-6
|
|
6332
|
+
},
|
|
6333
|
+
"anthropic/claude-opus-4.5": {
|
|
6334
|
+
inputPerToken: 5e-6,
|
|
6335
|
+
outputPerToken: 25e-6
|
|
6336
|
+
},
|
|
6337
|
+
"anthropic/claude-opus-4.6": {
|
|
6338
|
+
inputPerToken: 5e-6,
|
|
6339
|
+
outputPerToken: 25e-6
|
|
6340
|
+
},
|
|
6341
|
+
"anthropic/claude-sonnet-4": {
|
|
6342
|
+
inputPerToken: 3e-6,
|
|
6343
|
+
outputPerToken: 15e-6
|
|
6344
|
+
},
|
|
6345
|
+
"anthropic/claude-sonnet-4.5": {
|
|
6346
|
+
inputPerToken: 3e-6,
|
|
6347
|
+
outputPerToken: 15e-6
|
|
6348
|
+
},
|
|
6349
|
+
"anthropic/claude-sonnet-4.6": {
|
|
6350
|
+
inputPerToken: 3e-6,
|
|
6351
|
+
outputPerToken: 15e-6
|
|
6352
|
+
},
|
|
6353
|
+
"arcee-ai/coder-large": {
|
|
6354
|
+
inputPerToken: 5e-7,
|
|
6355
|
+
outputPerToken: 8e-7
|
|
6356
|
+
},
|
|
6357
|
+
"arcee-ai/maestro-reasoning": {
|
|
6358
|
+
inputPerToken: 9e-7,
|
|
6359
|
+
outputPerToken: 33e-7
|
|
6360
|
+
},
|
|
6361
|
+
"arcee-ai/spotlight": {
|
|
6362
|
+
inputPerToken: 18e-8,
|
|
6363
|
+
outputPerToken: 18e-8
|
|
6364
|
+
},
|
|
6365
|
+
"arcee-ai/trinity-mini": {
|
|
6366
|
+
inputPerToken: 45e-9,
|
|
6367
|
+
outputPerToken: 15e-8
|
|
6368
|
+
},
|
|
6369
|
+
"arcee-ai/virtuoso-large": {
|
|
6370
|
+
inputPerToken: 75e-8,
|
|
6371
|
+
outputPerToken: 12e-7
|
|
6372
|
+
},
|
|
6373
|
+
"baidu/ernie-4.5-21b-a3b": {
|
|
6374
|
+
inputPerToken: 7e-8,
|
|
6375
|
+
outputPerToken: 28e-8
|
|
6376
|
+
},
|
|
6377
|
+
"baidu/ernie-4.5-21b-a3b-thinking": {
|
|
6378
|
+
inputPerToken: 7e-8,
|
|
6379
|
+
outputPerToken: 28e-8
|
|
6380
|
+
},
|
|
6381
|
+
"baidu/ernie-4.5-300b-a47b": {
|
|
6382
|
+
inputPerToken: 28e-8,
|
|
6383
|
+
outputPerToken: 11e-7
|
|
6384
|
+
},
|
|
6385
|
+
"baidu/ernie-4.5-vl-28b-a3b": {
|
|
6386
|
+
inputPerToken: 14e-8,
|
|
6387
|
+
outputPerToken: 56e-8
|
|
6388
|
+
},
|
|
6389
|
+
"baidu/ernie-4.5-vl-424b-a47b": {
|
|
6390
|
+
inputPerToken: 42e-8,
|
|
6391
|
+
outputPerToken: 125e-8
|
|
6392
|
+
},
|
|
6393
|
+
"bytedance/seed-1.6": {
|
|
6394
|
+
inputPerToken: 25e-8,
|
|
6395
|
+
outputPerToken: 2e-6
|
|
6396
|
+
},
|
|
6397
|
+
"bytedance/seed-1.6-flash": {
|
|
6398
|
+
inputPerToken: 75e-9,
|
|
6399
|
+
outputPerToken: 3e-7
|
|
6400
|
+
},
|
|
6401
|
+
"bytedance/seed-2.0-mini": {
|
|
6402
|
+
inputPerToken: 1e-7,
|
|
6403
|
+
outputPerToken: 4e-7
|
|
6404
|
+
},
|
|
6405
|
+
"bytedance/ui-tars-1.5-7b": {
|
|
6406
|
+
inputPerToken: 1e-7,
|
|
6407
|
+
outputPerToken: 2e-7
|
|
6408
|
+
},
|
|
6409
|
+
"cohere/command-a": {
|
|
6410
|
+
inputPerToken: 25e-7,
|
|
6411
|
+
outputPerToken: 1e-5
|
|
6412
|
+
},
|
|
6413
|
+
"cohere/command-r-08-2024": {
|
|
6414
|
+
inputPerToken: 15e-8,
|
|
6415
|
+
outputPerToken: 6e-7
|
|
6416
|
+
},
|
|
6417
|
+
"cohere/command-r-plus-08-2024": {
|
|
6418
|
+
inputPerToken: 25e-7,
|
|
6419
|
+
outputPerToken: 1e-5
|
|
6420
|
+
},
|
|
6421
|
+
"cohere/command-r7b-12-2024": {
|
|
6422
|
+
inputPerToken: 375e-10,
|
|
6423
|
+
outputPerToken: 15e-8
|
|
6424
|
+
},
|
|
6425
|
+
"deepcogito/cogito-v2.1-671b": {
|
|
6426
|
+
inputPerToken: 125e-8,
|
|
6427
|
+
outputPerToken: 125e-8
|
|
6428
|
+
},
|
|
6429
|
+
"deepseek/deepseek-chat": {
|
|
6430
|
+
inputPerToken: 32e-8,
|
|
6431
|
+
outputPerToken: 89e-8
|
|
6432
|
+
},
|
|
6433
|
+
"deepseek/deepseek-chat-v3-0324": {
|
|
6434
|
+
inputPerToken: 2e-7,
|
|
6435
|
+
outputPerToken: 77e-8
|
|
6436
|
+
},
|
|
6437
|
+
"deepseek/deepseek-chat-v3.1": {
|
|
6438
|
+
inputPerToken: 15e-8,
|
|
6439
|
+
outputPerToken: 75e-8
|
|
6440
|
+
},
|
|
6441
|
+
"deepseek/deepseek-r1": {
|
|
6442
|
+
inputPerToken: 7e-7,
|
|
6443
|
+
outputPerToken: 25e-7
|
|
6444
|
+
},
|
|
6445
|
+
"deepseek/deepseek-r1-0528": {
|
|
6446
|
+
inputPerToken: 45e-8,
|
|
6447
|
+
outputPerToken: 215e-8
|
|
6448
|
+
},
|
|
6449
|
+
"deepseek/deepseek-r1-distill-llama-70b": {
|
|
6450
|
+
inputPerToken: 7e-7,
|
|
6451
|
+
outputPerToken: 8e-7
|
|
6452
|
+
},
|
|
6453
|
+
"deepseek/deepseek-r1-distill-qwen-32b": {
|
|
6454
|
+
inputPerToken: 29e-8,
|
|
6455
|
+
outputPerToken: 29e-8
|
|
6456
|
+
},
|
|
6457
|
+
"deepseek/deepseek-v3": {
|
|
6458
|
+
inputPerToken: 3e-7,
|
|
6459
|
+
outputPerToken: 88e-8
|
|
6460
|
+
},
|
|
6461
|
+
"deepseek/deepseek-v3.1-terminus": {
|
|
6462
|
+
inputPerToken: 21e-8,
|
|
6463
|
+
outputPerToken: 79e-8
|
|
6464
|
+
},
|
|
6465
|
+
"deepseek/deepseek-v3.1-terminus:exacto": {
|
|
6466
|
+
inputPerToken: 21e-8,
|
|
6467
|
+
outputPerToken: 79e-8
|
|
6468
|
+
},
|
|
6469
|
+
"deepseek/deepseek-v3.2": {
|
|
6470
|
+
inputPerToken: 25e-8,
|
|
6471
|
+
outputPerToken: 4e-7
|
|
6472
|
+
},
|
|
6473
|
+
"deepseek/deepseek-v3.2-exp": {
|
|
6474
|
+
inputPerToken: 27e-8,
|
|
6475
|
+
outputPerToken: 41e-8
|
|
6476
|
+
},
|
|
6477
|
+
"deepseek/deepseek-v3.2-speciale": {
|
|
6478
|
+
inputPerToken: 4e-7,
|
|
6479
|
+
outputPerToken: 12e-7
|
|
6480
|
+
},
|
|
6481
|
+
"eleutherai/llemma_7b": {
|
|
6482
|
+
inputPerToken: 8e-7,
|
|
6483
|
+
outputPerToken: 12e-7
|
|
6484
|
+
},
|
|
6485
|
+
"essentialai/rnj-1-instruct": {
|
|
6486
|
+
inputPerToken: 15e-8,
|
|
6487
|
+
outputPerToken: 15e-8
|
|
6488
|
+
},
|
|
6489
|
+
"google/gemini-2.0-flash": {
|
|
6490
|
+
inputPerToken: 1e-7,
|
|
6491
|
+
outputPerToken: 4e-7
|
|
6492
|
+
},
|
|
6493
|
+
"google/gemini-2.0-flash-001": {
|
|
6494
|
+
inputPerToken: 1e-7,
|
|
6495
|
+
outputPerToken: 4e-7
|
|
6496
|
+
},
|
|
6497
|
+
"google/gemini-2.0-flash-lite-001": {
|
|
6498
|
+
inputPerToken: 75e-9,
|
|
6499
|
+
outputPerToken: 3e-7
|
|
6500
|
+
},
|
|
6501
|
+
"google/gemini-2.5-flash": {
|
|
6502
|
+
inputPerToken: 3e-7,
|
|
6503
|
+
outputPerToken: 25e-7
|
|
6504
|
+
},
|
|
6505
|
+
"google/gemini-2.5-flash-image": {
|
|
6506
|
+
inputPerToken: 3e-7,
|
|
6507
|
+
outputPerToken: 25e-7
|
|
6508
|
+
},
|
|
6509
|
+
"google/gemini-2.5-flash-lite": {
|
|
6510
|
+
inputPerToken: 1e-7,
|
|
6511
|
+
outputPerToken: 4e-7
|
|
6512
|
+
},
|
|
6513
|
+
"google/gemini-2.5-flash-lite-preview-09-2025": {
|
|
6514
|
+
inputPerToken: 1e-7,
|
|
6515
|
+
outputPerToken: 4e-7
|
|
6516
|
+
},
|
|
6517
|
+
"google/gemini-2.5-pro": {
|
|
6518
|
+
inputPerToken: 125e-8,
|
|
6519
|
+
outputPerToken: 1e-5
|
|
6520
|
+
},
|
|
6521
|
+
"google/gemini-2.5-pro-preview": {
|
|
6522
|
+
inputPerToken: 125e-8,
|
|
6523
|
+
outputPerToken: 1e-5
|
|
6524
|
+
},
|
|
6525
|
+
"google/gemini-2.5-pro-preview-05-06": {
|
|
6526
|
+
inputPerToken: 125e-8,
|
|
6527
|
+
outputPerToken: 1e-5
|
|
6528
|
+
},
|
|
6529
|
+
"google/gemini-3-flash-preview": {
|
|
6530
|
+
inputPerToken: 5e-7,
|
|
6531
|
+
outputPerToken: 3e-6
|
|
6532
|
+
},
|
|
6533
|
+
"google/gemini-3-pro-image-preview": {
|
|
6534
|
+
inputPerToken: 2e-6,
|
|
6535
|
+
outputPerToken: 12e-6
|
|
6536
|
+
},
|
|
6537
|
+
"google/gemini-3-pro-preview": {
|
|
6538
|
+
inputPerToken: 2e-6,
|
|
6539
|
+
outputPerToken: 12e-6
|
|
6540
|
+
},
|
|
6541
|
+
"google/gemini-3.1-flash-image-preview": {
|
|
6542
|
+
inputPerToken: 25e-8,
|
|
6543
|
+
outputPerToken: 15e-7
|
|
6544
|
+
},
|
|
6545
|
+
"google/gemini-3.1-pro-preview": {
|
|
6546
|
+
inputPerToken: 2e-6,
|
|
6547
|
+
outputPerToken: 12e-6
|
|
6548
|
+
},
|
|
6549
|
+
"google/gemini-3.1-pro-preview-customtools": {
|
|
6550
|
+
inputPerToken: 2e-6,
|
|
6551
|
+
outputPerToken: 12e-6
|
|
6552
|
+
},
|
|
6553
|
+
"google/gemma-2-27b-it": {
|
|
6554
|
+
inputPerToken: 65e-8,
|
|
6555
|
+
outputPerToken: 65e-8
|
|
6556
|
+
},
|
|
6557
|
+
"google/gemma-2-9b-it": {
|
|
6558
|
+
inputPerToken: 3e-8,
|
|
6559
|
+
outputPerToken: 9e-8
|
|
6560
|
+
},
|
|
6561
|
+
"google/gemma-3-12b-it": {
|
|
6562
|
+
inputPerToken: 4e-8,
|
|
6563
|
+
outputPerToken: 13e-8
|
|
6564
|
+
},
|
|
6565
|
+
"google/gemma-3-27b-it": {
|
|
6566
|
+
inputPerToken: 4e-8,
|
|
6567
|
+
outputPerToken: 15e-8
|
|
6568
|
+
},
|
|
6569
|
+
"google/gemma-3-4b-it": {
|
|
6570
|
+
inputPerToken: 4e-8,
|
|
6571
|
+
outputPerToken: 8e-8
|
|
6572
|
+
},
|
|
6573
|
+
"google/gemma-3n-e4b-it": {
|
|
6574
|
+
inputPerToken: 2e-8,
|
|
6575
|
+
outputPerToken: 4e-8
|
|
6576
|
+
},
|
|
6577
|
+
"gryphe/mythomax-l2-13b": {
|
|
6578
|
+
inputPerToken: 6e-8,
|
|
6579
|
+
outputPerToken: 6e-8
|
|
6580
|
+
},
|
|
6581
|
+
"ibm-granite/granite-4.0-h-micro": {
|
|
6582
|
+
inputPerToken: 17e-9,
|
|
6583
|
+
outputPerToken: 11e-8
|
|
6584
|
+
},
|
|
6585
|
+
"inception/mercury": {
|
|
6586
|
+
inputPerToken: 25e-8,
|
|
6587
|
+
outputPerToken: 1e-6
|
|
6588
|
+
},
|
|
6589
|
+
"inception/mercury-coder": {
|
|
6590
|
+
inputPerToken: 25e-8,
|
|
6591
|
+
outputPerToken: 1e-6
|
|
6592
|
+
},
|
|
6593
|
+
"inflection/inflection-3-pi": {
|
|
6594
|
+
inputPerToken: 25e-7,
|
|
6595
|
+
outputPerToken: 1e-5
|
|
6596
|
+
},
|
|
6597
|
+
"inflection/inflection-3-productivity": {
|
|
6598
|
+
inputPerToken: 25e-7,
|
|
6599
|
+
outputPerToken: 1e-5
|
|
6600
|
+
},
|
|
6601
|
+
"kwaipilot/kat-coder-pro": {
|
|
6602
|
+
inputPerToken: 207e-9,
|
|
6603
|
+
outputPerToken: 828e-9
|
|
6604
|
+
},
|
|
6605
|
+
"liquid/lfm-2-24b-a2b": {
|
|
6606
|
+
inputPerToken: 3e-8,
|
|
6607
|
+
outputPerToken: 12e-8
|
|
6608
|
+
},
|
|
6609
|
+
"liquid/lfm-2.2-6b": {
|
|
6610
|
+
inputPerToken: 1e-8,
|
|
6611
|
+
outputPerToken: 2e-8
|
|
6612
|
+
},
|
|
6613
|
+
"liquid/lfm2-8b-a1b": {
|
|
6614
|
+
inputPerToken: 1e-8,
|
|
6615
|
+
outputPerToken: 2e-8
|
|
6616
|
+
},
|
|
6617
|
+
"mancer/weaver": {
|
|
6618
|
+
inputPerToken: 75e-8,
|
|
6619
|
+
outputPerToken: 1e-6
|
|
6620
|
+
},
|
|
6621
|
+
"meituan/longcat-flash-chat": {
|
|
6622
|
+
inputPerToken: 2e-7,
|
|
6623
|
+
outputPerToken: 8e-7
|
|
6624
|
+
},
|
|
6625
|
+
"meta/llama-3-70b-instruct": {
|
|
6626
|
+
inputPerToken: 51e-8,
|
|
6627
|
+
outputPerToken: 74e-8
|
|
6628
|
+
},
|
|
6629
|
+
"meta/llama-3-8b-instruct": {
|
|
6630
|
+
inputPerToken: 3e-8,
|
|
6631
|
+
outputPerToken: 4e-8
|
|
6632
|
+
},
|
|
6633
|
+
"meta/llama-3.1-405b": {
|
|
6634
|
+
inputPerToken: 4e-6,
|
|
6635
|
+
outputPerToken: 4e-6
|
|
6636
|
+
},
|
|
6637
|
+
"meta/llama-3.1-405b-instruct": {
|
|
6638
|
+
inputPerToken: 4e-6,
|
|
6639
|
+
outputPerToken: 4e-6
|
|
6640
|
+
},
|
|
6641
|
+
"meta/llama-3.1-70b-instruct": {
|
|
6642
|
+
inputPerToken: 4e-7,
|
|
6643
|
+
outputPerToken: 4e-7
|
|
6644
|
+
},
|
|
6645
|
+
"meta/llama-3.1-8b-instruct": {
|
|
6646
|
+
inputPerToken: 2e-8,
|
|
6647
|
+
outputPerToken: 5e-8
|
|
6648
|
+
},
|
|
6649
|
+
"meta/llama-3.2-11b-vision-instruct": {
|
|
6650
|
+
inputPerToken: 49e-9,
|
|
6651
|
+
outputPerToken: 49e-9
|
|
6652
|
+
},
|
|
6653
|
+
"meta/llama-3.2-1b-instruct": {
|
|
6654
|
+
inputPerToken: 27e-9,
|
|
6655
|
+
outputPerToken: 2e-7
|
|
6656
|
+
},
|
|
6657
|
+
"meta/llama-3.2-3b-instruct": {
|
|
6658
|
+
inputPerToken: 2e-8,
|
|
6659
|
+
outputPerToken: 2e-8
|
|
6660
|
+
},
|
|
6661
|
+
"meta/llama-3.3-70b": {
|
|
6662
|
+
inputPerToken: 12e-8,
|
|
6663
|
+
outputPerToken: 3e-7
|
|
6664
|
+
},
|
|
6665
|
+
"meta/llama-3.3-70b-instruct": {
|
|
6666
|
+
inputPerToken: 1e-7,
|
|
6667
|
+
outputPerToken: 32e-8
|
|
6668
|
+
},
|
|
6669
|
+
"meta/llama-4-maverick": {
|
|
6670
|
+
inputPerToken: 15e-8,
|
|
6671
|
+
outputPerToken: 6e-7
|
|
6672
|
+
},
|
|
6673
|
+
"meta/llama-4-scout": {
|
|
6674
|
+
inputPerToken: 8e-8,
|
|
6675
|
+
outputPerToken: 3e-7
|
|
6676
|
+
},
|
|
6677
|
+
"meta/llama-guard-2-8b": {
|
|
6678
|
+
inputPerToken: 2e-7,
|
|
6679
|
+
outputPerToken: 2e-7
|
|
6680
|
+
},
|
|
6681
|
+
"meta/llama-guard-3-8b": {
|
|
6682
|
+
inputPerToken: 2e-8,
|
|
6683
|
+
outputPerToken: 6e-8
|
|
6684
|
+
},
|
|
6685
|
+
"meta/llama-guard-4-12b": {
|
|
6686
|
+
inputPerToken: 18e-8,
|
|
6687
|
+
outputPerToken: 18e-8
|
|
6688
|
+
},
|
|
6689
|
+
"microsoft/phi-4": {
|
|
6690
|
+
inputPerToken: 6e-8,
|
|
6691
|
+
outputPerToken: 14e-8
|
|
6692
|
+
},
|
|
6693
|
+
"microsoft/wizardlm-2-8x22b": {
|
|
6694
|
+
inputPerToken: 62e-8,
|
|
6695
|
+
outputPerToken: 62e-8
|
|
6696
|
+
},
|
|
6697
|
+
"minimax/minimax-01": {
|
|
6698
|
+
inputPerToken: 2e-7,
|
|
6699
|
+
outputPerToken: 11e-7
|
|
6700
|
+
},
|
|
6701
|
+
"minimax/minimax-m1": {
|
|
6702
|
+
inputPerToken: 4e-7,
|
|
6703
|
+
outputPerToken: 22e-7
|
|
6704
|
+
},
|
|
6705
|
+
"minimax/minimax-m2": {
|
|
6706
|
+
inputPerToken: 255e-9,
|
|
6707
|
+
outputPerToken: 1e-6
|
|
6708
|
+
},
|
|
6709
|
+
"minimax/minimax-m2-her": {
|
|
6710
|
+
inputPerToken: 3e-7,
|
|
6711
|
+
outputPerToken: 12e-7
|
|
6712
|
+
},
|
|
6713
|
+
"minimax/minimax-m2.1": {
|
|
6714
|
+
inputPerToken: 27e-8,
|
|
6715
|
+
outputPerToken: 95e-8
|
|
6716
|
+
},
|
|
6717
|
+
"minimax/minimax-m2.5": {
|
|
6718
|
+
inputPerToken: 295e-9,
|
|
6719
|
+
outputPerToken: 12e-7
|
|
6720
|
+
},
|
|
6721
|
+
"mistral/codestral-2508": {
|
|
6722
|
+
inputPerToken: 3e-7,
|
|
6723
|
+
outputPerToken: 9e-7
|
|
6724
|
+
},
|
|
6725
|
+
"mistral/devstral-2512": {
|
|
6726
|
+
inputPerToken: 4e-7,
|
|
6727
|
+
outputPerToken: 2e-6
|
|
6728
|
+
},
|
|
6729
|
+
"mistral/devstral-medium": {
|
|
6730
|
+
inputPerToken: 4e-7,
|
|
6731
|
+
outputPerToken: 2e-6
|
|
6732
|
+
},
|
|
6733
|
+
"mistral/devstral-small": {
|
|
6734
|
+
inputPerToken: 1e-7,
|
|
6735
|
+
outputPerToken: 3e-7
|
|
6736
|
+
},
|
|
6737
|
+
"mistral/ministral-14b-2512": {
|
|
6738
|
+
inputPerToken: 2e-7,
|
|
6739
|
+
outputPerToken: 2e-7
|
|
6740
|
+
},
|
|
6741
|
+
"mistral/ministral-3b-2512": {
|
|
6742
|
+
inputPerToken: 1e-7,
|
|
6743
|
+
outputPerToken: 1e-7
|
|
6744
|
+
},
|
|
6745
|
+
"mistral/ministral-8b-2512": {
|
|
6746
|
+
inputPerToken: 15e-8,
|
|
6747
|
+
outputPerToken: 15e-8
|
|
6748
|
+
},
|
|
6749
|
+
"mistral/mistral-7b-instruct": {
|
|
6750
|
+
inputPerToken: 2e-7,
|
|
6751
|
+
outputPerToken: 2e-7
|
|
6752
|
+
},
|
|
6753
|
+
"mistral/mistral-7b-instruct-v0.1": {
|
|
6754
|
+
inputPerToken: 11e-8,
|
|
6755
|
+
outputPerToken: 19e-8
|
|
6756
|
+
},
|
|
6757
|
+
"mistral/mistral-7b-instruct-v0.3": {
|
|
6758
|
+
inputPerToken: 2e-7,
|
|
6759
|
+
outputPerToken: 2e-7
|
|
6760
|
+
},
|
|
6761
|
+
"mistral/mistral-large": {
|
|
6762
|
+
inputPerToken: 2e-6,
|
|
6763
|
+
outputPerToken: 6e-6
|
|
6764
|
+
},
|
|
6765
|
+
"mistral/mistral-large-2407": {
|
|
6766
|
+
inputPerToken: 2e-6,
|
|
6767
|
+
outputPerToken: 6e-6
|
|
6768
|
+
},
|
|
6769
|
+
"mistral/mistral-large-2411": {
|
|
6770
|
+
inputPerToken: 2e-6,
|
|
6771
|
+
outputPerToken: 6e-6
|
|
6772
|
+
},
|
|
6773
|
+
"mistral/mistral-large-2512": {
|
|
6774
|
+
inputPerToken: 5e-7,
|
|
6775
|
+
outputPerToken: 15e-7
|
|
6776
|
+
},
|
|
6777
|
+
"mistral/mistral-medium-3": {
|
|
6778
|
+
inputPerToken: 4e-7,
|
|
6779
|
+
outputPerToken: 2e-6
|
|
6780
|
+
},
|
|
6781
|
+
"mistral/mistral-medium-3.1": {
|
|
6782
|
+
inputPerToken: 4e-7,
|
|
6783
|
+
outputPerToken: 2e-6
|
|
6784
|
+
},
|
|
6785
|
+
"mistral/mistral-nemo": {
|
|
6786
|
+
inputPerToken: 2e-8,
|
|
6787
|
+
outputPerToken: 4e-8
|
|
6788
|
+
},
|
|
6789
|
+
"mistral/mistral-saba": {
|
|
6790
|
+
inputPerToken: 2e-7,
|
|
6791
|
+
outputPerToken: 6e-7
|
|
6792
|
+
},
|
|
6793
|
+
"mistral/mistral-small": {
|
|
6794
|
+
inputPerToken: 1e-7,
|
|
6795
|
+
outputPerToken: 3e-7
|
|
6796
|
+
},
|
|
6797
|
+
"mistral/mistral-small-24b-instruct-2501": {
|
|
6798
|
+
inputPerToken: 5e-8,
|
|
6799
|
+
outputPerToken: 8e-8
|
|
6800
|
+
},
|
|
6801
|
+
"mistral/mistral-small-3.1-24b-instruct": {
|
|
6802
|
+
inputPerToken: 35e-8,
|
|
6803
|
+
outputPerToken: 56e-8
|
|
6804
|
+
},
|
|
6805
|
+
"mistral/mistral-small-3.2-24b-instruct": {
|
|
6806
|
+
inputPerToken: 6e-8,
|
|
6807
|
+
outputPerToken: 18e-8
|
|
6808
|
+
},
|
|
6809
|
+
"mistral/mistral-small-creative": {
|
|
6810
|
+
inputPerToken: 1e-7,
|
|
6811
|
+
outputPerToken: 3e-7
|
|
6812
|
+
},
|
|
6813
|
+
"mistral/mixtral-8x22b-instruct": {
|
|
6814
|
+
inputPerToken: 2e-6,
|
|
6815
|
+
outputPerToken: 6e-6
|
|
6816
|
+
},
|
|
6817
|
+
"mistral/mixtral-8x7b-instruct": {
|
|
6818
|
+
inputPerToken: 54e-8,
|
|
6819
|
+
outputPerToken: 54e-8
|
|
6820
|
+
},
|
|
6821
|
+
"mistral/pixtral-large-2411": {
|
|
6822
|
+
inputPerToken: 2e-6,
|
|
6823
|
+
outputPerToken: 6e-6
|
|
6824
|
+
},
|
|
6825
|
+
"mistral/voxtral-small-24b-2507": {
|
|
6826
|
+
inputPerToken: 1e-7,
|
|
6827
|
+
outputPerToken: 3e-7
|
|
6828
|
+
},
|
|
6829
|
+
"moonshotai/kimi-k2": {
|
|
6830
|
+
inputPerToken: 55e-8,
|
|
6831
|
+
outputPerToken: 22e-7
|
|
6832
|
+
},
|
|
6833
|
+
"moonshotai/kimi-k2-0905": {
|
|
6834
|
+
inputPerToken: 4e-7,
|
|
6835
|
+
outputPerToken: 2e-6
|
|
6836
|
+
},
|
|
6837
|
+
"moonshotai/kimi-k2-0905:exacto": {
|
|
6838
|
+
inputPerToken: 6e-7,
|
|
6839
|
+
outputPerToken: 25e-7
|
|
6840
|
+
},
|
|
6841
|
+
"moonshotai/kimi-k2-thinking": {
|
|
6842
|
+
inputPerToken: 47e-8,
|
|
6843
|
+
outputPerToken: 2e-6
|
|
6844
|
+
},
|
|
6845
|
+
"moonshotai/kimi-k2.5": {
|
|
6846
|
+
inputPerToken: 45e-8,
|
|
6847
|
+
outputPerToken: 22e-7
|
|
6848
|
+
},
|
|
6849
|
+
"morph/morph-v3-fast": {
|
|
6850
|
+
inputPerToken: 8e-7,
|
|
6851
|
+
outputPerToken: 12e-7
|
|
6852
|
+
},
|
|
6853
|
+
"morph/morph-v3-large": {
|
|
6854
|
+
inputPerToken: 9e-7,
|
|
6855
|
+
outputPerToken: 19e-7
|
|
6856
|
+
},
|
|
6857
|
+
"neversleep/llama-3.1-lumimaid-8b": {
|
|
6858
|
+
inputPerToken: 9e-8,
|
|
6859
|
+
outputPerToken: 6e-7
|
|
6860
|
+
},
|
|
6861
|
+
"neversleep/noromaid-20b": {
|
|
6862
|
+
inputPerToken: 1e-6,
|
|
6863
|
+
outputPerToken: 175e-8
|
|
6864
|
+
},
|
|
6865
|
+
"nex-agi/deepseek-v3.1-nex-n1": {
|
|
6866
|
+
inputPerToken: 27e-8,
|
|
6867
|
+
outputPerToken: 1e-6
|
|
6868
|
+
},
|
|
6869
|
+
"nousresearch/hermes-2-pro-llama-3-8b": {
|
|
6870
|
+
inputPerToken: 14e-8,
|
|
6871
|
+
outputPerToken: 14e-8
|
|
6872
|
+
},
|
|
6873
|
+
"nousresearch/hermes-3-llama-3.1-405b": {
|
|
6874
|
+
inputPerToken: 1e-6,
|
|
6875
|
+
outputPerToken: 1e-6
|
|
6876
|
+
},
|
|
6877
|
+
"nousresearch/hermes-3-llama-3.1-70b": {
|
|
6878
|
+
inputPerToken: 3e-7,
|
|
6879
|
+
outputPerToken: 3e-7
|
|
6880
|
+
},
|
|
6881
|
+
"nousresearch/hermes-4-405b": {
|
|
6882
|
+
inputPerToken: 1e-6,
|
|
6883
|
+
outputPerToken: 3e-6
|
|
6884
|
+
},
|
|
6885
|
+
"nousresearch/hermes-4-70b": {
|
|
6886
|
+
inputPerToken: 13e-8,
|
|
6887
|
+
outputPerToken: 4e-7
|
|
6888
|
+
},
|
|
6889
|
+
"nvidia/llama-3.1-nemotron-70b-instruct": {
|
|
6890
|
+
inputPerToken: 12e-7,
|
|
6891
|
+
outputPerToken: 12e-7
|
|
6892
|
+
},
|
|
6893
|
+
"nvidia/llama-3.3-nemotron-super-49b-v1.5": {
|
|
6894
|
+
inputPerToken: 1e-7,
|
|
6895
|
+
outputPerToken: 4e-7
|
|
6896
|
+
},
|
|
6897
|
+
"nvidia/nemotron-3-nano-30b-a3b": {
|
|
6898
|
+
inputPerToken: 5e-8,
|
|
6899
|
+
outputPerToken: 2e-7
|
|
6900
|
+
},
|
|
6901
|
+
"nvidia/nemotron-nano-12b-v2-vl": {
|
|
6902
|
+
inputPerToken: 2e-7,
|
|
6903
|
+
outputPerToken: 6e-7
|
|
6904
|
+
},
|
|
6905
|
+
"nvidia/nemotron-nano-9b-v2": {
|
|
6906
|
+
inputPerToken: 4e-8,
|
|
6907
|
+
outputPerToken: 16e-8
|
|
6908
|
+
},
|
|
6909
|
+
"openai/gpt-3.5-turbo": {
|
|
6910
|
+
inputPerToken: 5e-7,
|
|
6911
|
+
outputPerToken: 15e-7
|
|
6912
|
+
},
|
|
6913
|
+
"openai/gpt-3.5-turbo-0613": {
|
|
6914
|
+
inputPerToken: 1e-6,
|
|
6915
|
+
outputPerToken: 2e-6
|
|
6916
|
+
},
|
|
6917
|
+
"openai/gpt-3.5-turbo-16k": {
|
|
6918
|
+
inputPerToken: 3e-6,
|
|
6919
|
+
outputPerToken: 4e-6
|
|
6920
|
+
},
|
|
6921
|
+
"openai/gpt-3.5-turbo-instruct": {
|
|
6922
|
+
inputPerToken: 15e-7,
|
|
6923
|
+
outputPerToken: 2e-6
|
|
6924
|
+
},
|
|
6925
|
+
"openai/gpt-4": {
|
|
6926
|
+
inputPerToken: 3e-5,
|
|
6927
|
+
outputPerToken: 6e-5
|
|
6928
|
+
},
|
|
6929
|
+
"openai/gpt-4-0314": {
|
|
6930
|
+
inputPerToken: 3e-5,
|
|
6931
|
+
outputPerToken: 6e-5
|
|
6932
|
+
},
|
|
6933
|
+
"openai/gpt-4-1106-preview": {
|
|
6934
|
+
inputPerToken: 1e-5,
|
|
6935
|
+
outputPerToken: 3e-5
|
|
6936
|
+
},
|
|
6937
|
+
"openai/gpt-4-turbo": {
|
|
6938
|
+
inputPerToken: 1e-5,
|
|
6939
|
+
outputPerToken: 3e-5
|
|
6940
|
+
},
|
|
6941
|
+
"openai/gpt-4-turbo-preview": {
|
|
6942
|
+
inputPerToken: 1e-5,
|
|
6943
|
+
outputPerToken: 3e-5
|
|
6944
|
+
},
|
|
6945
|
+
"openai/gpt-4.1": {
|
|
6946
|
+
inputPerToken: 2e-6,
|
|
6947
|
+
outputPerToken: 8e-6
|
|
6948
|
+
},
|
|
6949
|
+
"openai/gpt-4.1-mini": {
|
|
6950
|
+
inputPerToken: 4e-7,
|
|
6951
|
+
outputPerToken: 16e-7
|
|
6952
|
+
},
|
|
6953
|
+
"openai/gpt-4.1-nano": {
|
|
6954
|
+
inputPerToken: 1e-7,
|
|
6955
|
+
outputPerToken: 4e-7
|
|
6956
|
+
},
|
|
6957
|
+
"openai/gpt-4o": {
|
|
6958
|
+
inputPerToken: 25e-7,
|
|
6959
|
+
outputPerToken: 1e-5
|
|
6960
|
+
},
|
|
6961
|
+
"openai/gpt-4o-2024-05-13": {
|
|
6962
|
+
inputPerToken: 5e-6,
|
|
6963
|
+
outputPerToken: 15e-6
|
|
6964
|
+
},
|
|
6965
|
+
"openai/gpt-4o-2024-08-06": {
|
|
6966
|
+
inputPerToken: 25e-7,
|
|
6967
|
+
outputPerToken: 1e-5
|
|
6968
|
+
},
|
|
6969
|
+
"openai/gpt-4o-2024-11-20": {
|
|
6970
|
+
inputPerToken: 25e-7,
|
|
6971
|
+
outputPerToken: 1e-5
|
|
6972
|
+
},
|
|
6973
|
+
"openai/gpt-4o-audio-preview": {
|
|
6974
|
+
inputPerToken: 25e-7,
|
|
6975
|
+
outputPerToken: 1e-5
|
|
6976
|
+
},
|
|
6977
|
+
"openai/gpt-4o-mini": {
|
|
6978
|
+
inputPerToken: 15e-8,
|
|
6979
|
+
outputPerToken: 6e-7
|
|
6980
|
+
},
|
|
6981
|
+
"openai/gpt-4o-mini-2024-07-18": {
|
|
6982
|
+
inputPerToken: 15e-8,
|
|
6983
|
+
outputPerToken: 6e-7
|
|
6984
|
+
},
|
|
6985
|
+
"openai/gpt-4o-mini-search-preview": {
|
|
6986
|
+
inputPerToken: 15e-8,
|
|
6987
|
+
outputPerToken: 6e-7
|
|
6988
|
+
},
|
|
6989
|
+
"openai/gpt-4o-search-preview": {
|
|
6990
|
+
inputPerToken: 25e-7,
|
|
6991
|
+
outputPerToken: 1e-5
|
|
6992
|
+
},
|
|
6993
|
+
"openai/gpt-4o:extended": {
|
|
6994
|
+
inputPerToken: 6e-6,
|
|
6995
|
+
outputPerToken: 18e-6
|
|
6996
|
+
},
|
|
6997
|
+
"openai/gpt-5": {
|
|
6998
|
+
inputPerToken: 125e-8,
|
|
6999
|
+
outputPerToken: 1e-5
|
|
7000
|
+
},
|
|
7001
|
+
"openai/gpt-5-chat": {
|
|
7002
|
+
inputPerToken: 125e-8,
|
|
7003
|
+
outputPerToken: 1e-5
|
|
7004
|
+
},
|
|
7005
|
+
"openai/gpt-5-codex": {
|
|
7006
|
+
inputPerToken: 125e-8,
|
|
7007
|
+
outputPerToken: 1e-5
|
|
7008
|
+
},
|
|
7009
|
+
"openai/gpt-5-image": {
|
|
7010
|
+
inputPerToken: 1e-5,
|
|
7011
|
+
outputPerToken: 1e-5
|
|
7012
|
+
},
|
|
7013
|
+
"openai/gpt-5-image-mini": {
|
|
7014
|
+
inputPerToken: 25e-7,
|
|
7015
|
+
outputPerToken: 2e-6
|
|
7016
|
+
},
|
|
7017
|
+
"openai/gpt-5-mini": {
|
|
7018
|
+
inputPerToken: 25e-8,
|
|
7019
|
+
outputPerToken: 2e-6
|
|
7020
|
+
},
|
|
7021
|
+
"openai/gpt-5-nano": {
|
|
7022
|
+
inputPerToken: 5e-8,
|
|
7023
|
+
outputPerToken: 4e-7
|
|
7024
|
+
},
|
|
7025
|
+
"openai/gpt-5-pro": {
|
|
7026
|
+
inputPerToken: 15e-6,
|
|
7027
|
+
outputPerToken: 12e-5
|
|
7028
|
+
},
|
|
7029
|
+
"openai/gpt-5.1": {
|
|
7030
|
+
inputPerToken: 125e-8,
|
|
7031
|
+
outputPerToken: 1e-5
|
|
7032
|
+
},
|
|
7033
|
+
"openai/gpt-5.1-chat": {
|
|
7034
|
+
inputPerToken: 125e-8,
|
|
7035
|
+
outputPerToken: 1e-5
|
|
7036
|
+
},
|
|
7037
|
+
"openai/gpt-5.1-codex": {
|
|
7038
|
+
inputPerToken: 125e-8,
|
|
7039
|
+
outputPerToken: 1e-5
|
|
7040
|
+
},
|
|
7041
|
+
"openai/gpt-5.1-codex-max": {
|
|
7042
|
+
inputPerToken: 125e-8,
|
|
7043
|
+
outputPerToken: 1e-5
|
|
7044
|
+
},
|
|
7045
|
+
"openai/gpt-5.1-codex-mini": {
|
|
7046
|
+
inputPerToken: 25e-8,
|
|
7047
|
+
outputPerToken: 2e-6
|
|
7048
|
+
},
|
|
7049
|
+
"openai/gpt-5.2": {
|
|
7050
|
+
inputPerToken: 175e-8,
|
|
7051
|
+
outputPerToken: 14e-6
|
|
7052
|
+
},
|
|
7053
|
+
"openai/gpt-5.2-chat": {
|
|
7054
|
+
inputPerToken: 175e-8,
|
|
7055
|
+
outputPerToken: 14e-6
|
|
7056
|
+
},
|
|
7057
|
+
"openai/gpt-5.2-codex": {
|
|
7058
|
+
inputPerToken: 175e-8,
|
|
7059
|
+
outputPerToken: 14e-6
|
|
7060
|
+
},
|
|
7061
|
+
"openai/gpt-5.2-pro": {
|
|
7062
|
+
inputPerToken: 21e-6,
|
|
7063
|
+
outputPerToken: 168e-6
|
|
7064
|
+
},
|
|
7065
|
+
"openai/gpt-5.3-codex": {
|
|
7066
|
+
inputPerToken: 175e-8,
|
|
7067
|
+
outputPerToken: 14e-6
|
|
7068
|
+
},
|
|
7069
|
+
"openai/gpt-audio": {
|
|
7070
|
+
inputPerToken: 25e-7,
|
|
7071
|
+
outputPerToken: 1e-5
|
|
7072
|
+
},
|
|
7073
|
+
"openai/gpt-audio-mini": {
|
|
7074
|
+
inputPerToken: 6e-7,
|
|
7075
|
+
outputPerToken: 24e-7
|
|
7076
|
+
},
|
|
7077
|
+
"openai/gpt-oss-120b": {
|
|
7078
|
+
inputPerToken: 39e-9,
|
|
7079
|
+
outputPerToken: 19e-8
|
|
7080
|
+
},
|
|
7081
|
+
"openai/gpt-oss-120b:exacto": {
|
|
7082
|
+
inputPerToken: 39e-9,
|
|
7083
|
+
outputPerToken: 19e-8
|
|
7084
|
+
},
|
|
7085
|
+
"openai/gpt-oss-20b": {
|
|
7086
|
+
inputPerToken: 3e-8,
|
|
7087
|
+
outputPerToken: 14e-8
|
|
7088
|
+
},
|
|
7089
|
+
"openai/gpt-oss-safeguard-20b": {
|
|
7090
|
+
inputPerToken: 75e-9,
|
|
7091
|
+
outputPerToken: 3e-7
|
|
7092
|
+
},
|
|
7093
|
+
"openai/o1": {
|
|
7094
|
+
inputPerToken: 15e-6,
|
|
7095
|
+
outputPerToken: 6e-5
|
|
7096
|
+
},
|
|
7097
|
+
"openai/o1-pro": {
|
|
7098
|
+
inputPerToken: 15e-5,
|
|
7099
|
+
outputPerToken: 6e-4
|
|
7100
|
+
},
|
|
7101
|
+
"openai/o3": {
|
|
7102
|
+
inputPerToken: 2e-6,
|
|
7103
|
+
outputPerToken: 8e-6
|
|
7104
|
+
},
|
|
7105
|
+
"openai/o3-deep-research": {
|
|
7106
|
+
inputPerToken: 1e-5,
|
|
7107
|
+
outputPerToken: 4e-5
|
|
7108
|
+
},
|
|
7109
|
+
"openai/o3-mini": {
|
|
7110
|
+
inputPerToken: 11e-7,
|
|
7111
|
+
outputPerToken: 44e-7
|
|
7112
|
+
},
|
|
7113
|
+
"openai/o3-mini-high": {
|
|
7114
|
+
inputPerToken: 11e-7,
|
|
7115
|
+
outputPerToken: 44e-7
|
|
7116
|
+
},
|
|
7117
|
+
"openai/o3-pro": {
|
|
7118
|
+
inputPerToken: 2e-5,
|
|
7119
|
+
outputPerToken: 8e-5
|
|
7120
|
+
},
|
|
7121
|
+
"openai/o4-mini": {
|
|
7122
|
+
inputPerToken: 11e-7,
|
|
7123
|
+
outputPerToken: 44e-7
|
|
7124
|
+
},
|
|
7125
|
+
"openai/o4-mini-deep-research": {
|
|
7126
|
+
inputPerToken: 2e-6,
|
|
7127
|
+
outputPerToken: 8e-6
|
|
7128
|
+
},
|
|
7129
|
+
"openai/o4-mini-high": {
|
|
7130
|
+
inputPerToken: 11e-7,
|
|
7131
|
+
outputPerToken: 44e-7
|
|
7132
|
+
},
|
|
7133
|
+
"opengvlab/internvl3-78b": {
|
|
7134
|
+
inputPerToken: 15e-8,
|
|
7135
|
+
outputPerToken: 6e-7
|
|
7136
|
+
},
|
|
7137
|
+
"perplexity/sonar": {
|
|
7138
|
+
inputPerToken: 1e-6,
|
|
7139
|
+
outputPerToken: 1e-6
|
|
7140
|
+
},
|
|
7141
|
+
"perplexity/sonar-deep-research": {
|
|
7142
|
+
inputPerToken: 2e-6,
|
|
7143
|
+
outputPerToken: 8e-6
|
|
7144
|
+
},
|
|
7145
|
+
"perplexity/sonar-pro": {
|
|
7146
|
+
inputPerToken: 3e-6,
|
|
7147
|
+
outputPerToken: 15e-6
|
|
7148
|
+
},
|
|
7149
|
+
"perplexity/sonar-pro-search": {
|
|
7150
|
+
inputPerToken: 3e-6,
|
|
7151
|
+
outputPerToken: 15e-6
|
|
7152
|
+
},
|
|
7153
|
+
"perplexity/sonar-reasoning-pro": {
|
|
7154
|
+
inputPerToken: 2e-6,
|
|
7155
|
+
outputPerToken: 8e-6
|
|
7156
|
+
},
|
|
7157
|
+
"prime-intellect/intellect-3": {
|
|
7158
|
+
inputPerToken: 2e-7,
|
|
7159
|
+
outputPerToken: 11e-7
|
|
7160
|
+
},
|
|
7161
|
+
"qwen/qwen-2.5-72b-instruct": {
|
|
7162
|
+
inputPerToken: 12e-8,
|
|
7163
|
+
outputPerToken: 39e-8
|
|
7164
|
+
},
|
|
7165
|
+
"qwen/qwen-2.5-7b-instruct": {
|
|
7166
|
+
inputPerToken: 4e-8,
|
|
7167
|
+
outputPerToken: 1e-7
|
|
7168
|
+
},
|
|
7169
|
+
"qwen/qwen-2.5-coder-32b-instruct": {
|
|
7170
|
+
inputPerToken: 20000000000000002e-23,
|
|
7171
|
+
outputPerToken: 20000000000000002e-23
|
|
7172
|
+
},
|
|
7173
|
+
"qwen/qwen-2.5-vl-7b-instruct": {
|
|
7174
|
+
inputPerToken: 20000000000000002e-23,
|
|
7175
|
+
outputPerToken: 20000000000000002e-23
|
|
7176
|
+
},
|
|
7177
|
+
"qwen/qwen-max": {
|
|
7178
|
+
inputPerToken: 16e-7,
|
|
7179
|
+
outputPerToken: 64e-7
|
|
7180
|
+
},
|
|
7181
|
+
"qwen/qwen-plus": {
|
|
7182
|
+
inputPerToken: 4e-7,
|
|
7183
|
+
outputPerToken: 12e-7
|
|
7184
|
+
},
|
|
7185
|
+
"qwen/qwen-plus-2025-07-28": {
|
|
7186
|
+
inputPerToken: 4e-7,
|
|
7187
|
+
outputPerToken: 12e-7
|
|
7188
|
+
},
|
|
7189
|
+
"qwen/qwen-plus-2025-07-28:thinking": {
|
|
7190
|
+
inputPerToken: 4e-7,
|
|
7191
|
+
outputPerToken: 12e-7
|
|
7192
|
+
},
|
|
7193
|
+
"qwen/qwen-turbo": {
|
|
7194
|
+
inputPerToken: 5e-8,
|
|
7195
|
+
outputPerToken: 2e-7
|
|
7196
|
+
},
|
|
7197
|
+
"qwen/qwen-vl-max": {
|
|
7198
|
+
inputPerToken: 8e-7,
|
|
7199
|
+
outputPerToken: 32e-7
|
|
7200
|
+
},
|
|
7201
|
+
"qwen/qwen-vl-plus": {
|
|
7202
|
+
inputPerToken: 21e-8,
|
|
7203
|
+
outputPerToken: 63e-8
|
|
7204
|
+
},
|
|
7205
|
+
"qwen/qwen2.5-coder-7b-instruct": {
|
|
7206
|
+
inputPerToken: 3e-8,
|
|
7207
|
+
outputPerToken: 9e-8
|
|
7208
|
+
},
|
|
7209
|
+
"qwen/qwen2.5-vl-32b-instruct": {
|
|
7210
|
+
inputPerToken: 2e-7,
|
|
7211
|
+
outputPerToken: 6e-7
|
|
7212
|
+
},
|
|
7213
|
+
"qwen/qwen2.5-vl-72b-instruct": {
|
|
7214
|
+
inputPerToken: 8e-7,
|
|
7215
|
+
outputPerToken: 8e-7
|
|
7216
|
+
},
|
|
7217
|
+
"qwen/qwen3-14b": {
|
|
7218
|
+
inputPerToken: 6e-8,
|
|
7219
|
+
outputPerToken: 24e-8
|
|
7220
|
+
},
|
|
7221
|
+
"qwen/qwen3-235b-a22b": {
|
|
7222
|
+
inputPerToken: 455e-9,
|
|
7223
|
+
outputPerToken: 182e-8
|
|
7224
|
+
},
|
|
7225
|
+
"qwen/qwen3-235b-a22b-2507": {
|
|
7226
|
+
inputPerToken: 71e-9,
|
|
7227
|
+
outputPerToken: 1e-7
|
|
7228
|
+
},
|
|
7229
|
+
"qwen/qwen3-30b-a3b": {
|
|
7230
|
+
inputPerToken: 8e-8,
|
|
7231
|
+
outputPerToken: 28e-8
|
|
7232
|
+
},
|
|
7233
|
+
"qwen/qwen3-30b-a3b-instruct-2507": {
|
|
7234
|
+
inputPerToken: 9e-8,
|
|
7235
|
+
outputPerToken: 3e-7
|
|
7236
|
+
},
|
|
7237
|
+
"qwen/qwen3-30b-a3b-thinking-2507": {
|
|
7238
|
+
inputPerToken: 51e-9,
|
|
7239
|
+
outputPerToken: 34e-8
|
|
7240
|
+
},
|
|
7241
|
+
"qwen/qwen3-32b": {
|
|
7242
|
+
inputPerToken: 8e-8,
|
|
7243
|
+
outputPerToken: 24e-8
|
|
7244
|
+
},
|
|
7245
|
+
"qwen/qwen3-8b": {
|
|
7246
|
+
inputPerToken: 5e-8,
|
|
7247
|
+
outputPerToken: 4e-7
|
|
7248
|
+
},
|
|
7249
|
+
"qwen/qwen3-coder": {
|
|
7250
|
+
inputPerToken: 22e-8,
|
|
7251
|
+
outputPerToken: 1e-6
|
|
7252
|
+
},
|
|
7253
|
+
"qwen/qwen3-coder-30b-a3b-instruct": {
|
|
7254
|
+
inputPerToken: 7e-8,
|
|
7255
|
+
outputPerToken: 27e-8
|
|
7256
|
+
},
|
|
7257
|
+
"qwen/qwen3-coder-flash": {
|
|
7258
|
+
inputPerToken: 3e-7,
|
|
7259
|
+
outputPerToken: 15e-7
|
|
7260
|
+
},
|
|
7261
|
+
"qwen/qwen3-coder-next": {
|
|
7262
|
+
inputPerToken: 12e-8,
|
|
7263
|
+
outputPerToken: 75e-8
|
|
7264
|
+
},
|
|
7265
|
+
"qwen/qwen3-coder-plus": {
|
|
7266
|
+
inputPerToken: 1e-6,
|
|
7267
|
+
outputPerToken: 5e-6
|
|
7268
|
+
},
|
|
7269
|
+
"qwen/qwen3-coder:exacto": {
|
|
7270
|
+
inputPerToken: 22e-8,
|
|
7271
|
+
outputPerToken: 18e-7
|
|
7272
|
+
},
|
|
7273
|
+
"qwen/qwen3-max": {
|
|
7274
|
+
inputPerToken: 12e-7,
|
|
7275
|
+
outputPerToken: 6e-6
|
|
7276
|
+
},
|
|
7277
|
+
"qwen/qwen3-max-thinking": {
|
|
7278
|
+
inputPerToken: 12e-7,
|
|
7279
|
+
outputPerToken: 6e-6
|
|
7280
|
+
},
|
|
7281
|
+
"qwen/qwen3-next-80b-a3b-instruct": {
|
|
7282
|
+
inputPerToken: 9e-8,
|
|
7283
|
+
outputPerToken: 11e-7
|
|
7284
|
+
},
|
|
7285
|
+
"qwen/qwen3-next-80b-a3b-thinking": {
|
|
7286
|
+
inputPerToken: 15e-8,
|
|
7287
|
+
outputPerToken: 12e-7
|
|
7288
|
+
},
|
|
7289
|
+
"qwen/qwen3-vl-235b-a22b-instruct": {
|
|
7290
|
+
inputPerToken: 2e-7,
|
|
7291
|
+
outputPerToken: 88e-8
|
|
7292
|
+
},
|
|
7293
|
+
"qwen/qwen3-vl-30b-a3b-instruct": {
|
|
7294
|
+
inputPerToken: 13e-8,
|
|
7295
|
+
outputPerToken: 52e-8
|
|
7296
|
+
},
|
|
7297
|
+
"qwen/qwen3-vl-32b-instruct": {
|
|
7298
|
+
inputPerToken: 104e-9,
|
|
7299
|
+
outputPerToken: 416e-9
|
|
7300
|
+
},
|
|
7301
|
+
"qwen/qwen3-vl-8b-instruct": {
|
|
7302
|
+
inputPerToken: 8e-8,
|
|
7303
|
+
outputPerToken: 5e-7
|
|
7304
|
+
},
|
|
7305
|
+
"qwen/qwen3-vl-8b-thinking": {
|
|
7306
|
+
inputPerToken: 117e-9,
|
|
7307
|
+
outputPerToken: 1365e-9
|
|
7308
|
+
},
|
|
7309
|
+
"qwen/qwen3.5-122b-a10b": {
|
|
7310
|
+
inputPerToken: 4e-7,
|
|
7311
|
+
outputPerToken: 32e-7
|
|
7312
|
+
},
|
|
7313
|
+
"qwen/qwen3.5-27b": {
|
|
7314
|
+
inputPerToken: 3e-7,
|
|
7315
|
+
outputPerToken: 24e-7
|
|
7316
|
+
},
|
|
7317
|
+
"qwen/qwen3.5-35b-a3b": {
|
|
7318
|
+
inputPerToken: 25e-8,
|
|
7319
|
+
outputPerToken: 2e-6
|
|
7320
|
+
},
|
|
7321
|
+
"qwen/qwen3.5-397b-a17b": {
|
|
7322
|
+
inputPerToken: 55e-8,
|
|
7323
|
+
outputPerToken: 35e-7
|
|
7324
|
+
},
|
|
7325
|
+
"qwen/qwen3.5-flash-02-23": {
|
|
7326
|
+
inputPerToken: 1e-7,
|
|
7327
|
+
outputPerToken: 4e-7
|
|
7328
|
+
},
|
|
7329
|
+
"qwen/qwen3.5-plus-02-15": {
|
|
7330
|
+
inputPerToken: 4e-7,
|
|
7331
|
+
outputPerToken: 24e-7
|
|
7332
|
+
},
|
|
7333
|
+
"qwen/qwq-32b": {
|
|
7334
|
+
inputPerToken: 15e-8,
|
|
7335
|
+
outputPerToken: 4e-7
|
|
7336
|
+
},
|
|
7337
|
+
"raifle/sorcererlm-8x22b": {
|
|
7338
|
+
inputPerToken: 45e-7,
|
|
7339
|
+
outputPerToken: 45e-7
|
|
7340
|
+
},
|
|
7341
|
+
"relace/relace-apply-3": {
|
|
7342
|
+
inputPerToken: 85e-8,
|
|
7343
|
+
outputPerToken: 125e-8
|
|
7344
|
+
},
|
|
7345
|
+
"relace/relace-search": {
|
|
7346
|
+
inputPerToken: 1e-6,
|
|
7347
|
+
outputPerToken: 3e-6
|
|
7348
|
+
},
|
|
7349
|
+
"sao10k/l3-euryale-70b": {
|
|
7350
|
+
inputPerToken: 148e-8,
|
|
7351
|
+
outputPerToken: 148e-8
|
|
7352
|
+
},
|
|
7353
|
+
"sao10k/l3-lunaris-8b": {
|
|
7354
|
+
inputPerToken: 4e-8,
|
|
7355
|
+
outputPerToken: 5e-8
|
|
7356
|
+
},
|
|
7357
|
+
"sao10k/l3.1-70b-hanami-x1": {
|
|
7358
|
+
inputPerToken: 3e-6,
|
|
7359
|
+
outputPerToken: 3e-6
|
|
7360
|
+
},
|
|
7361
|
+
"sao10k/l3.1-euryale-70b": {
|
|
7362
|
+
inputPerToken: 65e-8,
|
|
7363
|
+
outputPerToken: 75e-8
|
|
7364
|
+
},
|
|
7365
|
+
"sao10k/l3.3-euryale-70b": {
|
|
7366
|
+
inputPerToken: 65e-8,
|
|
7367
|
+
outputPerToken: 75e-8
|
|
7368
|
+
},
|
|
7369
|
+
"stepfun/step-3.5-flash": {
|
|
7370
|
+
inputPerToken: 1e-7,
|
|
7371
|
+
outputPerToken: 3e-7
|
|
7372
|
+
},
|
|
7373
|
+
"switchpoint/router": {
|
|
7374
|
+
inputPerToken: 85e-8,
|
|
7375
|
+
outputPerToken: 34e-7
|
|
7376
|
+
},
|
|
7377
|
+
"tencent/hunyuan-a13b-instruct": {
|
|
7378
|
+
inputPerToken: 14e-8,
|
|
7379
|
+
outputPerToken: 57e-8
|
|
7380
|
+
},
|
|
7381
|
+
"thedrummer/cydonia-24b-v4.1": {
|
|
7382
|
+
inputPerToken: 3e-7,
|
|
7383
|
+
outputPerToken: 5e-7
|
|
7384
|
+
},
|
|
7385
|
+
"thedrummer/rocinante-12b": {
|
|
7386
|
+
inputPerToken: 17e-8,
|
|
7387
|
+
outputPerToken: 43e-8
|
|
7388
|
+
},
|
|
7389
|
+
"thedrummer/skyfall-36b-v2": {
|
|
7390
|
+
inputPerToken: 55e-8,
|
|
7391
|
+
outputPerToken: 8e-7
|
|
7392
|
+
},
|
|
7393
|
+
"thedrummer/unslopnemo-12b": {
|
|
7394
|
+
inputPerToken: 4e-7,
|
|
7395
|
+
outputPerToken: 4e-7
|
|
7396
|
+
},
|
|
7397
|
+
"tngtech/deepseek-r1t2-chimera": {
|
|
7398
|
+
inputPerToken: 25e-8,
|
|
7399
|
+
outputPerToken: 85e-8
|
|
7400
|
+
},
|
|
7401
|
+
"undi95/remm-slerp-l2-13b": {
|
|
7402
|
+
inputPerToken: 45e-8,
|
|
7403
|
+
outputPerToken: 65e-8
|
|
7404
|
+
},
|
|
7405
|
+
"writer/palmyra-x5": {
|
|
7406
|
+
inputPerToken: 6e-7,
|
|
7407
|
+
outputPerToken: 6e-6
|
|
7408
|
+
},
|
|
7409
|
+
"xai/grok-3": {
|
|
7410
|
+
inputPerToken: 3e-6,
|
|
7411
|
+
outputPerToken: 15e-6
|
|
7412
|
+
},
|
|
7413
|
+
"xai/grok-3-beta": {
|
|
7414
|
+
inputPerToken: 3e-6,
|
|
7415
|
+
outputPerToken: 15e-6
|
|
7416
|
+
},
|
|
7417
|
+
"xai/grok-3-mini": {
|
|
7418
|
+
inputPerToken: 3e-7,
|
|
7419
|
+
outputPerToken: 5e-7
|
|
7420
|
+
},
|
|
7421
|
+
"xai/grok-3-mini-beta": {
|
|
7422
|
+
inputPerToken: 3e-7,
|
|
7423
|
+
outputPerToken: 5e-7
|
|
7424
|
+
},
|
|
7425
|
+
"xai/grok-4": {
|
|
7426
|
+
inputPerToken: 3e-6,
|
|
7427
|
+
outputPerToken: 15e-6
|
|
7428
|
+
},
|
|
7429
|
+
"xai/grok-4-fast": {
|
|
7430
|
+
inputPerToken: 2e-7,
|
|
7431
|
+
outputPerToken: 5e-7
|
|
7432
|
+
},
|
|
7433
|
+
"xai/grok-4.1-fast": {
|
|
7434
|
+
inputPerToken: 2e-7,
|
|
7435
|
+
outputPerToken: 5e-7
|
|
7436
|
+
},
|
|
7437
|
+
"xai/grok-code-fast-1": {
|
|
7438
|
+
inputPerToken: 2e-7,
|
|
7439
|
+
outputPerToken: 15e-7
|
|
7440
|
+
},
|
|
7441
|
+
"xiaomi/mimo-v2-flash": {
|
|
7442
|
+
inputPerToken: 9e-8,
|
|
7443
|
+
outputPerToken: 29e-8
|
|
7444
|
+
},
|
|
7445
|
+
"z-ai/glm-4-32b": {
|
|
7446
|
+
inputPerToken: 1e-7,
|
|
7447
|
+
outputPerToken: 1e-7
|
|
7448
|
+
},
|
|
7449
|
+
"z-ai/glm-4.5": {
|
|
7450
|
+
inputPerToken: 55e-8,
|
|
7451
|
+
outputPerToken: 2e-6
|
|
7452
|
+
},
|
|
7453
|
+
"z-ai/glm-4.5-air": {
|
|
7454
|
+
inputPerToken: 13e-8,
|
|
7455
|
+
outputPerToken: 85e-8
|
|
7456
|
+
},
|
|
7457
|
+
"z-ai/glm-4.5v": {
|
|
7458
|
+
inputPerToken: 6e-7,
|
|
7459
|
+
outputPerToken: 18e-7
|
|
7460
|
+
},
|
|
7461
|
+
"z-ai/glm-4.6": {
|
|
7462
|
+
inputPerToken: 35e-8,
|
|
7463
|
+
outputPerToken: 171e-8
|
|
7464
|
+
},
|
|
7465
|
+
"z-ai/glm-4.6:exacto": {
|
|
7466
|
+
inputPerToken: 44e-8,
|
|
7467
|
+
outputPerToken: 176e-8
|
|
7468
|
+
},
|
|
7469
|
+
"z-ai/glm-4.6v": {
|
|
7470
|
+
inputPerToken: 3e-7,
|
|
7471
|
+
outputPerToken: 9e-7
|
|
7472
|
+
},
|
|
7473
|
+
"z-ai/glm-4.7": {
|
|
7474
|
+
inputPerToken: 3e-7,
|
|
7475
|
+
outputPerToken: 14e-7
|
|
7476
|
+
},
|
|
7477
|
+
"z-ai/glm-4.7-flash": {
|
|
7478
|
+
inputPerToken: 6e-8,
|
|
7479
|
+
outputPerToken: 4e-7
|
|
7480
|
+
},
|
|
7481
|
+
"z-ai/glm-5": {
|
|
7482
|
+
inputPerToken: 95e-8,
|
|
7483
|
+
outputPerToken: 255e-8
|
|
7484
|
+
}
|
|
7485
|
+
}
|
|
7486
|
+
};
|
|
7487
|
+
|
|
7488
|
+
// src/pricing/lookup.ts
|
|
7489
|
+
// Pricing table keyed by "<provider>/<model>" catalog ids.
var models = catalog_default.models;
// Reverse index: bare model name (everything after the first "/") -> the first
// catalog key that carries that name. Used for cross-provider fallback lookups.
var modelNameIndex = /* @__PURE__ */ new Map();
Object.keys(models).forEach((catalogKey) => {
  const slashAt = catalogKey.indexOf("/");
  const bareName = slashAt === -1 ? "" : catalogKey.slice(slashAt + 1);
  // Keys without a provider prefix yield an empty name and are not indexed;
  // the first key seen for a given name wins.
  if (bareName && !modelNameIndex.has(bareName)) {
    modelNameIndex.set(bareName, catalogKey);
  }
});
|
|
7497
|
+
/**
 * Resolve pricing for a "<provider>/<model>" id.
 *
 * Lookup order: exact catalog key, then the same model name under the
 * "openai/" prefix, then any catalog entry sharing the bare model name
 * (via modelNameIndex).
 *
 * @param {string} providerId - Identifier of the form "<provider>/<model>".
 * @returns {object|undefined} The matching catalog entry, or undefined when
 *   nothing matches or the id has no "/<model>" part.
 */
function lookupPricing(providerId) {
  const direct = models[providerId];
  if (direct) return direct;

  const modelName = providerId.split("/").slice(1).join("/");
  if (!modelName) return void 0;

  const openaiEntry = models[`openai/${modelName}`];
  if (openaiEntry) return openaiEntry;

  const fallbackKey = modelNameIndex.get(modelName);
  return fallbackKey ? models[fallbackKey] : void 0;
}
|
|
7507
|
+
/**
 * Estimate the cost of a single call from per-token prices.
 *
 * @param {{inputPerToken: number, outputPerToken: number}} pricing - Per-token prices.
 * @param {number} promptTokens - Number of input tokens.
 * @param {number} completionTokens - Number of output tokens.
 * @returns {number} Input cost plus output cost.
 */
function estimateCost(pricing, promptTokens, completionTokens) {
  const inputCost = pricing.inputPerToken * promptTokens;
  const outputCost = pricing.outputPerToken * completionTokens;
  return inputCost + outputCost;
}
|
|
7510
|
+
|
|
7511
|
+
// src/scorers/cost.ts
|
|
7512
|
+
/**
 * Scorer that reports the estimated cost of a run.
 *
 * Missing usage figures count as 0 tokens. When no pricing entry can be
 * resolved for providerId the score is -1 and details.estimatedUsd is null.
 *
 * @param {{result: object}} context - Scoring context; only result.usage is read.
 * @param {string} providerId - Id passed to lookupPricing.
 * @returns {{name: string, value: number, details: object}}
 */
var costScorer = ({ result }, providerId) => {
  const usage = result.usage;
  const promptTokens = usage?.promptTokens ?? 0;
  const completionTokens = usage?.completionTokens ?? 0;
  const tokenCounts = {
    promptTokens,
    completionTokens,
    totalTokens: promptTokens + completionTokens
  };

  const pricing = lookupPricing(providerId);
  if (pricing) {
    const usd = estimateCost(pricing, promptTokens, completionTokens);
    return {
      name: "cost",
      value: usd,
      details: { estimatedUsd: usd, ...tokenCounts }
    };
  }

  return {
    name: "cost",
    value: -1,
    details: {
      estimatedUsd: null,
      ...tokenCounts,
      note: "No pricing data available for this model"
    }
  };
};
|
|
7542
|
+
|
|
7543
|
+
// src/utils/deep-equal.ts
|
|
7544
|
+
/**
 * Lenient structural comparison of an expected value against an actual one.
 *
 * - Strings compare case-insensitively after trimming.
 * - Arrays must have the same length and pairwise-equal elements.
 * - Objects match when every key of `expected` exists in `actual` with an
 *   equal value; extra keys on `actual` are ignored.
 * - An array never equals a non-array object (and vice versa), even if the
 *   object happens to have matching index keys.
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value under test.
 * @returns {boolean} True when `actual` satisfies `expected`.
 */
function deepEqual(expected, actual) {
  if (expected === actual) return true;
  if (typeof expected === "string" && typeof actual === "string") {
    return expected.trim().toLowerCase() === actual.trim().toLowerCase();
  }
  if (typeof expected !== typeof actual) return false;
  // Identity was ruled out above, so a lone null or any remaining pair of
  // primitives/functions cannot be equal.
  if (expected === null || actual === null) return false;
  if (typeof expected !== "object") return false;

  // Without this guard a mixed array/object pair falls through to the
  // key-subset check, so [1, 2] would equal {0: 1, 1: 2} and {} any array.
  const expectedIsArray = Array.isArray(expected);
  if (expectedIsArray !== Array.isArray(actual)) return false;

  if (expectedIsArray) {
    if (expected.length !== actual.length) return false;
    return expected.every((item, index) => deepEqual(item, actual[index]));
  }
  return Object.keys(expected).every(
    (key) => key in actual && deepEqual(expected[key], actual[key])
  );
}
|
|
7563
|
+
|
|
7564
|
+
// src/scorers/correctness.ts
|
|
7565
|
+
/**
 * Scorer that checks the output against task.expected.
 *
 * Returns 0.5 when the task has no expected value, otherwise 1 for a match
 * (per deepEqual, after normalizeOutput) and 0 for a mismatch. The details
 * always report the raw, un-normalized output.
 *
 * @param {{task: object, result: object}} context - Scoring context.
 * @returns {{name: string, value: number, details: object}}
 */
var correctnessScorer = ({ task, result }) => {
  const { expected } = task;
  if (expected === void 0) {
    return { name: "correctness", value: 0.5, details: { reason: "no expected value" } };
  }
  const comparable = normalizeOutput(expected, result.output);
  const value = deepEqual(expected, comparable) ? 1 : 0;
  return {
    name: "correctness",
    value,
    details: { expected, actual: result.output }
  };
};
|
|
7577
|
+
/**
 * Unwrap an array that was returned inside a wrapper object.
 *
 * When an array is expected but the output is a non-array object holding
 * exactly one array-valued property, that array is returned. In every other
 * case the output is returned unchanged.
 *
 * @param {*} expected - The expected value; only whether it is an array matters.
 * @param {*} actual - The raw output.
 * @returns {*} The unwrapped array or `actual` itself.
 */
function normalizeOutput(expected, actual) {
  const isWrapperCandidate =
    actual !== null && typeof actual === "object" && !Array.isArray(actual);
  if (!Array.isArray(expected) || !isWrapperCandidate) {
    return actual;
  }
  const nestedArrays = Object.values(actual).filter((candidate) => Array.isArray(candidate));
  // Zero or several arrays is ambiguous, so only a single one is unwrapped.
  return nestedArrays.length === 1 ? nestedArrays[0] : actual;
}
|
|
7587
|
+
|
|
7588
|
+
// src/scorers/schema-correctness.ts
|
|
7589
|
+
/**
 * Scorer that validates the output against task.schema via schema.safeParse.
 *
 * - No schema on the task: value -1.
 * - String output is JSON-parsed first; invalid JSON scores 0.
 * - If validation fails and the data is a non-array object with exactly one
 *   array-valued property, that array is validated as a second attempt.
 *
 * @param {{task: object, result: object}} context - Scoring context.
 * @returns {{name: string, value: number, details: object}}
 */
var schemaCorrectnessScorer = ({ task, result }) => {
  const name = "schema-correctness";
  const { schema } = task;
  if (!schema) {
    return { name, value: -1, details: { reason: "no schema defined" } };
  }

  let candidate = result.output;
  if (typeof candidate === "string") {
    try {
      candidate = JSON.parse(candidate);
    } catch {
      return { name, value: 0, details: { reason: "output is not valid JSON" } };
    }
  }

  let outcome = schema.safeParse(candidate);
  const isPlainContainer =
    candidate !== null && typeof candidate === "object" && !Array.isArray(candidate);
  if (!outcome.success && isPlainContainer) {
    const nestedArrays = Object.values(candidate).filter((entry) => Array.isArray(entry));
    if (nestedArrays.length === 1) {
      const retry = schema.safeParse(nestedArrays[0]);
      if (retry.success) outcome = retry;
    }
  }

  if (outcome.success) {
    return { name, value: 1, details: { valid: true } };
  }
  return {
    name,
    value: 0,
    details: { valid: false, errors: outcome.error.issues.map((issue) => issue.message) }
  };
};
|
|
7619
|
+
|
|
7620
|
+
// src/scorers/fuzzy-similarity.ts
|
|
7621
|
+
/**
 * Scorer that measures token overlap between expected and actual output.
 *
 * Both values are lower-cased text (via stringify), split into word tokens,
 * and compared with Jaccard similarity rounded to two decimals. Returns -1
 * when the task has no expected value.
 *
 * @param {{task: object, result: object}} context - Scoring context.
 * @returns {{name: string, value: number, details: object}}
 */
var fuzzySimilarityScorer = ({ task, result }) => {
  const name = "fuzzy-similarity";
  if (task.expected === void 0) {
    return { name, value: -1, details: { reason: "no expected value" } };
  }

  const expectedText = stringify(task.expected);
  const actualText = stringify(result.output);
  const expectedTokens = tokenize(expectedText);
  const actualTokens = tokenize(actualText);
  const rawScore = jaccardSimilarity(expectedTokens, actualTokens);

  return {
    name,
    value: Math.round(rawScore * 100) / 100,
    details: {
      method: "jaccard",
      expectedTokens: expectedTokens.size,
      actualTokens: actualTokens.size
    }
  };
};
|
|
7636
|
+
/**
 * Convert any value to lower-cased text for token comparison.
 *
 * Strings are lower-cased as-is; everything else is JSON-serialized first.
 * JSON.stringify returns undefined (not a string) for undefined, functions
 * and symbols; those yield "" so that a missing output does not throw.
 *
 * @param {*} value - Value to convert.
 * @returns {string} Lower-cased textual form of `value`.
 */
function stringify(value) {
  if (typeof value === "string") return value.toLowerCase();
  const json = JSON.stringify(value);
  return (json ?? "").toLowerCase();
}
|
|
7640
|
+
/**
 * Split text into its set of distinct word tokens (runs of \w characters).
 *
 * @param {string} text - Text to tokenize.
 * @returns {Set<string>} Unique tokens in first-seen order; empty when none match.
 */
function tokenize(text) {
  const tokens = new Set();
  for (const found of text.matchAll(/\w+/g)) {
    tokens.add(found[0]);
  }
  return tokens;
}
|
|
7643
|
+
/**
 * Jaccard similarity of two sets: |A ∩ B| / |A ∪ B|.
 *
 * @param {Set<*>} a7 - First set.
 * @param {Set<*>} b3 - Second set.
 * @returns {number} A value in [0, 1]; two empty sets count as identical (1).
 */
function jaccardSimilarity(a7, b3) {
  if (a7.size === 0 && b3.size === 0) return 1;
  const shared = [...a7].filter((token) => b3.has(token)).length;
  const unionSize = a7.size + b3.size - shared;
  if (unionSize === 0) return 1;
  return shared / unionSize;
}
|
|
7652
|
+
|
|
7653
|
+
// src/scorers/llm-judge.ts
|
|
7654
|
+
import OpenAI2, { AzureOpenAI as AzureOpenAI2 } from "openai";
|
|
7655
|
+
|
|
7656
|
+
// src/providers/openai.ts
|
|
7657
|
+
import OpenAI, { AzureOpenAI } from "openai";
|
|
7658
|
+
import { zodToJsonSchema as zodToJsonSchema2 } from "zod-to-json-schema";
|
|
7659
|
+
|
|
7660
|
+
// src/providers/shared.ts
|
|
7661
|
+
import { zodToJsonSchema } from "zod-to-json-schema";
|
|
7662
|
+
|
|
7663
|
+
// src/providers/openai.ts
|
|
7664
|
+
// Default request timeout in milliseconds (6e4 ms = 60 s); used below as the
// default `timeoutMs` for the judge client.
var REQUEST_TIMEOUT_MS = 6e4;
|
|
7665
|
+
|
|
7666
|
+
// src/scorers/llm-judge.ts
|
|
7667
|
+
var JUDGE_PROMPT = `You are a strict scoring judge. Evaluate the actual output against the expected output on three criteria. Score each from 0.0 to 1.0 using the full range (not just 0, 0.5, 1).
|
|
7668
|
+
|
|
7669
|
+
Criteria:
|
|
7670
|
+
1. Accuracy \u2014 are the facts, entities, and claims correct? Penalize hallucinations or wrong details.
|
|
7671
|
+
2. Completeness \u2014 does it capture all key information from the expected output? Penalize missing points.
|
|
7672
|
+
3. Conciseness \u2014 is it free of unnecessary filler, repetition, or tangential content? Penalize verbosity.
|
|
7673
|
+
|
|
7674
|
+
Respond with ONLY this exact format \u2014 three lines, no other text:
|
|
7675
|
+
accuracy: <number>
|
|
7676
|
+
completeness: <number>
|
|
7677
|
+
conciseness: <number>
|
|
7678
|
+
|
|
7679
|
+
Task: {task}
|
|
7680
|
+
Expected: {expected}
|
|
7681
|
+
Actual: {actual}`;
|
|
7682
|
+
/**
 * Pick an API client and model name for the LLM judge from the environment.
 *
 * Model: configModel, else DUELIST_JUDGE_MODEL, else "gpt-5-mini".
 * Client, in priority order:
 *   1. "gemini*" model with GOOGLE_API_KEY set: OpenAI client pointed at the
 *      Google generativelanguage OpenAI-compatible base URL.
 *   2. OPENAI_API_KEY set: plain OpenAI client.
 *   3. AZURE_OPENAI_API_KEY set: Azure OpenAI client, model as deployment.
 *
 * @param {string} [configModel] - Explicit judge model name.
 * @param {number} [timeoutMs=REQUEST_TIMEOUT_MS] - Client request timeout.
 * @returns {{client: object, model: string}|undefined} Undefined when no
 *   usable API key is present.
 */
function resolveJudgeClient(configModel, timeoutMs = REQUEST_TIMEOUT_MS) {
  const env = process.env;
  const model = configModel ?? env.DUELIST_JUDGE_MODEL ?? "gpt-5-mini";
  const withModel = (client) => ({ client, model });

  if (model.startsWith("gemini") && env.GOOGLE_API_KEY) {
    return withModel(
      new OpenAI2({
        apiKey: env.GOOGLE_API_KEY,
        baseURL: "https://generativelanguage.googleapis.com/v1beta/openai/",
        timeout: timeoutMs
      })
    );
  }

  const openaiKey = env.OPENAI_API_KEY;
  if (openaiKey) {
    return withModel(new OpenAI2({ apiKey: openaiKey, timeout: timeoutMs }));
  }

  // Azure is only considered when no OpenAI key is configured.
  if (env.AZURE_OPENAI_API_KEY) {
    return withModel(
      new AzureOpenAI2({
        apiKey: env.AZURE_OPENAI_API_KEY,
        endpoint: env.AZURE_OPENAI_ENDPOINT,
        apiVersion: env.AZURE_OPENAI_API_VERSION ?? "2024-12-01-preview",
        deployment: model,
        timeout: timeoutMs
      })
    );
  }

  return void 0;
}
|
|
7710
|
+
/**
 * Heuristically detect an error caused by sending a `temperature` parameter
 * the model rejects: the message must mention "temperature" together with
 * one of the rejection phrases below (case-insensitive).
 *
 * @param {*} err - Thrown value; non-Error values are stringified.
 * @returns {boolean} True when the message looks like a temperature rejection.
 */
function isTemperatureError(err) {
  const text = (err instanceof Error ? err.message : String(err)).toLowerCase();
  if (!text.includes("temperature")) return false;
  const rejectionPhrases = ["not supported", "is not allowed", "unsupported", "invalid"];
  return rejectionPhrases.some((phrase) => text.includes(phrase));
}
|
|
7715
|
+
/**
 * Build the "llm-judge-correctness" scorer.
 *
 * The returned async scorer resolves a judge client once (on first use, via
 * resolveJudgeClient), fills JUDGE_PROMPT and asks the judge model to grade
 * the output. If a call fails with what isTemperatureError recognizes as a
 * temperature rejection, it is retried once without `temperature`, and later
 * calls from the same scorer omit it as well.
 *
 * The scorer never throws for judge failures; it returns value -1 with a
 * `reason` in details (no expected value, no API key, or a failed call).
 *
 * @param {string} [judgeModel] - Judge model name passed to resolveJudgeClient.
 * @param {number} [timeoutMs=REQUEST_TIMEOUT_MS] - Judge client timeout.
 * @returns {(context: {task: object, result: object}) => Promise<object>}
 */
function createLlmJudgeScorer(judgeModel, timeoutMs = REQUEST_TIMEOUT_MS) {
  // undefined = not resolved yet, null = resolved but no client available.
  let cached = void 0;
  let useTemperature = true;

  const unscored = (reason) => ({
    name: "llm-judge-correctness",
    value: -1,
    details: { reason }
  });
  const callFailed = (err) =>
    unscored(`judge call failed: ${err instanceof Error ? err.message : String(err)}`);

  return async ({ task, result }) => {
    if (task.expected === void 0) {
      return unscored("no expected value");
    }
    if (cached === void 0) {
      cached = resolveJudgeClient(judgeModel, timeoutMs) ?? null;
    }
    if (!cached) {
      return unscored("no API key available for judge model");
    }
    const { client, model } = cached;

    // Substitute all placeholders in a single pass with a replacer function:
    // a string replacement would interpret "$&", "$1", "$$" in user content,
    // and chained replaces would rewrite a literal "{expected}"/"{actual}"
    // that happens to appear inside the task prompt itself.
    const fields = {
      task: task.prompt,
      expected: JSON.stringify(task.expected),
      actual: JSON.stringify(result.output)
    };
    const prompt = JUDGE_PROMPT.replace(
      /\{(task|expected|actual)\}/g,
      (_placeholder, key) => String(fields[key])
    );
    const messages = [{ role: "user", content: prompt }];

    try {
      const response = await callJudge(client, model, messages, useTemperature);
      return parseJudgeResponse(response, model);
    } catch (err) {
      if (!(useTemperature && isTemperatureError(err))) {
        return callFailed(err);
      }
      // The model rejected `temperature`: remember that and retry once.
      useTemperature = false;
      try {
        const response = await callJudge(client, model, messages, false);
        return parseJudgeResponse(response, model);
      } catch (retryErr) {
        return callFailed(retryErr);
      }
    }
  };
}
|
|
7760
|
+
/**
 * Send one chat-completion request to the judge model.
 *
 * @param {object} client - Client exposing chat.completions.create.
 * @param {string} model - Judge model name.
 * @param {Array<object>} messages - Chat messages to send.
 * @param {boolean} withTemperature - When truthy, request temperature 0;
 *   otherwise the temperature field is omitted entirely.
 * @returns {Promise<object>} Whatever chat.completions.create resolves to.
 */
async function callJudge(client, model, messages, withTemperature) {
  const request = { model, messages, max_completion_tokens: 2048 };
  if (withTemperature) {
    request.temperature = 0;
  }
  return client.chat.completions.create(request);
}
|
|
7768
|
+
/**
 * Turn the judge's reply into a score.
 *
 * Reads the first choice's message content and extracts lines of the form
 * "accuracy|completeness|conciseness: <number>" (case-insensitive, anchored
 * at line start). All three scores must be present and within [0, 1];
 * otherwise value is -1 and the raw content is echoed in details.reason.
 *
 * @param {object} response - Chat-completion response (choices[0].message.content).
 * @param {string} model - Judge model name, echoed in details.
 * @returns {{name: string, value: number, details: object}} value is the
 *   mean of the three scores rounded to two decimals.
 */
function parseJudgeResponse(response, model) {
  const name = "llm-judge-correctness";
  const content = response.choices[0]?.message?.content?.trim() ?? "";
  const scoreLine = /^(accuracy|completeness|conciseness)\s*:\s*([\d.]+)/i;

  const scores = {};
  content.split("\n").forEach((line) => {
    const hit = scoreLine.exec(line);
    if (hit) {
      scores[hit[1].toLowerCase()] = parseFloat(hit[2]);
    }
  });

  const { accuracy, completeness, conciseness } = scores;
  const allUsable = [accuracy, completeness, conciseness].every(
    (score) => score != null && !Number.isNaN(score) && score >= 0 && score <= 1
  );
  if (!allUsable) {
    return {
      name,
      value: -1,
      details: { reason: `judge returned unparseable scores: "${content}"`, model }
    };
  }

  const composite = Math.round((accuracy + completeness + conciseness) / 3 * 100) / 100;
  return {
    name,
    value: composite,
    details: { model, accuracy, completeness, conciseness }
  };
}
|
|
7792
|
+
|
|
7793
|
+
// src/scorers/tool-usage.ts
|
|
7794
|
+
/**
 * Scorer for tool-calling behavior.
 *
 * - Task without tools: value -1.
 * - task.expected is a plain (non-array) object: it is treated as the
 *   expected call arguments. 1 if a call to a tool defined on the task has
 *   arguments matching per deepEqual; 0.5 if any call's arguments share at
 *   least one key with the expected object; 0 otherwise.
 * - Otherwise: 1 if the first tool defined on the task was called, else 0.
 *
 * @param {{task: object, result: object}} context - Scoring context.
 * @returns {{name: string, value: number, details: object}}
 */
var toolUsageScorer = ({ task, result }) => {
  const name = "tool-usage";
  if (!task.tools?.length) {
    return { name, value: -1, details: { reason: "no tools configured on task" } };
  }

  const calls = result.toolCalls ?? [];
  const expected = task.expected;
  const expectsArguments =
    expected !== void 0 &&
    expected !== null &&
    typeof expected === "object" &&
    !Array.isArray(expected);

  if (!expectsArguments) {
    const expectedToolName = task.tools[0].name;
    const usedTool = calls.some((call) => call.name === expectedToolName);
    return {
      name,
      value: usedTool ? 1 : 0,
      details: { expectedToolName, usedTool, toolCalls: calls }
    };
  }

  const isDefinedTool = (call) => Boolean(task.tools.find((tool) => tool.name === call.name));
  const exactCall = calls.find((call) => isDefinedTool(call) && deepEqual(expected, call.arguments));
  if (exactCall) {
    return {
      name,
      value: 1,
      details: { matchedTool: exactCall.name, arguments: exactCall.arguments, toolCalls: calls }
    };
  }

  const expectedKeys = Object.keys(expected);
  const overlappingCall = calls.find((call) => {
    const args = call.arguments;
    if (typeof args !== "object" || args === null) return false;
    const argKeys = Object.keys(args);
    return expectedKeys.some((key) => argKeys.includes(key));
  });
  if (overlappingCall) {
    return {
      name,
      value: 0.5,
      details: {
        reason: "correct tool but wrong arguments",
        expected,
        actual: overlappingCall.arguments,
        toolCalls: calls
      }
    };
  }

  return {
    name,
    value: 0,
    details: { reason: "no matching tool call", expected, toolCalls: calls }
  };
};
|
|
7845
|
+
|
|
7846
|
+
// src/scorers/index.ts
|
|
7847
|
+
// Lookup table from scorer name to the scorer function that needs no
// per-run construction arguments.
var staticScorers = {
  "latency": latencyScorer,
  "cost": costScorer,
  "correctness": correctnessScorer,
  "schema-correctness": schemaCorrectnessScorer,
  "fuzzy-similarity": fuzzySimilarityScorer,
  "tool-usage": toolUsageScorer
};
|
|
7855
|
+
/**
 * Turns a list of scorer names into scorer functions.
 *
 * "llm-judge-correctness" is built on demand from `judgeModel` and
 * `timeoutMs`; every other name must be a key of `staticScorers`.
 *
 * @param {string[]} names - scorer names, in the order results should appear.
 * @param {*} judgeModel - forwarded to `createLlmJudgeScorer`.
 * @param {*} timeoutMs - forwarded to `createLlmJudgeScorer`.
 * @returns {Function[]} scorer functions, one per name, same order.
 * @throws {Error} `Unknown scorer: "<name>"` for an unregistered name.
 */
function resolveScorers(names, judgeModel, timeoutMs) {
  return names.map((name) => {
    if (name === "llm-judge-correctness") {
      return createLlmJudgeScorer(judgeModel, timeoutMs);
    }
    // Own-property lookup only: a bare `staticScorers[name]` also resolves
    // inherited members such as "toString" or "constructor", which would be
    // handed back as if they were scorers instead of raising the error below.
    const isRegistered = Object.prototype.hasOwnProperty.call(staticScorers, name);
    const scorer = isRegistered ? staticScorers[name] : void 0;
    if (!scorer) {
      throw new Error(`Unknown scorer: "${name}"`);
    }
    return scorer;
  });
}
|
|
7867
|
+
|
|
7868
|
+
// src/runner.ts
|
|
7869
|
+
var DEFAULT_TIMEOUT_MS = 6e4;
|
|
7870
|
+
/**
 * Runs `run` with an AbortSignal and rejects if it takes longer than `ms`.
 *
 * On timeout the signal is aborted and the returned promise rejects with
 * `Request timed out after <ms>ms`. In every other outcome the timer is
 * cleared, including when `run` throws synchronously or returns a plain
 * (non-promise) value.
 *
 * @param {(signal: AbortSignal) => *} run - operation to execute.
 * @param {number} ms - timeout in milliseconds.
 * @returns {Promise<*>} settles with the outcome of `run`, or rejects on timeout.
 */
function withTimeout(run, ms) {
  return new Promise((resolve2, reject) => {
    const controller = new AbortController();
    const timer = setTimeout(() => {
      controller.abort();
      reject(new Error(`Request timed out after ${ms}ms`));
    }, ms);

    let pending;
    try {
      pending = run(controller.signal);
    } catch (err) {
      // A synchronous throw would otherwise leave the timer armed: it would
      // keep the event loop alive for `ms` and then abort a finished request.
      clearTimeout(timer);
      reject(err);
      return;
    }

    // Promise.resolve lets `run` return a plain value or any thenable.
    Promise.resolve(pending).then(
      (value) => {
        clearTimeout(timer);
        resolve2(value);
      },
      (err) => {
        clearTimeout(timer);
        reject(err);
      }
    );
  });
}
|
|
7889
|
+
/**
 * Executes every task `runs` times against every provider and collects one
 * result record per (task, run, provider).
 *
 * Tasks and runs are processed sequentially; within one run all providers are
 * called concurrently. A failure while calling a provider or while scoring is
 * captured on the record (`error`, empty `scores`, blank `raw`) rather than
 * thrown. `onResult`, when given, is invoked with each record as it completes.
 *
 * @param {object} options - `{ providers, tasks, scorers, runs, timeout?, onResult? }`.
 * @returns {Promise<object[]>} records grouped by task, then run, then provider order.
 */
async function runBenchmarks(options) {
  const { providers, tasks, scorers, runs, onResult } = options;
  const timeout = options.timeout ?? DEFAULT_TIMEOUT_MS;

  // Produces the record for a single provider on a single task run.
  const executeOne = async (provider, task, run) => {
    let record;
    try {
      const callProvider = (signal) => provider.run({
        prompt: task.prompt,
        schema: task.schema,
        tools: task.tools,
        signal,
        timeout
      });
      const taskResult = await withTimeout(callProvider, timeout);
      const pendingScores = scorers.map((scorer) => scorer({ task, result: taskResult }, provider.id));
      const scores = await Promise.all(pendingScores);
      record = {
        providerId: provider.id,
        taskName: task.name,
        run,
        scores,
        raw: {
          output: taskResult.output,
          latencyMs: taskResult.latencyMs,
          usage: taskResult.usage,
          toolCalls: taskResult.toolCalls
        }
      };
    } catch (err) {
      record = {
        providerId: provider.id,
        taskName: task.name,
        run,
        scores: [],
        error: err instanceof Error ? err.message : String(err),
        raw: { output: "", latencyMs: 0 }
      };
    }
    onResult?.(record);
    return record;
  };

  const results = [];
  for (const task of tasks) {
    for (let run = 1; run <= runs; run++) {
      const batch = await Promise.all(providers.map((provider) => executeOne(provider, task, run)));
      results.push(...batch);
    }
  }
  return results;
}
|
|
7941
|
+
|
|
6200
7942
|
// src/utils/format.ts
|
|
6201
7943
|
var MAX_FRACTION_DIGITS = 100;
|
|
6202
7944
|
function formatCost(usd) {
|
|
@@ -6368,37 +8110,76 @@ function computeColumnStats(providerData, scorerNames) {
|
|
|
6368
8110
|
}
|
|
6369
8111
|
return stats;
|
|
6370
8112
|
}
|
|
8113
|
+
// Names of scorers that measure answer quality; every other column is
// treated as an efficiency metric.
var QUALITY_SCORERS = /* @__PURE__ */ new Set([
  "correctness",
  "schema-correctness",
  "fuzzy-similarity",
  "llm-judge-correctness",
  "tool-usage"
]);

/**
 * Decides whether a provider is eligible for medals.
 *
 * A provider passes when no quality column exists at all, or when it has a
 * value greater than zero in at least one quality column.
 *
 * @param {string} providerId - provider to check.
 * @param {Map<string, {values: Map<string, number>}>} columnStats - stats keyed by scorer name.
 * @returns {boolean}
 */
function passesQualityGate(providerId, columnStats) {
  let qualityColumnCount = 0;
  for (const column of columnStats.keys()) {
    if (!QUALITY_SCORERS.has(column)) continue;
    qualityColumnCount += 1;
    const score = columnStats.get(column)?.values.get(providerId);
    if (score !== void 0 && score > 0) return true;
  }
  return qualityColumnCount === 0;
}
|
|
6371
8128
|
/**
 * Assigns "gold" / "silver" / "bronze" / "none" to each provider.
 *
 * A provider wins a column when it alone holds that column's best value.
 * Wins are split into quality wins (columns in QUALITY_SCORERS) and
 * efficiency wins (all other columns). Providers that pass the quality gate
 * are ordered by quality wins, then efficiency wins, then id; providers with
 * equal win counts share a rank. Everyone gets "none" when there are fewer
 * than two providers or when no column has a sole winner. Providers failing
 * the quality gate always get "none".
 *
 * @param {Map<string, {best: *, values: Map<string, *>}>} columnStats
 * @param {string[]} providerIds
 * @returns {Map<string, string>} medal per provider id.
 */
function computeMedals(columnStats, providerIds) {
  const medals = /* @__PURE__ */ new Map();
  const markAllNone = () => {
    for (const id of providerIds) medals.set(id, "none");
    return medals;
  };
  if (providerIds.length < 2) return markAllNone();

  const eligible = new Set(providerIds.filter((id) => passesQualityGate(id, columnStats)));

  const qualityWins = new Map(providerIds.map((id) => [id, 0]));
  const efficiencyWins = new Map(providerIds.map((id) => [id, 0]));
  const quality = (id) => qualityWins.get(id) ?? 0;
  const efficiency = (id) => efficiencyWins.get(id) ?? 0;

  for (const [colName, colStats] of columnStats) {
    const { best } = colStats;
    if (best === void 0) continue;
    const holders = [];
    for (const [id, value] of colStats.values.entries()) {
      if (value !== void 0 && value === best) holders.push(id);
    }
    // Only an outright win counts; shared bests award nothing.
    if (holders.length !== 1) continue;
    const winnerId = holders[0];
    if (QUALITY_SCORERS.has(colName)) {
      qualityWins.set(winnerId, quality(winnerId) + 1);
    } else {
      efficiencyWins.set(winnerId, efficiency(winnerId) + 1);
    }
  }

  let totalWins = 0;
  for (const count of qualityWins.values()) totalWins += count;
  for (const count of efficiencyWins.values()) totalWins += count;
  if (totalWins === 0) return markAllNone();

  const ranked = providerIds
    .filter((id) => eligible.has(id))
    .sort((a7, b3) => quality(b3) - quality(a7) || efficiency(b3) - efficiency(a7) || a7.localeCompare(b3));

  const medalList = ["gold", "silver", "bronze"];
  let rank = 0;
  ranked.forEach((id, position) => {
    if (position > 0) {
      const previous = ranked[position - 1];
      const fellBehind = quality(id) < quality(previous) || (quality(id) === quality(previous) && efficiency(id) < efficiency(previous));
      // Ties keep the earlier rank; a drop jumps to the positional rank.
      if (fellBehind) rank = position;
    }
    medals.set(id, rank < medalList.length ? medalList[rank] : "none");
  });

  for (const id of providerIds) {
    if (!eligible.has(id)) medals.set(id, "none");
  }
  return medals;
}
|
|
@@ -6795,24 +8576,10 @@ function printSummary(results, providers, byProvider) {
|
|
|
6795
8576
|
console.log(` ${medal} Cheapest: ${bold(byCost.id)} ${dim(providerLabel(byCost.id))} ${brightGreen}${boldCode}${costStr}${reset}`);
|
|
6796
8577
|
}
|
|
6797
8578
|
}
|
|
6798
|
-
if (!single) {
|
|
6799
|
-
|
|
6800
|
-
|
|
6801
|
-
|
|
6802
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
6803
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
6804
|
-
const maxWins = Math.max(...wins.values());
|
|
6805
|
-
if (maxWins > 0) {
|
|
6806
|
-
const topProviders = [...wins.entries()].filter(([, w4]) => w4 === maxWins);
|
|
6807
|
-
console.log("");
|
|
6808
|
-
if (topProviders.length === 1) {
|
|
6809
|
-
const [winnerId, winCount] = topProviders[0];
|
|
6810
|
-
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${winnerId}${reset} ${dim(providerLabel(winnerId))} ${dim(`(${winCount}/3 categories)`)}`);
|
|
6811
|
-
} else {
|
|
6812
|
-
const names = topProviders.map(([id]) => bold(id)).join(dim(", "));
|
|
6813
|
-
console.log(` \u{1F3C6} Overall: ${names} ${dim(`(tied at ${maxWins}/3)`)}`);
|
|
6814
|
-
}
|
|
6815
|
-
}
|
|
8579
|
+
if (!single && byCorrectness && byCorrectness.avg > 0) {
|
|
8580
|
+
console.log("");
|
|
8581
|
+
const pct = `${Math.round(byCorrectness.avg * 100)}%`;
|
|
8582
|
+
console.log(` \u{1F3C6} Overall: ${brightGreen}${boldCode}${byCorrectness.id}${reset} ${dim(providerLabel(byCorrectness.id))} ${dim(`(${pct} avg correctness)`)}`);
|
|
6816
8583
|
}
|
|
6817
8584
|
console.log("");
|
|
6818
8585
|
}
|
|
@@ -6841,6 +8608,32 @@ function buildSummary(results) {
|
|
|
6841
8608
|
};
|
|
6842
8609
|
}
|
|
6843
8610
|
|
|
8611
|
+
// src/arena.ts
|
|
8612
|
+
/**
 * Builds an arena from a config object.
 *
 * Scorers are resolved and the run count is fixed when the arena is defined;
 * providers, tasks and timeout are read from `config` again on each `run()`.
 *
 * @param {object} config - `{ providers, tasks, scorers?, runs?, judgeModel?, timeout? }`.
 * @returns {{ config: object, run: (options?: object) => Promise<object[]> }}
 * @throws {Error} when `config.providers` is empty; `run()` rejects when
 *   `config.tasks` is empty.
 */
function defineArena(config) {
  if (config.providers.length === 0) {
    throw new Error("At least one provider is required");
  }

  const scorerFns = resolveScorers(
    config.scorers ?? ["latency", "cost", "correctness"],
    config.judgeModel,
    config.timeout
  );
  const runs = config.runs ?? 1;

  const run = async (options) => {
    if (config.tasks.length === 0) {
      throw new Error("At least one task is required");
    }
    const { providers, tasks, timeout } = config;
    return runBenchmarks({
      providers,
      tasks,
      scorers: scorerFns,
      runs,
      timeout,
      onResult: options?.onResult
    });
  };

  return { config, run };
}
|
|
8636
|
+
|
|
6844
8637
|
// src/reporter/markdown.ts
|
|
6845
8638
|
var COMMENT_MARKER = "<!-- duelist-ci-report -->";
|
|
6846
8639
|
function markdownReporter(report, _current) {
|
|
@@ -6968,17 +8761,8 @@ function htmlReporter(results) {
|
|
|
6968
8761
|
return { id, avg };
|
|
6969
8762
|
}).filter((p5) => p5.avg !== void 0).sort((a7, b3) => a7.avg - b3.avg)[0];
|
|
6970
8763
|
let overallWinner;
|
|
6971
|
-
if (multi) {
|
|
6972
|
-
|
|
6973
|
-
for (const id of providers) wins.set(id, 0);
|
|
6974
|
-
if (byCorrectness) wins.set(byCorrectness.id, (wins.get(byCorrectness.id) ?? 0) + 1);
|
|
6975
|
-
if (byLatency && byLatency.avg !== Infinity) wins.set(byLatency.id, (wins.get(byLatency.id) ?? 0) + 1);
|
|
6976
|
-
if (byCost?.avg !== void 0) wins.set(byCost.id, (wins.get(byCost.id) ?? 0) + 1);
|
|
6977
|
-
const maxWins = Math.max(...wins.values());
|
|
6978
|
-
if (maxWins > 0) {
|
|
6979
|
-
const tops = [...wins.entries()].filter(([, w4]) => w4 === maxWins);
|
|
6980
|
-
if (tops.length === 1) overallWinner = tops[0][0];
|
|
6981
|
-
}
|
|
8764
|
+
if (multi && byCorrectness && byCorrectness.avg > 0) {
|
|
8765
|
+
overallWinner = byCorrectness.id;
|
|
6982
8766
|
}
|
|
6983
8767
|
const errorResults = results.filter((r3) => r3.error);
|
|
6984
8768
|
const deduped = dedupeErrors(errorResults);
|
|
@@ -7499,7 +9283,7 @@ function renderErrors(errors) {
|
|
|
7499
9283
|
</div>`;
|
|
7500
9284
|
}).join("\n");
|
|
7501
9285
|
return `<section class="errors-section">
|
|
7502
|
-
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'
|
|
9286
|
+
<h2 class="errors-title" onclick="this.nextElementSibling.style.display=this.nextElementSibling.style.display==='none'?'block':'none'">Errors</h2>
|
|
7503
9287
|
<div class="errors-list">
|
|
7504
9288
|
${items}
|
|
7505
9289
|
</div>
|
|
@@ -7843,6 +9627,403 @@ async function upsertPrComment(ctx, body, marker) {
|
|
|
7843
9627
|
}
|
|
7844
9628
|
}
|
|
7845
9629
|
|
|
9630
|
+
// src/packs/structured-output.ts
|
|
9631
|
+
import { z } from "zod";
|
|
9632
|
+
var structuredOutputPack = {
|
|
9633
|
+
name: "structured-output",
|
|
9634
|
+
label: "Structured Output",
|
|
9635
|
+
description: "Zod schema stress test \u2014 flat objects, nesting, arrays, enums, empty arrays, and adversarial input",
|
|
9636
|
+
tasks: [
|
|
9637
|
+
{
|
|
9638
|
+
name: "so:flat-entity",
|
|
9639
|
+
prompt: "Extract the person's details from this text: 'Maria Garcia, age 34, works as a software architect in Barcelona, Spain. Her employee ID is EMP-2847.' Return as JSON.",
|
|
9640
|
+
expected: {
|
|
9641
|
+
name: "Maria Garcia",
|
|
9642
|
+
age: 34,
|
|
9643
|
+
role: "software architect",
|
|
9644
|
+
city: "Barcelona",
|
|
9645
|
+
country: "Spain",
|
|
9646
|
+
employeeId: "EMP-2847"
|
|
9647
|
+
},
|
|
9648
|
+
schema: z.object({
|
|
9649
|
+
name: z.string(),
|
|
9650
|
+
age: z.number(),
|
|
9651
|
+
role: z.string(),
|
|
9652
|
+
city: z.string(),
|
|
9653
|
+
country: z.string(),
|
|
9654
|
+
employeeId: z.string()
|
|
9655
|
+
})
|
|
9656
|
+
},
|
|
9657
|
+
{
|
|
9658
|
+
name: "so:nested-object",
|
|
9659
|
+
prompt: "Parse this shipping label into structured JSON: 'Ship to: Acme Corp, Attn: John Lee, 4th Floor, 742 Evergreen Terrace, Springfield, IL 62704, USA. Order #ORD-9912, 3 items, 2.4kg, express shipping.' Use shippingMethod values: standard, express, or overnight. Return as JSON.",
|
|
9660
|
+
expected: {
|
|
9661
|
+
recipient: { company: "Acme Corp", contact: "John Lee", floor: "4th Floor" },
|
|
9662
|
+
address: { street: "742 Evergreen Terrace", city: "Springfield", state: "IL", zip: "62704", country: "USA" },
|
|
9663
|
+
order: { id: "ORD-9912", itemCount: 3, weightKg: 2.4, shippingMethod: "express" }
|
|
9664
|
+
},
|
|
9665
|
+
schema: z.object({
|
|
9666
|
+
recipient: z.object({ company: z.string(), contact: z.string(), floor: z.string() }),
|
|
9667
|
+
address: z.object({
|
|
9668
|
+
street: z.string(),
|
|
9669
|
+
city: z.string(),
|
|
9670
|
+
state: z.string(),
|
|
9671
|
+
zip: z.string(),
|
|
9672
|
+
country: z.string()
|
|
9673
|
+
}),
|
|
9674
|
+
order: z.object({
|
|
9675
|
+
id: z.string(),
|
|
9676
|
+
itemCount: z.number(),
|
|
9677
|
+
weightKg: z.number(),
|
|
9678
|
+
shippingMethod: z.enum(["standard", "express", "overnight"])
|
|
9679
|
+
})
|
|
9680
|
+
})
|
|
9681
|
+
},
|
|
9682
|
+
{
|
|
9683
|
+
name: "so:array-of-objects",
|
|
9684
|
+
prompt: "Extract all mentioned products with their prices and categories from this text: 'Our summer sale includes the UltraWidget Pro ($49.99, Electronics), ComfortMax Chair ($199.00, Furniture), and AquaPure Filter ($24.50, Home & Kitchen). The SmartLamp Mini is also available at $34.99 in the Electronics category.' Return as a JSON array.",
|
|
9685
|
+
expected: [
|
|
9686
|
+
{ name: "UltraWidget Pro", price: 49.99, category: "Electronics" },
|
|
9687
|
+
{ name: "ComfortMax Chair", price: 199, category: "Furniture" },
|
|
9688
|
+
{ name: "AquaPure Filter", price: 24.5, category: "Home & Kitchen" },
|
|
9689
|
+
{ name: "SmartLamp Mini", price: 34.99, category: "Electronics" }
|
|
9690
|
+
],
|
|
9691
|
+
schema: z.array(z.object({ name: z.string(), price: z.number(), category: z.string() }))
|
|
9692
|
+
},
|
|
9693
|
+
{
|
|
9694
|
+
name: "so:empty-arrays",
|
|
9695
|
+
prompt: "Extract all error codes and their severity levels from this log message: 'System health check completed at 14:32 UTC. All services operational. No warnings or errors detected. Uptime: 99.97%.' Classify status as one of: healthy, degraded, or down. Return as JSON.",
|
|
9696
|
+
expected: { errors: [], warnings: [], status: "healthy", uptimePercent: 99.97 },
|
|
9697
|
+
schema: z.object({
|
|
9698
|
+
errors: z.array(z.object({ code: z.string(), severity: z.string() })),
|
|
9699
|
+
warnings: z.array(z.string()),
|
|
9700
|
+
status: z.enum(["healthy", "degraded", "down"]),
|
|
9701
|
+
uptimePercent: z.number()
|
|
9702
|
+
})
|
|
9703
|
+
},
|
|
9704
|
+
{
|
|
9705
|
+
name: "so:enum-classification",
|
|
9706
|
+
prompt: "Classify each of these support tickets by priority (low/medium/high/critical) and category (billing/technical/account/general). Use just the letter (A, B, C, D) as the id.\nTicket A: 'My account was charged twice for the same subscription.'\nTicket B: 'The API returns 500 errors intermittently.'\nTicket C: 'How do I update my display name?'\nTicket D: 'Production database is completely unresponsive, all services down.'\nReturn as a JSON array.",
|
|
9707
|
+
expected: [
|
|
9708
|
+
{ id: "A", priority: "high", category: "billing" },
|
|
9709
|
+
{ id: "B", priority: "high", category: "technical" },
|
|
9710
|
+
{ id: "C", priority: "low", category: "account" },
|
|
9711
|
+
{ id: "D", priority: "critical", category: "technical" }
|
|
9712
|
+
],
|
|
9713
|
+
schema: z.array(
|
|
9714
|
+
z.object({
|
|
9715
|
+
id: z.string(),
|
|
9716
|
+
priority: z.enum(["low", "medium", "high", "critical"]),
|
|
9717
|
+
category: z.enum(["billing", "technical", "account", "general"])
|
|
9718
|
+
})
|
|
9719
|
+
)
|
|
9720
|
+
},
|
|
9721
|
+
{
|
|
9722
|
+
name: "so:adversarial-input",
|
|
9723
|
+
prompt: `Extract the actual product review data from this messy input. Ignore any JSON-like noise in the text.
|
|
9724
|
+
|
|
9725
|
+
User said: 'I bought the {product: "fake"} headphones for $59.99 and they're great! Rating: 5/5. The "noise-cancelling" feature works well even in {"noisy": true} environments. Would recommend to friend=true. Purchased on 01/15/2026.'
|
|
9726
|
+
Return as JSON. Use ISO 8601 date format (YYYY-MM-DD).`,
|
|
9727
|
+
expected: {
|
|
9728
|
+
product: "headphones",
|
|
9729
|
+
price: 59.99,
|
|
9730
|
+
rating: 5,
|
|
9731
|
+
maxRating: 5,
|
|
9732
|
+
features: ["noise-cancelling"],
|
|
9733
|
+
recommended: true,
|
|
9734
|
+
purchaseDate: "2026-01-15"
|
|
9735
|
+
},
|
|
9736
|
+
schema: z.object({
|
|
9737
|
+
product: z.string(),
|
|
9738
|
+
price: z.number(),
|
|
9739
|
+
rating: z.number(),
|
|
9740
|
+
maxRating: z.number(),
|
|
9741
|
+
features: z.array(z.string()),
|
|
9742
|
+
recommended: z.boolean(),
|
|
9743
|
+
purchaseDate: z.string()
|
|
9744
|
+
})
|
|
9745
|
+
}
|
|
9746
|
+
],
|
|
9747
|
+
scorers: ["correctness", "schema-correctness", "latency", "cost"]
|
|
9748
|
+
};
|
|
9749
|
+
|
|
9750
|
+
// src/packs/tool-calling.ts
|
|
9751
|
+
import { z as z2 } from "zod";
|
|
9752
|
+
var toolCallingPack = {
|
|
9753
|
+
name: "tool-calling",
|
|
9754
|
+
label: "Tool Calling",
|
|
9755
|
+
description: "Function invocation accuracy \u2014 single calls, complex params, tool selection, parallel calls, and relevance detection",
|
|
9756
|
+
tasks: [
|
|
9757
|
+
{
|
|
9758
|
+
name: "tc:simple-single-tool",
|
|
9759
|
+
prompt: "What's the current weather in Tokyo?",
|
|
9760
|
+
tools: [{
|
|
9761
|
+
name: "getWeather",
|
|
9762
|
+
description: "Get current weather for a city",
|
|
9763
|
+
parameters: z2.object({
|
|
9764
|
+
city: z2.string(),
|
|
9765
|
+
units: z2.enum(["celsius", "fahrenheit"]).optional()
|
|
9766
|
+
}),
|
|
9767
|
+
handler: async ({ city, units }) => ({
|
|
9768
|
+
city,
|
|
9769
|
+
tempC: 8,
|
|
9770
|
+
condition: "cloudy",
|
|
9771
|
+
units: units ?? "celsius"
|
|
9772
|
+
})
|
|
9773
|
+
}],
|
|
9774
|
+
expected: { city: "Tokyo" }
|
|
9775
|
+
},
|
|
9776
|
+
{
|
|
9777
|
+
name: "tc:complex-params",
|
|
9778
|
+
prompt: "Search for Italian restaurants within 2 miles of downtown Portland that are open now and have at least a 4-star rating.",
|
|
9779
|
+
tools: [{
|
|
9780
|
+
name: "searchRestaurants",
|
|
9781
|
+
description: "Search for restaurants matching criteria",
|
|
9782
|
+
parameters: z2.object({
|
|
9783
|
+
cuisine: z2.string(),
|
|
9784
|
+
location: z2.string(),
|
|
9785
|
+
radiusMiles: z2.number(),
|
|
9786
|
+
minRating: z2.number(),
|
|
9787
|
+
openNow: z2.boolean()
|
|
9788
|
+
}),
|
|
9789
|
+
handler: async (_args) => ({
|
|
9790
|
+
results: [{ name: "Trattoria Roma", rating: 4.5, distance: 1.2 }]
|
|
9791
|
+
})
|
|
9792
|
+
}],
|
|
9793
|
+
expected: {
|
|
9794
|
+
cuisine: "Italian",
|
|
9795
|
+
location: "downtown Portland",
|
|
9796
|
+
radiusMiles: 2,
|
|
9797
|
+
minRating: 4,
|
|
9798
|
+
openNow: true
|
|
9799
|
+
}
|
|
9800
|
+
},
|
|
9801
|
+
{
|
|
9802
|
+
name: "tc:select-from-many",
|
|
9803
|
+
prompt: "Convert 150 USD to Euros.",
|
|
9804
|
+
tools: [
|
|
9805
|
+
{
|
|
9806
|
+
name: "getWeather",
|
|
9807
|
+
description: "Get current weather for a city",
|
|
9808
|
+
parameters: z2.object({ city: z2.string() }),
|
|
9809
|
+
handler: async () => ({ tempC: 20 })
|
|
9810
|
+
},
|
|
9811
|
+
{
|
|
9812
|
+
name: "convertCurrency",
|
|
9813
|
+
description: "Convert an amount between currencies",
|
|
9814
|
+
parameters: z2.object({
|
|
9815
|
+
amount: z2.number(),
|
|
9816
|
+
from: z2.string(),
|
|
9817
|
+
to: z2.string()
|
|
9818
|
+
}),
|
|
9819
|
+
handler: async ({ amount, from, to }) => ({
|
|
9820
|
+
amount,
|
|
9821
|
+
from,
|
|
9822
|
+
to,
|
|
9823
|
+
result: 138.75,
|
|
9824
|
+
rate: 0.925
|
|
9825
|
+
})
|
|
9826
|
+
},
|
|
9827
|
+
{
|
|
9828
|
+
name: "translateText",
|
|
9829
|
+
description: "Translate text between languages",
|
|
9830
|
+
parameters: z2.object({ text: z2.string(), targetLang: z2.string() }),
|
|
9831
|
+
handler: async () => ({ translated: "" })
|
|
9832
|
+
},
|
|
9833
|
+
{
|
|
9834
|
+
name: "calculateTip",
|
|
9835
|
+
description: "Calculate tip amount for a bill",
|
|
9836
|
+
parameters: z2.object({ billAmount: z2.number(), tipPercent: z2.number() }),
|
|
9837
|
+
handler: async () => ({ tip: 0 })
|
|
9838
|
+
}
|
|
9839
|
+
],
|
|
9840
|
+
expected: { amount: 150, from: "USD", to: "EUR" }
|
|
9841
|
+
},
|
|
9842
|
+
{
|
|
9843
|
+
name: "tc:parallel-calls",
|
|
9844
|
+
prompt: "I'm planning a trip. What's the weather like in both Paris and London right now?",
|
|
9845
|
+
tools: [{
|
|
9846
|
+
name: "getWeather",
|
|
9847
|
+
description: "Get current weather for a city",
|
|
9848
|
+
parameters: z2.object({ city: z2.string() }),
|
|
9849
|
+
handler: async ({ city }) => {
|
|
9850
|
+
const data = {
|
|
9851
|
+
Paris: { tempC: 12, condition: "partly cloudy" },
|
|
9852
|
+
London: { tempC: 9, condition: "rainy" }
|
|
9853
|
+
};
|
|
9854
|
+
return data[city] ?? { tempC: 15, condition: "unknown" };
|
|
9855
|
+
}
|
|
9856
|
+
}],
|
|
9857
|
+
expected: "weather data for Paris and London"
|
|
9858
|
+
}
|
|
9859
|
+
],
|
|
9860
|
+
scorers: ["tool-usage", "latency", "cost"]
|
|
9861
|
+
};
|
|
9862
|
+
|
|
9863
|
+
// src/packs/reasoning.ts
|
|
9864
|
+
import { z as z3 } from "zod";
|
|
9865
|
+
var reasoningPack = {
|
|
9866
|
+
name: "reasoning",
|
|
9867
|
+
label: "Reasoning",
|
|
9868
|
+
description: "Logic, math, and multi-step thinking \u2014 arithmetic, deduction, data interpretation, critical path, and business rules",
|
|
9869
|
+
tasks: [
|
|
9870
|
+
{
|
|
9871
|
+
name: "rs:saas-mrr-calc",
|
|
9872
|
+
prompt: `A SaaS company charges $49/month for the basic plan and $149/month for pro.
|
|
9873
|
+
In Q1 they had 200 basic subscribers and 85 pro subscribers.
|
|
9874
|
+
In Q2, 15% of basic users upgraded to pro and they gained 40 new basic subscribers.
|
|
9875
|
+
No one churned. What is the Q2 monthly recurring revenue (MRR)?
|
|
9876
|
+
Return as JSON with your reasoning and the final MRR number.`,
|
|
9877
|
+
expected: { mrr: 27425 },
|
|
9878
|
+
schema: z3.object({
|
|
9879
|
+
reasoning: z3.string().optional(),
|
|
9880
|
+
mrr: z3.number()
|
|
9881
|
+
})
|
|
9882
|
+
},
|
|
9883
|
+
{
|
|
9884
|
+
name: "rs:logical-deduction",
|
|
9885
|
+
prompt: `Five developers \u2014 Alice, Bob, Carol, Dave, and Eve \u2014 each use a different
|
|
9886
|
+
primary language: Rust, TypeScript, Python, Go, and Java. Given:
|
|
9887
|
+
1. Alice does not use Python, Java, or Go.
|
|
9888
|
+
2. Bob uses TypeScript.
|
|
9889
|
+
3. Carol uses neither Rust nor Go.
|
|
9890
|
+
4. Dave does not use Java.
|
|
9891
|
+
5. Eve uses neither Rust, Go, nor Java.
|
|
9892
|
+
What language does each developer use? Return as JSON.`,
|
|
9893
|
+
expected: {
|
|
9894
|
+
Alice: "Rust",
|
|
9895
|
+
Bob: "TypeScript",
|
|
9896
|
+
Carol: "Java",
|
|
9897
|
+
Dave: "Go",
|
|
9898
|
+
Eve: "Python"
|
|
9899
|
+
},
|
|
9900
|
+
schema: z3.object({
|
|
9901
|
+
Alice: z3.string(),
|
|
9902
|
+
Bob: z3.string(),
|
|
9903
|
+
Carol: z3.string(),
|
|
9904
|
+
Dave: z3.string(),
|
|
9905
|
+
Eve: z3.string()
|
|
9906
|
+
})
|
|
9907
|
+
},
|
|
9908
|
+
{
|
|
9909
|
+
name: "rs:data-interpretation",
|
|
9910
|
+
prompt: `Given this quarterly revenue data:
|
|
9911
|
+
| Quarter | Revenue | Growth |
|
|
9912
|
+
|---------|---------|--------|
|
|
9913
|
+
| Q1 2025 | $2.1M | - |
|
|
9914
|
+
| Q2 2025 | $2.4M | 14.3% |
|
|
9915
|
+
| Q3 2025 | $2.2M | -8.3% |
|
|
9916
|
+
| Q4 2025 | $2.8M | 27.3% |
|
|
9917
|
+
|
|
9918
|
+
Which quarter had the highest absolute revenue increase compared to the previous
|
|
9919
|
+
quarter? What was the full-year total revenue in millions? Return as JSON.`,
|
|
9920
|
+
expected: {
|
|
9921
|
+
highestGrowthQuarter: "Q4 2025",
|
|
9922
|
+
absoluteIncrease: 0.6,
|
|
9923
|
+
fullYearRevenue: 9.5
|
|
9924
|
+
},
|
|
9925
|
+
schema: z3.object({
|
|
9926
|
+
highestGrowthQuarter: z3.string(),
|
|
9927
|
+
absoluteIncrease: z3.number(),
|
|
9928
|
+
fullYearRevenue: z3.number()
|
|
9929
|
+
})
|
|
9930
|
+
},
|
|
9931
|
+
{
|
|
9932
|
+
name: "rs:critical-path",
|
|
9933
|
+
prompt: `A deployment pipeline has these stages with dependencies:
|
|
9934
|
+
- Build (3 min, no dependency)
|
|
9935
|
+
- Unit tests (5 min, depends on Build)
|
|
9936
|
+
- Integration tests (8 min, depends on Build)
|
|
9937
|
+
- Security scan (4 min, depends on Build)
|
|
9938
|
+
- Staging deploy (2 min, depends on Unit tests AND Integration tests AND Security scan)
|
|
9939
|
+
- Smoke tests (3 min, depends on Staging deploy)
|
|
9940
|
+
|
|
9941
|
+
Assuming stages run in parallel where possible, what is the total pipeline
|
|
9942
|
+
duration in minutes? Which stages are on the critical path? Return as JSON.`,
|
|
9943
|
+
expected: {
|
|
9944
|
+
totalMinutes: 16,
|
|
9945
|
+
criticalPath: ["Build", "Integration tests", "Staging deploy", "Smoke tests"]
|
|
9946
|
+
},
|
|
9947
|
+
schema: z3.object({
|
|
9948
|
+
totalMinutes: z3.number(),
|
|
9949
|
+
criticalPath: z3.array(z3.string())
|
|
9950
|
+
})
|
|
9951
|
+
},
|
|
9952
|
+
{
|
|
9953
|
+
name: "rs:pricing-rules",
|
|
9954
|
+
prompt: `Apply these pricing rules to each customer and return the final price:
|
|
9955
|
+
Rules:
|
|
9956
|
+
- Base price: $100
|
|
9957
|
+
- Enterprise customers (>100 seats): 30% discount
|
|
9958
|
+
- Annual billing: additional 15% off the discounted price
|
|
9959
|
+
- Non-profit organizations: flat $50 regardless of other rules
|
|
9960
|
+
|
|
9961
|
+
Customers:
|
|
9962
|
+
A: 50 seats, monthly billing, for-profit
|
|
9963
|
+
B: 200 seats, annual billing, for-profit
|
|
9964
|
+
C: 75 seats, annual billing, non-profit
|
|
9965
|
+
D: 150 seats, monthly billing, for-profit
|
|
9966
|
+
|
|
9967
|
+
Return as a JSON array with customer id and finalPrice.`,
|
|
9968
|
+
expected: [
|
|
9969
|
+
{ id: "A", finalPrice: 100 },
|
|
9970
|
+
{ id: "B", finalPrice: 59.5 },
|
|
9971
|
+
{ id: "C", finalPrice: 50 },
|
|
9972
|
+
{ id: "D", finalPrice: 70 }
|
|
9973
|
+
],
|
|
9974
|
+
schema: z3.array(z3.object({
|
|
9975
|
+
id: z3.string(),
|
|
9976
|
+
finalPrice: z3.number()
|
|
9977
|
+
}))
|
|
9978
|
+
}
|
|
9979
|
+
],
|
|
9980
|
+
scorers: ["correctness", "latency", "cost"]
|
|
9981
|
+
};
|
|
9982
|
+
|
|
9983
|
+
// src/packs/index.ts
|
|
9984
|
+
// Built-in task packs, keyed by pack name.
var registry = /* @__PURE__ */ new Map();

/** Adds `pack` to the registry under `pack.name`, replacing any previous entry. */
function register(pack) {
  registry.set(pack.name, pack);
}

// Registration order is the order packs are listed and reported in.
for (const builtinPack of [structuredOutputPack, toolCallingPack, reasoningPack]) {
  register(builtinPack);
}
|
|
9991
|
+
/**
 * Looks up a registered pack by name.
 *
 * @param {string} name - pack name.
 * @returns {object} the registered pack.
 * @throws {Error} naming the unknown pack and listing the available ones.
 */
function loadPack(name) {
  const found = registry.get(name);
  if (found) return found;

  const knownNames = Array.from(registry.keys()).join(", ");
  throw new Error(`Unknown pack "${name}". Available packs: ${knownNames}`);
}
|
|
9999
|
+
/**
 * Summarizes every registered pack, in registration order.
 *
 * @returns {{ name: string, label: string, description: string, taskCount: number }[]}
 */
function listPacks() {
  const summaries = [];
  for (const pack of registry.values()) {
    summaries.push({
      name: pack.name,
      label: pack.label,
      description: pack.description,
      taskCount: pack.tasks.length
    });
  }
  return summaries;
}
|
|
10007
|
+
|
|
10008
|
+
// src/packs/loader.ts
|
|
10009
|
+
/**
 * Builds an arena config from one or more built-in packs.
 *
 * Tasks are concatenated in the order the packs are named; scorers are the
 * union of the packs' scorers in first-seen order. A pack named more than
 * once is loaded only once, so its tasks are not run twice under identical
 * task names.
 *
 * @param {{ packs: string[], providers: *, runs?: number, timeout?: number }} config
 * @returns {{ providers: *, tasks: object[], scorers: string[], runs: number, timeout: * }}
 * @throws {Error} from `loadPack` when a pack name is unknown.
 */
function buildPackConfig(config) {
  // De-duplicate by name while keeping the first occurrence's position.
  const uniqueNames = [...new Set(config.packs)];
  const packs = uniqueNames.map((name) => loadPack(name));

  const tasks = packs.flatMap((pack) => pack.tasks);
  const scorers = [...new Set(packs.flatMap((pack) => pack.scorers))];

  return {
    providers: config.providers,
    tasks,
    scorers,
    runs: config.runs ?? 1,
    timeout: config.timeout
  };
}
|
|
10026
|
+
|
|
7846
10027
|
// src/cli.ts
|
|
7847
10028
|
var __dirname2 = dirname2(fileURLToPath(import.meta.url));
|
|
7848
10029
|
var program = new Command();
|
|
@@ -7867,12 +10048,16 @@ program.command("init").description("Scaffold an arena.config.ts in the current
|
|
|
7867
10048
|
console.log(" 1. export OPENAI_API_KEY=sk-...");
|
|
7868
10049
|
console.log(" 2. npx duelist run");
|
|
7869
10050
|
});
|
|
7870
|
-
program.command("run").description("Run benchmarks defined in your arena config").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--reporter <type>", "Output format: console, json, or html", "console").option("--output <path>", "Output file path (used with html reporter)", "duelist-report.html").option("-q, --quiet", "Suppress per-result progress (show only final report)").action(async (opts) => {
|
|
10051
|
+
program.command("run").description("Run benchmarks defined in your arena config").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--pack <names>", 'Run built-in task pack(s), comma-separated. Use "list" to see available packs.').option("--reporter <type>", "Output format: console, json, or html", "console").option("--output <path>", "Output file path (used with html reporter)", "duelist-report.html").option("-q, --quiet", "Suppress per-result progress (show only final report)").action(async (opts) => {
|
|
10052
|
+
if (opts.pack === "list") {
|
|
10053
|
+
printPackList();
|
|
10054
|
+
return;
|
|
10055
|
+
}
|
|
7871
10056
|
if (!["console", "json", "html"].includes(opts.reporter)) {
|
|
7872
10057
|
console.error(`Unknown reporter "${opts.reporter}". Use "console", "json", or "html".`);
|
|
7873
10058
|
process.exit(1);
|
|
7874
10059
|
}
|
|
7875
|
-
const typedArena = await loadArenaConfig(opts.config);
|
|
10060
|
+
const typedArena = opts.pack ? await loadArenaWithPacks(opts.pack, opts.config) : await loadArenaConfig(opts.config);
|
|
7876
10061
|
try {
|
|
7877
10062
|
const showProgress = opts.reporter !== "json" && !opts.quiet;
|
|
7878
10063
|
const onResult = showProgress ? logResult : void 0;
|
|
@@ -7907,7 +10092,11 @@ function collectThreshold(value, previous) {
|
|
|
7907
10092
|
previous.set(scorer, Number(delta));
|
|
7908
10093
|
return previous;
|
|
7909
10094
|
}
|
|
7910
|
-
program.command("ci").description("Run benchmarks, compare against baseline, and enforce quality gates").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--baseline <path>", "Baseline JSON file", ".duelist/baseline.json").option("--budget <dollars>", "Max total cost in USD", parseFloat).option("--threshold <scorer=delta>", "Regression threshold (repeatable)", collectThreshold, /* @__PURE__ */ new Map()).option("--update-baseline", "Save results as new baseline after passing").option("--comment", "Post results as GitHub PR comment").option("-q, --quiet", "Suppress per-result progress").action(async (opts) => {
|
|
10095
|
+
program.command("ci").description("Run benchmarks, compare against baseline, and enforce quality gates").option("-c, --config <path>", "Path to config file", "arena.config.ts").option("--pack <names>", 'Run built-in task pack(s), comma-separated. Use "list" to see available packs.').option("--baseline <path>", "Baseline JSON file", ".duelist/baseline.json").option("--budget <dollars>", "Max total cost in USD", parseFloat).option("--threshold <scorer=delta>", "Regression threshold (repeatable)", collectThreshold, /* @__PURE__ */ new Map()).option("--update-baseline", "Save results as new baseline after passing").option("--comment", "Post results as GitHub PR comment").option("-q, --quiet", "Suppress per-result progress").action(async (opts) => {
|
|
10096
|
+
if (opts.pack === "list") {
|
|
10097
|
+
printPackList();
|
|
10098
|
+
return;
|
|
10099
|
+
}
|
|
7911
10100
|
const ciOpts = {
|
|
7912
10101
|
configPath: opts.config,
|
|
7913
10102
|
baselinePath: resolve(opts.baseline),
|
|
@@ -7917,7 +10106,7 @@ program.command("ci").description("Run benchmarks, compare against baseline, and
|
|
|
7917
10106
|
comment: opts.comment ?? false,
|
|
7918
10107
|
quiet: opts.quiet ?? false
|
|
7919
10108
|
};
|
|
7920
|
-
const typedArena = await loadArenaConfig(ciOpts.configPath);
|
|
10109
|
+
const typedArena = opts.pack ? await loadArenaWithPacks(opts.pack, ciOpts.configPath) : await loadArenaConfig(ciOpts.configPath);
|
|
7921
10110
|
console.log("Running benchmarks...");
|
|
7922
10111
|
const onResult = ciOpts.quiet ? void 0 : logResult;
|
|
7923
10112
|
let results;
|
|
@@ -7974,6 +10163,39 @@ program.command("ci").description("Run benchmarks, compare against baseline, and
|
|
|
7974
10163
|
process.exit(report.failed ? 1 : 0);
|
|
7975
10164
|
});
|
|
7976
10165
|
program.parse();
|
|
10166
|
+
/**
 * Prints the registered task packs to stdout: one row per pack (name, task
 * count, description) followed by usage hints. Prints a short notice instead
 * when listPacks() returns nothing.
 */
function printPackList() {
  const available = listPacks();
  if (available.length === 0) {
    console.log("No packs available.");
    return;
  }

  // Name column is as wide as the longest pack name plus two characters.
  const longestName = available.reduce((widest, { name }) => Math.max(widest, name.length), 0);
  const nameWidth = longestName + 2;

  console.log("Available task packs:\n");
  available.forEach(({ name, taskCount, description }) => {
    const countLabel = `${taskCount} tasks`.padEnd(9);
    console.log(` ${name.padEnd(nameWidth)} ${countLabel} ${description}`);
  });
  console.log("\nRun: npx duelist run --pack <name>");
  console.log("Combine: npx duelist run --pack structured-output,tool-calling");
}
|
|
10182
|
+
/**
 * Builds an arena that runs the requested task packs using the providers,
 * `runs` and `timeout` taken from the user's arena config.
 *
 * Exits the process with code 1 when the config file does not exist or when
 * `packNames` contains no usable pack name.
 *
 * @param {string} packNames - Comma-separated pack names (the `--pack` value).
 *   Surrounding whitespace, blank entries (e.g. from a trailing comma) and
 *   repeated names are ignored.
 * @param {string} configOpt - Path to the arena config file.
 * @returns {Promise<*>} The arena returned by defineArena() for the combined
 *   pack configuration.
 */
async function loadArenaWithPacks(packNames, configOpt) {
  const configPath = resolve(configOpt);
  if (!existsSync(configPath)) {
    console.error("No arena.config.ts found. Create one with `npx duelist init` to configure");
    console.error("your providers, then re-run with --pack.");
    process.exit(1);
  }

  // Drop blank entries so "a,b," or "a,,b" does not fail with `Unknown pack ""`,
  // and de-duplicate (keeping first-seen order) so "a,a" does not run the
  // same pack's tasks twice.
  const requested = packNames
    .split(",")
    .map((s5) => s5.trim())
    .filter((s5) => s5.length > 0);
  const packs = [...new Set(requested)];
  if (packs.length === 0) {
    console.error('No pack names given. Use "--pack list" to see available packs.');
    process.exit(1);
  }

  const userArena = await loadArenaConfig(configOpt);
  const packConfig = buildPackConfig({
    packs,
    providers: userArena.config.providers,
    runs: userArena.config.runs,
    timeout: userArena.config.timeout
  });
  return defineArena(packConfig);
}
|
|
7977
10199
|
async function loadArenaConfig(configOpt) {
|
|
7978
10200
|
const configPath = resolve(configOpt);
|
|
7979
10201
|
if (!existsSync(configPath)) {
|