vitest-evals 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -35
- package/dist/harness.d.mts +15 -20
- package/dist/harness.d.ts +15 -20
- package/dist/harness.js +0 -1
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs +0 -1
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +45 -68
- package/dist/index.d.ts +45 -68
- package/dist/index.js +21 -40
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +21 -40
- package/dist/index.mjs.map +1 -1
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/factualityJudge.d.mts +14 -13
- package/dist/judges/factualityJudge.d.ts +14 -13
- package/dist/judges/factualityJudge.js +9 -9
- package/dist/judges/factualityJudge.js.map +1 -1
- package/dist/judges/factualityJudge.mjs +9 -9
- package/dist/judges/factualityJudge.mjs.map +1 -1
- package/dist/judges/index.js +17 -20
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +17 -20
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/judgeHarness.d.mts +6 -10
- package/dist/judges/judgeHarness.d.ts +6 -10
- package/dist/judges/judgeHarness.js +3 -8
- package/dist/judges/judgeHarness.js.map +1 -1
- package/dist/judges/judgeHarness.mjs +3 -8
- package/dist/judges/judgeHarness.mjs.map +1 -1
- package/dist/judges/structuredOutputJudge.d.mts +7 -9
- package/dist/judges/structuredOutputJudge.d.ts +7 -9
- package/dist/judges/structuredOutputJudge.js +3 -3
- package/dist/judges/structuredOutputJudge.js.map +1 -1
- package/dist/judges/structuredOutputJudge.mjs +3 -3
- package/dist/judges/structuredOutputJudge.mjs.map +1 -1
- package/dist/judges/toolCallJudge.d.mts +12 -9
- package/dist/judges/toolCallJudge.d.ts +12 -9
- package/dist/judges/toolCallJudge.js +3 -3
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +3 -3
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +13 -24
- package/dist/judges/types.d.ts +13 -24
- package/dist/judges/types.js.map +1 -1
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs.map +1 -1
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs.map +1 -1
- package/package.json +3 -3
package/dist/index.mjs
CHANGED
|
@@ -92,7 +92,6 @@ function createHarness(options) {
|
|
|
92
92
|
try {
|
|
93
93
|
const result = await options.run({
|
|
94
94
|
input,
|
|
95
|
-
metadata: context.metadata,
|
|
96
95
|
signal: context.signal,
|
|
97
96
|
artifacts: context.artifacts,
|
|
98
97
|
setArtifact: context.setArtifact
|
|
@@ -483,17 +482,14 @@ function serializeError(error) {
|
|
|
483
482
|
function createJudgeHarness(options) {
|
|
484
483
|
return createHarness({
|
|
485
484
|
name: options.name ?? "judge-harness",
|
|
486
|
-
run: async ({ input, signal
|
|
487
|
-
return normalizeJudgeHarnessResult(
|
|
488
|
-
await options.run(input, { signal, metadata })
|
|
489
|
-
);
|
|
485
|
+
run: async ({ input, signal }) => {
|
|
486
|
+
return normalizeJudgeHarnessResult(await options.run(input, { signal }));
|
|
490
487
|
}
|
|
491
488
|
});
|
|
492
489
|
}
|
|
493
490
|
async function runJudgeHarness(judgeHarness, input, options = {}) {
|
|
494
491
|
const artifacts = {};
|
|
495
492
|
const run = await judgeHarness.run(input, {
|
|
496
|
-
metadata: options.metadata ?? {},
|
|
497
493
|
signal: options.signal,
|
|
498
494
|
artifacts,
|
|
499
495
|
setArtifact: (name, value) => {
|
|
@@ -507,8 +503,7 @@ function createRunJudge(judgeHarness, signal) {
|
|
|
507
503
|
return void 0;
|
|
508
504
|
}
|
|
509
505
|
return (input, options) => runJudgeHarness(judgeHarness, input, {
|
|
510
|
-
|
|
511
|
-
signal
|
|
506
|
+
signal: options?.signal ?? signal
|
|
512
507
|
});
|
|
513
508
|
}
|
|
514
509
|
function normalizeJudgeHarnessResult(result) {
|
|
@@ -586,22 +581,24 @@ function FactualityJudge(config = {}) {
|
|
|
586
581
|
return {
|
|
587
582
|
name: config.name ?? "FactualityJudge",
|
|
588
583
|
judgeHarness,
|
|
589
|
-
assess: (opts) => assessFactuality(opts,
|
|
584
|
+
assess: (opts) => assessFactuality(opts, {
|
|
585
|
+
expected: config.expected,
|
|
586
|
+
judgeHarness
|
|
587
|
+
})
|
|
590
588
|
};
|
|
591
589
|
}
|
|
592
|
-
async function assessFactuality(opts,
|
|
593
|
-
const
|
|
594
|
-
const expected = opts.expected === void 0 ? metadata.expected : opts.expected;
|
|
590
|
+
async function assessFactuality(opts, config) {
|
|
591
|
+
const expected = opts.expected ?? config.expected;
|
|
595
592
|
if (isMissingExpectedAnswer(expected)) {
|
|
596
593
|
return {
|
|
597
594
|
score: 0,
|
|
598
595
|
metadata: {
|
|
599
|
-
rationale: "FactualityJudge requires a non-empty expert answer in `expected` or
|
|
596
|
+
rationale: "FactualityJudge requires a non-empty expert answer in `expected` or FactualityJudge(...) config."
|
|
600
597
|
}
|
|
601
598
|
};
|
|
602
599
|
}
|
|
603
600
|
const runJudge = opts.runJudge ?? createRunJudge(
|
|
604
|
-
|
|
601
|
+
config.judgeHarness,
|
|
605
602
|
opts.signal
|
|
606
603
|
);
|
|
607
604
|
if (!runJudge) {
|
|
@@ -1009,15 +1006,15 @@ function StructuredOutputScorer(config = {}) {
|
|
|
1009
1006
|
|
|
1010
1007
|
// src/judges/structuredOutputJudge.ts
|
|
1011
1008
|
function StructuredOutputJudge(config = {}) {
|
|
1012
|
-
const
|
|
1009
|
+
const { expected, ...scorerConfig } = config;
|
|
1010
|
+
const scorer = StructuredOutputScorer(scorerConfig);
|
|
1013
1011
|
return {
|
|
1014
1012
|
name: "StructuredOutputJudge",
|
|
1015
1013
|
assess: (opts) => {
|
|
1016
|
-
const metadata = opts.metadata;
|
|
1017
1014
|
return scorer({
|
|
1018
1015
|
...opts,
|
|
1019
1016
|
input: formatStructuredOutput(opts.input),
|
|
1020
|
-
expected: opts.expected ??
|
|
1017
|
+
expected: opts.expected ?? expected,
|
|
1021
1018
|
output: formatStructuredOutput(opts.output)
|
|
1022
1019
|
});
|
|
1023
1020
|
}
|
|
@@ -1227,17 +1224,17 @@ function evaluateUnorderedTools(expected, actual, options) {
|
|
|
1227
1224
|
|
|
1228
1225
|
// src/judges/toolCallJudge.ts
|
|
1229
1226
|
function ToolCallJudge(config = {}) {
|
|
1230
|
-
const
|
|
1227
|
+
const { expectedTools, ...scorerConfig } = config;
|
|
1228
|
+
const scorer = ToolCallScorer(scorerConfig);
|
|
1231
1229
|
return {
|
|
1232
1230
|
name: "ToolCallJudge",
|
|
1233
1231
|
assess: (opts) => {
|
|
1234
|
-
const metadata = opts.metadata;
|
|
1235
1232
|
return scorer({
|
|
1236
1233
|
...opts,
|
|
1237
1234
|
input: formatJudgeValue2(opts.input),
|
|
1238
1235
|
output: formatJudgeValue2(opts.output),
|
|
1239
1236
|
expectedTools: normalizeExpectedTools(
|
|
1240
|
-
opts.expectedTools ??
|
|
1237
|
+
opts.expectedTools ?? expectedTools
|
|
1241
1238
|
)
|
|
1242
1239
|
});
|
|
1243
1240
|
}
|
|
@@ -1268,10 +1265,7 @@ var evalTest = test.extend("harness", async () => {
|
|
|
1268
1265
|
throw new Error(
|
|
1269
1266
|
"describeEval must override the harness fixture before running tests."
|
|
1270
1267
|
);
|
|
1271
|
-
}).extend(
|
|
1272
|
-
"automaticJudges",
|
|
1273
|
-
[]
|
|
1274
|
-
).extend("judgeThreshold", void 0).extend("judgeHarness", void 0).extend("explicitJudgeHarness", void 0).extend(
|
|
1268
|
+
}).extend("automaticJudges", []).extend("judgeThreshold", void 0).extend("judgeHarness", void 0).extend("explicitJudgeHarness", void 0).extend(
|
|
1275
1269
|
"run",
|
|
1276
1270
|
async ({
|
|
1277
1271
|
automaticJudges,
|
|
@@ -1282,12 +1276,10 @@ var evalTest = test.extend("harness", async () => {
|
|
|
1282
1276
|
signal,
|
|
1283
1277
|
task
|
|
1284
1278
|
}) => {
|
|
1285
|
-
return async (input
|
|
1279
|
+
return async (input) => {
|
|
1286
1280
|
const resolvedHarness = harness;
|
|
1287
|
-
const metadata = createMetadata(options?.metadata);
|
|
1288
1281
|
const artifacts = {};
|
|
1289
1282
|
const context = {
|
|
1290
|
-
metadata,
|
|
1291
1283
|
signal,
|
|
1292
1284
|
artifacts,
|
|
1293
1285
|
setArtifact: (artifactName, value) => {
|
|
@@ -1317,7 +1309,6 @@ var evalTest = test.extend("harness", async () => {
|
|
|
1317
1309
|
resolvedHarness,
|
|
1318
1310
|
input,
|
|
1319
1311
|
explicitJudgeHarness,
|
|
1320
|
-
metadata,
|
|
1321
1312
|
signal
|
|
1322
1313
|
);
|
|
1323
1314
|
}
|
|
@@ -1336,7 +1327,6 @@ var evalTest = test.extend("harness", async () => {
|
|
|
1336
1327
|
resolvedHarness,
|
|
1337
1328
|
input,
|
|
1338
1329
|
explicitJudgeHarness,
|
|
1339
|
-
metadata,
|
|
1340
1330
|
signal
|
|
1341
1331
|
);
|
|
1342
1332
|
}
|
|
@@ -1356,7 +1346,6 @@ var evalTest = test.extend("harness", async () => {
|
|
|
1356
1346
|
resolvedHarness,
|
|
1357
1347
|
input,
|
|
1358
1348
|
explicitJudgeHarness,
|
|
1359
|
-
metadata,
|
|
1360
1349
|
signal
|
|
1361
1350
|
);
|
|
1362
1351
|
if (automaticJudges.length > 0) {
|
|
@@ -1367,7 +1356,6 @@ var evalTest = test.extend("harness", async () => {
|
|
|
1367
1356
|
resolvedHarness,
|
|
1368
1357
|
input,
|
|
1369
1358
|
judgeHarness,
|
|
1370
|
-
metadata,
|
|
1371
1359
|
run,
|
|
1372
1360
|
signal
|
|
1373
1361
|
);
|
|
@@ -1434,10 +1422,7 @@ function describeEval(name, options, define) {
|
|
|
1434
1422
|
define(it);
|
|
1435
1423
|
});
|
|
1436
1424
|
}
|
|
1437
|
-
function
|
|
1438
|
-
return { ...metadata ?? {} };
|
|
1439
|
-
}
|
|
1440
|
-
async function applyAutomaticJudges(task, judges, threshold, harness, input, judgeHarness, metadata, run, signal) {
|
|
1425
|
+
async function applyAutomaticJudges(task, judges, threshold, harness, input, judgeHarness, run, signal) {
|
|
1441
1426
|
const runToolCalls = toolCalls2(run.session);
|
|
1442
1427
|
const scores = await Promise.all(
|
|
1443
1428
|
judges.map((judge) => {
|
|
@@ -1449,7 +1434,6 @@ async function applyAutomaticJudges(task, judges, threshold, harness, input, jud
|
|
|
1449
1434
|
input,
|
|
1450
1435
|
output: run.output,
|
|
1451
1436
|
toolCalls: runToolCalls,
|
|
1452
|
-
metadata,
|
|
1453
1437
|
run,
|
|
1454
1438
|
session: run.session,
|
|
1455
1439
|
signal,
|
|
@@ -1494,12 +1478,11 @@ function setHarnessMeta(task, name, run) {
|
|
|
1494
1478
|
run
|
|
1495
1479
|
};
|
|
1496
1480
|
}
|
|
1497
|
-
function recordJudgeRunContext(run, harness, input, judgeHarness,
|
|
1481
|
+
function recordJudgeRunContext(run, harness, input, judgeHarness, signal) {
|
|
1498
1482
|
const context = {
|
|
1499
1483
|
harness,
|
|
1500
1484
|
input,
|
|
1501
1485
|
judgeHarness,
|
|
1502
|
-
metadata,
|
|
1503
1486
|
run,
|
|
1504
1487
|
signal
|
|
1505
1488
|
};
|
|
@@ -1569,7 +1552,6 @@ function buildJudgeAssertionOptions(received, judge, options, task) {
|
|
|
1569
1552
|
const judgeHarness = options.judgeHarness ?? resolveJudgeHarnessForJudge(judge, registeredContext?.judgeHarness);
|
|
1570
1553
|
const runJudge = createRunJudge(judgeHarness, registeredContext?.signal);
|
|
1571
1554
|
const signal = registeredContext?.signal;
|
|
1572
|
-
const metadata = options.metadata ?? registeredContext?.metadata ?? {};
|
|
1573
1555
|
const input = options.input ?? registeredContext?.input ?? void 0;
|
|
1574
1556
|
const contextualOptions = {
|
|
1575
1557
|
...options,
|
|
@@ -1592,7 +1574,6 @@ function buildJudgeAssertionOptions(received, judge, options, task) {
|
|
|
1592
1574
|
...judgeParams,
|
|
1593
1575
|
input: resolvedInput,
|
|
1594
1576
|
output,
|
|
1595
|
-
metadata,
|
|
1596
1577
|
run,
|
|
1597
1578
|
session: options.session ?? run.session,
|
|
1598
1579
|
signal,
|