vitest-evals 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/README.md +27 -35
  2. package/dist/harness.d.mts +15 -20
  3. package/dist/harness.d.ts +15 -20
  4. package/dist/harness.js +0 -1
  5. package/dist/harness.js.map +1 -1
  6. package/dist/harness.mjs +0 -1
  7. package/dist/harness.mjs.map +1 -1
  8. package/dist/index.d.mts +45 -68
  9. package/dist/index.d.ts +45 -68
  10. package/dist/index.js +21 -40
  11. package/dist/index.js.map +1 -1
  12. package/dist/index.mjs +21 -40
  13. package/dist/index.mjs.map +1 -1
  14. package/dist/internal/toolCallScorer.js.map +1 -1
  15. package/dist/internal/toolCallScorer.mjs.map +1 -1
  16. package/dist/judges/factualityJudge.d.mts +14 -13
  17. package/dist/judges/factualityJudge.d.ts +14 -13
  18. package/dist/judges/factualityJudge.js +9 -9
  19. package/dist/judges/factualityJudge.js.map +1 -1
  20. package/dist/judges/factualityJudge.mjs +9 -9
  21. package/dist/judges/factualityJudge.mjs.map +1 -1
  22. package/dist/judges/index.js +17 -20
  23. package/dist/judges/index.js.map +1 -1
  24. package/dist/judges/index.mjs +17 -20
  25. package/dist/judges/index.mjs.map +1 -1
  26. package/dist/judges/judgeHarness.d.mts +6 -10
  27. package/dist/judges/judgeHarness.d.ts +6 -10
  28. package/dist/judges/judgeHarness.js +3 -8
  29. package/dist/judges/judgeHarness.js.map +1 -1
  30. package/dist/judges/judgeHarness.mjs +3 -8
  31. package/dist/judges/judgeHarness.mjs.map +1 -1
  32. package/dist/judges/structuredOutputJudge.d.mts +7 -9
  33. package/dist/judges/structuredOutputJudge.d.ts +7 -9
  34. package/dist/judges/structuredOutputJudge.js +3 -3
  35. package/dist/judges/structuredOutputJudge.js.map +1 -1
  36. package/dist/judges/structuredOutputJudge.mjs +3 -3
  37. package/dist/judges/structuredOutputJudge.mjs.map +1 -1
  38. package/dist/judges/toolCallJudge.d.mts +12 -9
  39. package/dist/judges/toolCallJudge.d.ts +12 -9
  40. package/dist/judges/toolCallJudge.js +3 -3
  41. package/dist/judges/toolCallJudge.js.map +1 -1
  42. package/dist/judges/toolCallJudge.mjs +3 -3
  43. package/dist/judges/toolCallJudge.mjs.map +1 -1
  44. package/dist/judges/types.d.mts +13 -24
  45. package/dist/judges/types.d.ts +13 -24
  46. package/dist/judges/types.js.map +1 -1
  47. package/dist/legacy/scorers/index.js.map +1 -1
  48. package/dist/legacy/scorers/index.mjs.map +1 -1
  49. package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
  50. package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
  51. package/dist/legacy.js.map +1 -1
  52. package/dist/legacy.mjs.map +1 -1
  53. package/dist/reporter.js.map +1 -1
  54. package/dist/reporter.mjs.map +1 -1
  55. package/package.json +3 -3
package/dist/index.mjs CHANGED
@@ -92,7 +92,6 @@ function createHarness(options) {
92
92
  try {
93
93
  const result = await options.run({
94
94
  input,
95
- metadata: context.metadata,
96
95
  signal: context.signal,
97
96
  artifacts: context.artifacts,
98
97
  setArtifact: context.setArtifact
@@ -483,17 +482,14 @@ function serializeError(error) {
483
482
  function createJudgeHarness(options) {
484
483
  return createHarness({
485
484
  name: options.name ?? "judge-harness",
486
- run: async ({ input, signal, metadata }) => {
487
- return normalizeJudgeHarnessResult(
488
- await options.run(input, { signal, metadata })
489
- );
485
+ run: async ({ input, signal }) => {
486
+ return normalizeJudgeHarnessResult(await options.run(input, { signal }));
490
487
  }
491
488
  });
492
489
  }
493
490
  async function runJudgeHarness(judgeHarness, input, options = {}) {
494
491
  const artifacts = {};
495
492
  const run = await judgeHarness.run(input, {
496
- metadata: options.metadata ?? {},
497
493
  signal: options.signal,
498
494
  artifacts,
499
495
  setArtifact: (name, value) => {
@@ -507,8 +503,7 @@ function createRunJudge(judgeHarness, signal) {
507
503
  return void 0;
508
504
  }
509
505
  return (input, options) => runJudgeHarness(judgeHarness, input, {
510
- metadata: options?.metadata,
511
- signal
506
+ signal: options?.signal ?? signal
512
507
  });
513
508
  }
514
509
  function normalizeJudgeHarnessResult(result) {
@@ -586,22 +581,24 @@ function FactualityJudge(config = {}) {
586
581
  return {
587
582
  name: config.name ?? "FactualityJudge",
588
583
  judgeHarness,
589
- assess: (opts) => assessFactuality(opts, judgeHarness)
584
+ assess: (opts) => assessFactuality(opts, {
585
+ expected: config.expected,
586
+ judgeHarness
587
+ })
590
588
  };
591
589
  }
592
- async function assessFactuality(opts, configuredJudgeHarness) {
593
- const metadata = opts.metadata;
594
- const expected = opts.expected === void 0 ? metadata.expected : opts.expected;
590
+ async function assessFactuality(opts, config) {
591
+ const expected = opts.expected ?? config.expected;
595
592
  if (isMissingExpectedAnswer(expected)) {
596
593
  return {
597
594
  score: 0,
598
595
  metadata: {
599
- rationale: "FactualityJudge requires a non-empty expert answer in `expected` or `metadata.expected`."
596
+ rationale: "FactualityJudge requires a non-empty expert answer in `expected` or FactualityJudge(...) config."
600
597
  }
601
598
  };
602
599
  }
603
600
  const runJudge = opts.runJudge ?? createRunJudge(
604
- configuredJudgeHarness,
601
+ config.judgeHarness,
605
602
  opts.signal
606
603
  );
607
604
  if (!runJudge) {
@@ -1009,15 +1006,15 @@ function StructuredOutputScorer(config = {}) {
1009
1006
 
1010
1007
  // src/judges/structuredOutputJudge.ts
1011
1008
  function StructuredOutputJudge(config = {}) {
1012
- const scorer = StructuredOutputScorer(config);
1009
+ const { expected, ...scorerConfig } = config;
1010
+ const scorer = StructuredOutputScorer(scorerConfig);
1013
1011
  return {
1014
1012
  name: "StructuredOutputJudge",
1015
1013
  assess: (opts) => {
1016
- const metadata = opts.metadata;
1017
1014
  return scorer({
1018
1015
  ...opts,
1019
1016
  input: formatStructuredOutput(opts.input),
1020
- expected: opts.expected ?? metadata.expected,
1017
+ expected: opts.expected ?? expected,
1021
1018
  output: formatStructuredOutput(opts.output)
1022
1019
  });
1023
1020
  }
@@ -1227,17 +1224,17 @@ function evaluateUnorderedTools(expected, actual, options) {
1227
1224
 
1228
1225
  // src/judges/toolCallJudge.ts
1229
1226
  function ToolCallJudge(config = {}) {
1230
- const scorer = ToolCallScorer(config);
1227
+ const { expectedTools, ...scorerConfig } = config;
1228
+ const scorer = ToolCallScorer(scorerConfig);
1231
1229
  return {
1232
1230
  name: "ToolCallJudge",
1233
1231
  assess: (opts) => {
1234
- const metadata = opts.metadata;
1235
1232
  return scorer({
1236
1233
  ...opts,
1237
1234
  input: formatJudgeValue2(opts.input),
1238
1235
  output: formatJudgeValue2(opts.output),
1239
1236
  expectedTools: normalizeExpectedTools(
1240
- opts.expectedTools ?? metadata.expectedTools
1237
+ opts.expectedTools ?? expectedTools
1241
1238
  )
1242
1239
  });
1243
1240
  }
@@ -1268,10 +1265,7 @@ var evalTest = test.extend("harness", async () => {
1268
1265
  throw new Error(
1269
1266
  "describeEval must override the harness fixture before running tests."
1270
1267
  );
1271
- }).extend(
1272
- "automaticJudges",
1273
- []
1274
- ).extend("judgeThreshold", void 0).extend("judgeHarness", void 0).extend("explicitJudgeHarness", void 0).extend(
1268
+ }).extend("automaticJudges", []).extend("judgeThreshold", void 0).extend("judgeHarness", void 0).extend("explicitJudgeHarness", void 0).extend(
1275
1269
  "run",
1276
1270
  async ({
1277
1271
  automaticJudges,
@@ -1282,12 +1276,10 @@ var evalTest = test.extend("harness", async () => {
1282
1276
  signal,
1283
1277
  task
1284
1278
  }) => {
1285
- return async (input, options) => {
1279
+ return async (input) => {
1286
1280
  const resolvedHarness = harness;
1287
- const metadata = createMetadata(options?.metadata);
1288
1281
  const artifacts = {};
1289
1282
  const context = {
1290
- metadata,
1291
1283
  signal,
1292
1284
  artifacts,
1293
1285
  setArtifact: (artifactName, value) => {
@@ -1317,7 +1309,6 @@ var evalTest = test.extend("harness", async () => {
1317
1309
  resolvedHarness,
1318
1310
  input,
1319
1311
  explicitJudgeHarness,
1320
- metadata,
1321
1312
  signal
1322
1313
  );
1323
1314
  }
@@ -1336,7 +1327,6 @@ var evalTest = test.extend("harness", async () => {
1336
1327
  resolvedHarness,
1337
1328
  input,
1338
1329
  explicitJudgeHarness,
1339
- metadata,
1340
1330
  signal
1341
1331
  );
1342
1332
  }
@@ -1356,7 +1346,6 @@ var evalTest = test.extend("harness", async () => {
1356
1346
  resolvedHarness,
1357
1347
  input,
1358
1348
  explicitJudgeHarness,
1359
- metadata,
1360
1349
  signal
1361
1350
  );
1362
1351
  if (automaticJudges.length > 0) {
@@ -1367,7 +1356,6 @@ var evalTest = test.extend("harness", async () => {
1367
1356
  resolvedHarness,
1368
1357
  input,
1369
1358
  judgeHarness,
1370
- metadata,
1371
1359
  run,
1372
1360
  signal
1373
1361
  );
@@ -1434,10 +1422,7 @@ function describeEval(name, options, define) {
1434
1422
  define(it);
1435
1423
  });
1436
1424
  }
1437
- function createMetadata(metadata) {
1438
- return { ...metadata ?? {} };
1439
- }
1440
- async function applyAutomaticJudges(task, judges, threshold, harness, input, judgeHarness, metadata, run, signal) {
1425
+ async function applyAutomaticJudges(task, judges, threshold, harness, input, judgeHarness, run, signal) {
1441
1426
  const runToolCalls = toolCalls2(run.session);
1442
1427
  const scores = await Promise.all(
1443
1428
  judges.map((judge) => {
@@ -1449,7 +1434,6 @@ async function applyAutomaticJudges(task, judges, threshold, harness, input, jud
1449
1434
  input,
1450
1435
  output: run.output,
1451
1436
  toolCalls: runToolCalls,
1452
- metadata,
1453
1437
  run,
1454
1438
  session: run.session,
1455
1439
  signal,
@@ -1494,12 +1478,11 @@ function setHarnessMeta(task, name, run) {
1494
1478
  run
1495
1479
  };
1496
1480
  }
1497
- function recordJudgeRunContext(run, harness, input, judgeHarness, metadata, signal) {
1481
+ function recordJudgeRunContext(run, harness, input, judgeHarness, signal) {
1498
1482
  const context = {
1499
1483
  harness,
1500
1484
  input,
1501
1485
  judgeHarness,
1502
- metadata,
1503
1486
  run,
1504
1487
  signal
1505
1488
  };
@@ -1569,7 +1552,6 @@ function buildJudgeAssertionOptions(received, judge, options, task) {
1569
1552
  const judgeHarness = options.judgeHarness ?? resolveJudgeHarnessForJudge(judge, registeredContext?.judgeHarness);
1570
1553
  const runJudge = createRunJudge(judgeHarness, registeredContext?.signal);
1571
1554
  const signal = registeredContext?.signal;
1572
- const metadata = options.metadata ?? registeredContext?.metadata ?? {};
1573
1555
  const input = options.input ?? registeredContext?.input ?? void 0;
1574
1556
  const contextualOptions = {
1575
1557
  ...options,
@@ -1592,7 +1574,6 @@ function buildJudgeAssertionOptions(received, judge, options, task) {
1592
1574
  ...judgeParams,
1593
1575
  input: resolvedInput,
1594
1576
  output,
1595
- metadata,
1596
1577
  run,
1597
1578
  session: options.session ?? run.session,
1598
1579
  signal,