@deepagents/evals 0.20.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -330,8 +330,10 @@ function dataset(source) {
330
330
  }
331
331
 
332
332
  // packages/evals/src/scorers/index.ts
333
- import { generateObject } from "ai";
334
- import { z } from "zod";
333
+ import {
334
+ Factuality as AutoevalsFactuality,
335
+ Levenshtein as AutoevalsLevenshtein
336
+ } from "autoevals";
335
337
  var exactMatch = async ({ output, expected }) => {
336
338
  const exp = expected == null ? "" : String(expected);
337
339
  if (output === exp) return { score: 1 };
@@ -353,32 +355,32 @@ function regex(pattern) {
353
355
  return { score: pattern.test(output) ? 1 : 0 };
354
356
  };
355
357
  }
356
- function levenshteinDistance(a, b) {
357
- if (a.length === 0) return b.length;
358
- if (b.length === 0) return a.length;
359
- if (a.length > b.length) [a, b] = [b, a];
360
- let prev = Array.from({ length: a.length + 1 }, (_, i) => i);
361
- let curr = new Array(a.length + 1);
362
- for (let j = 1; j <= b.length; j++) {
363
- curr[0] = j;
364
- for (let i = 1; i <= a.length; i++) {
365
- const cost = a[i - 1] === b[j - 1] ? 0 : 1;
366
- curr[i] = Math.min(prev[i] + 1, curr[i - 1] + 1, prev[i - 1] + cost);
358
+ function normalizeScore(score) {
359
+ if (typeof score !== "number" || !Number.isFinite(score)) return 0;
360
+ return Math.max(0, Math.min(1, score));
361
+ }
362
+ function reasonFromMetadata(metadata) {
363
+ if (!metadata) return void 0;
364
+ const candidates = [
365
+ metadata.reason,
366
+ metadata.rationale,
367
+ metadata.explanation
368
+ ];
369
+ for (const candidate of candidates) {
370
+ if (typeof candidate === "string" && candidate.trim().length > 0) {
371
+ return candidate;
367
372
  }
368
- [prev, curr] = [curr, prev];
369
373
  }
370
- return prev[a.length];
374
+ return void 0;
371
375
  }
372
376
  var levenshtein = async ({ output, expected }) => {
373
377
  const exp = expected == null ? "" : String(expected);
374
- if (output.length === 0 && exp.length === 0) return { score: 1 };
375
- const maxLen = Math.max(output.length, exp.length);
376
- const distance = levenshteinDistance(output, exp);
377
- const score = Math.max(0, 1 - distance / maxLen);
378
- if (score === 1) return { score };
378
+ const result = await AutoevalsLevenshtein({ output, expected: exp });
379
+ const score = normalizeScore(result.score);
379
380
  return {
380
381
  score,
381
- reason: `Levenshtein distance is ${distance} across max length ${maxLen}.`
382
+ reason: reasonFromMetadata(result.metadata),
383
+ metadata: result.metadata
382
384
  };
383
385
  };
384
386
  function deepEqual(a, b) {
@@ -412,42 +414,19 @@ var jsonMatch = async ({ output, expected }) => {
412
414
  return { score: 0, reason: "Failed to parse JSON" };
413
415
  }
414
416
  };
415
- var llmScorerSchema = z.object({
416
- score: z.number().min(0).max(1),
417
- reason: z.string()
418
- });
419
- function llmJudge(config) {
420
- return async ({ input, output, expected }) => {
421
- const { object } = await generateObject({
422
- model: config.model,
423
- schema: llmScorerSchema,
424
- prompt: `You are an expert evaluator. Grade the output based on the following criteria:
425
- ${config.criteria}
426
-
427
- Input: ${JSON.stringify(input)}
428
- Output: ${output}
429
- ${expected != null ? `Expected: ${JSON.stringify(expected)}` : ""}
430
-
431
- Return a score from 0.0 to 1.0 and a brief reason.`
432
- });
433
- return { score: object.score, reason: object.reason };
434
- };
435
- }
436
417
  function factuality(config) {
437
418
  return async ({ input, output, expected }) => {
438
- const { object } = await generateObject({
419
+ const result = await AutoevalsFactuality({
439
420
  model: config.model,
440
- schema: llmScorerSchema,
441
- prompt: `You are a factuality evaluator. Determine whether the output is factually consistent with the expected reference.
442
-
443
- Input: ${JSON.stringify(input)}
444
- Output: ${output}
445
- Expected reference: ${JSON.stringify(expected)}
446
-
447
- Score 1.0 if the output is factually consistent with the reference, 0.0 if it contradicts it. Use intermediate scores for partial consistency.
448
- Return a score from 0.0 to 1.0 and a brief reason.`
421
+ input: typeof input === "string" ? input : JSON.stringify(input),
422
+ output,
423
+ expected: expected == null ? void 0 : String(expected)
449
424
  });
450
- return { score: object.score, reason: object.reason };
425
+ return {
426
+ score: normalizeScore(result.score),
427
+ reason: reasonFromMetadata(result.metadata),
428
+ metadata: result.metadata
429
+ };
451
430
  };
452
431
  }
453
432
  function all(...scorers) {
@@ -1042,7 +1021,8 @@ async function runEval(config) {
1042
1021
  });
1043
1022
  scores[sName] = {
1044
1023
  score: clampScore(sr.score, sName),
1045
- reason: sr.reason
1024
+ reason: sr.reason,
1025
+ metadata: sr.metadata
1046
1026
  };
1047
1027
  }
1048
1028
  trialResults.push({ result, scores });
@@ -1068,7 +1048,8 @@ async function runEval(config) {
1068
1048
  const meanScore = trialResults.reduce((sum, t) => sum + t.scores[sName].score, 0) / trials;
1069
1049
  finalScores[sName] = {
1070
1050
  score: meanScore,
1071
- reason: trialResults[trialResults.length - 1].scores[sName]?.reason
1051
+ reason: trialResults[trialResults.length - 1].scores[sName]?.reason,
1052
+ metadata: trialResults[trialResults.length - 1].scores[sName]?.metadata
1072
1053
  };
1073
1054
  }
1074
1055
  } else {
@@ -1085,7 +1066,8 @@ async function runEval(config) {
1085
1066
  });
1086
1067
  finalScores[sName] = {
1087
1068
  score: clampScore(sr.score, sName),
1088
- reason: sr.reason
1069
+ reason: sr.reason,
1070
+ metadata: sr.metadata
1089
1071
  };
1090
1072
  }
1091
1073
  }
@@ -1455,6 +1437,22 @@ function truncateString(text, maxLength) {
1455
1437
  if (text.length <= maxLength) return text;
1456
1438
  return text.slice(0, maxLength) + "\u2026";
1457
1439
  }
1440
+ function stringifyRationale(value) {
1441
+ if (typeof value === "string") {
1442
+ const trimmed = value.trim();
1443
+ return trimmed.length > 0 ? trimmed : void 0;
1444
+ }
1445
+ if (Array.isArray(value)) {
1446
+ const parts = value.map((item) => typeof item === "string" ? item.trim() : "").filter(Boolean);
1447
+ if (parts.length > 0) return parts.join(" | ");
1448
+ }
1449
+ return void 0;
1450
+ }
1451
+ function scoreReasonWithMetadata(score) {
1452
+ const reason = score.reason?.trim();
1453
+ if (reason) return reason;
1454
+ return stringifyRationale(score.metadata?.["rationale"]);
1455
+ }
1458
1456
  function renderSummaryTable(data) {
1459
1457
  const { summary } = data;
1460
1458
  const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
@@ -1516,7 +1514,8 @@ function renderCaseDetail(c, threshold, options) {
1516
1514
  }
1517
1515
  for (const [name, s] of entries) {
1518
1516
  const scoreColor = s.score >= threshold ? chalk.green : chalk.red;
1519
- const reasonStr = s.reason ? ` \u2014 ${s.reason}` : "";
1517
+ const reason = scoreReasonWithMetadata(s);
1518
+ const reasonStr = reason ? ` \u2014 ${reason}` : "";
1520
1519
  console.log(
1521
1520
  ` ${chalk.dim(name + ":")} ${scoreColor(s.score.toFixed(3))}${reasonStr}`
1522
1521
  );
@@ -2073,7 +2072,6 @@ export {
2073
2072
  jsonMatch,
2074
2073
  jsonReporter,
2075
2074
  levenshtein,
2076
- llmJudge,
2077
2075
  markdownReporter,
2078
2076
  parseRecordSelection,
2079
2077
  pickFromArray,