@deepagents/evals 0.20.0 → 0.23.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -330,8 +330,10 @@ function dataset(source) {
330
330
  }
331
331
 
332
332
  // packages/evals/src/scorers/index.ts
333
- import { generateObject } from "ai";
334
- import { z } from "zod";
333
+ import {
334
+ Factuality as AutoevalsFactuality,
335
+ Levenshtein as AutoevalsLevenshtein
336
+ } from "autoevals";
335
337
  var exactMatch = async ({ output, expected }) => {
336
338
  const exp = expected == null ? "" : String(expected);
337
339
  if (output === exp) return { score: 1 };
@@ -353,32 +355,32 @@ function regex(pattern) {
353
355
  return { score: pattern.test(output) ? 1 : 0 };
354
356
  };
355
357
  }
356
- function levenshteinDistance(a, b) {
357
- if (a.length === 0) return b.length;
358
- if (b.length === 0) return a.length;
359
- if (a.length > b.length) [a, b] = [b, a];
360
- let prev = Array.from({ length: a.length + 1 }, (_, i) => i);
361
- let curr = new Array(a.length + 1);
362
- for (let j = 1; j <= b.length; j++) {
363
- curr[0] = j;
364
- for (let i = 1; i <= a.length; i++) {
365
- const cost = a[i - 1] === b[j - 1] ? 0 : 1;
366
- curr[i] = Math.min(prev[i] + 1, curr[i - 1] + 1, prev[i - 1] + cost);
358
+ function normalizeScore(score) {
359
+ if (typeof score !== "number" || !Number.isFinite(score)) return 0;
360
+ return Math.max(0, Math.min(1, score));
361
+ }
362
+ function reasonFromMetadata(metadata) {
363
+ if (!metadata) return void 0;
364
+ const candidates = [
365
+ metadata.reason,
366
+ metadata.rationale,
367
+ metadata.explanation
368
+ ];
369
+ for (const candidate of candidates) {
370
+ if (typeof candidate === "string" && candidate.trim().length > 0) {
371
+ return candidate;
367
372
  }
368
- [prev, curr] = [curr, prev];
369
373
  }
370
- return prev[a.length];
374
+ return void 0;
371
375
  }
372
376
  var levenshtein = async ({ output, expected }) => {
373
377
  const exp = expected == null ? "" : String(expected);
374
- if (output.length === 0 && exp.length === 0) return { score: 1 };
375
- const maxLen = Math.max(output.length, exp.length);
376
- const distance = levenshteinDistance(output, exp);
377
- const score = Math.max(0, 1 - distance / maxLen);
378
- if (score === 1) return { score };
378
+ const result = await AutoevalsLevenshtein({ output, expected: exp });
379
+ const score = normalizeScore(result.score);
379
380
  return {
380
381
  score,
381
- reason: `Levenshtein distance is ${distance} across max length ${maxLen}.`
382
+ reason: reasonFromMetadata(result.metadata),
383
+ metadata: result.metadata
382
384
  };
383
385
  };
384
386
  function deepEqual(a, b) {
@@ -412,42 +414,19 @@ var jsonMatch = async ({ output, expected }) => {
412
414
  return { score: 0, reason: "Failed to parse JSON" };
413
415
  }
414
416
  };
415
- var llmScorerSchema = z.object({
416
- score: z.number().min(0).max(1),
417
- reason: z.string()
418
- });
419
- function llmJudge(config) {
420
- return async ({ input, output, expected }) => {
421
- const { object } = await generateObject({
422
- model: config.model,
423
- schema: llmScorerSchema,
424
- prompt: `You are an expert evaluator. Grade the output based on the following criteria:
425
- ${config.criteria}
426
-
427
- Input: ${JSON.stringify(input)}
428
- Output: ${output}
429
- ${expected != null ? `Expected: ${JSON.stringify(expected)}` : ""}
430
-
431
- Return a score from 0.0 to 1.0 and a brief reason.`
432
- });
433
- return { score: object.score, reason: object.reason };
434
- };
435
- }
436
417
  function factuality(config) {
437
418
  return async ({ input, output, expected }) => {
438
- const { object } = await generateObject({
419
+ const result = await AutoevalsFactuality({
439
420
  model: config.model,
440
- schema: llmScorerSchema,
441
- prompt: `You are a factuality evaluator. Determine whether the output is factually consistent with the expected reference.
442
-
443
- Input: ${JSON.stringify(input)}
444
- Output: ${output}
445
- Expected reference: ${JSON.stringify(expected)}
446
-
447
- Score 1.0 if the output is factually consistent with the reference, 0.0 if it contradicts it. Use intermediate scores for partial consistency.
448
- Return a score from 0.0 to 1.0 and a brief reason.`
421
+ input: typeof input === "string" ? input : JSON.stringify(input),
422
+ output,
423
+ expected: expected == null ? void 0 : String(expected)
449
424
  });
450
- return { score: object.score, reason: object.reason };
425
+ return {
426
+ score: normalizeScore(result.score),
427
+ reason: reasonFromMetadata(result.metadata),
428
+ metadata: result.metadata
429
+ };
451
430
  };
452
431
  }
453
432
  function all(...scorers) {
@@ -623,6 +602,16 @@ var RunStore = class {
623
602
  ).run(id, name, now);
624
603
  return { id, name, created_at: now };
625
604
  }
605
+ getSuite(id) {
606
+ const row = this.#stmt("SELECT * FROM suites WHERE id = ?").get(id);
607
+ return row ?? void 0;
608
+ }
609
+ renameSuite(id, name) {
610
+ this.#stmt("UPDATE suites SET name = ? WHERE id = ?").run(name, id);
611
+ }
612
+ renameRun(id, name) {
613
+ this.#stmt("UPDATE runs SET name = ? WHERE id = ?").run(name, id);
614
+ }
626
615
  createRun(run) {
627
616
  const id = crypto.randomUUID();
628
617
  const now = Date.now();
@@ -1042,7 +1031,8 @@ async function runEval(config) {
1042
1031
  });
1043
1032
  scores[sName] = {
1044
1033
  score: clampScore(sr.score, sName),
1045
- reason: sr.reason
1034
+ reason: sr.reason,
1035
+ metadata: sr.metadata
1046
1036
  };
1047
1037
  }
1048
1038
  trialResults.push({ result, scores });
@@ -1068,7 +1058,8 @@ async function runEval(config) {
1068
1058
  const meanScore = trialResults.reduce((sum, t) => sum + t.scores[sName].score, 0) / trials;
1069
1059
  finalScores[sName] = {
1070
1060
  score: meanScore,
1071
- reason: trialResults[trialResults.length - 1].scores[sName]?.reason
1061
+ reason: trialResults[trialResults.length - 1].scores[sName]?.reason,
1062
+ metadata: trialResults[trialResults.length - 1].scores[sName]?.metadata
1072
1063
  };
1073
1064
  }
1074
1065
  } else {
@@ -1085,7 +1076,8 @@ async function runEval(config) {
1085
1076
  });
1086
1077
  finalScores[sName] = {
1087
1078
  score: clampScore(sr.score, sName),
1088
- reason: sr.reason
1079
+ reason: sr.reason,
1080
+ metadata: sr.metadata
1089
1081
  };
1090
1082
  }
1091
1083
  }
@@ -1455,6 +1447,22 @@ function truncateString(text, maxLength) {
1455
1447
  if (text.length <= maxLength) return text;
1456
1448
  return text.slice(0, maxLength) + "\u2026";
1457
1449
  }
1450
+ function stringifyRationale(value) {
1451
+ if (typeof value === "string") {
1452
+ const trimmed = value.trim();
1453
+ return trimmed.length > 0 ? trimmed : void 0;
1454
+ }
1455
+ if (Array.isArray(value)) {
1456
+ const parts = value.map((item) => typeof item === "string" ? item.trim() : "").filter(Boolean);
1457
+ if (parts.length > 0) return parts.join(" | ");
1458
+ }
1459
+ return void 0;
1460
+ }
1461
+ function scoreReasonWithMetadata(score) {
1462
+ const reason = score.reason?.trim();
1463
+ if (reason) return reason;
1464
+ return stringifyRationale(score.metadata?.["rationale"]);
1465
+ }
1458
1466
  function renderSummaryTable(data) {
1459
1467
  const { summary } = data;
1460
1468
  const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
@@ -1516,7 +1524,8 @@ function renderCaseDetail(c, threshold, options) {
1516
1524
  }
1517
1525
  for (const [name, s] of entries) {
1518
1526
  const scoreColor = s.score >= threshold ? chalk.green : chalk.red;
1519
- const reasonStr = s.reason ? ` \u2014 ${s.reason}` : "";
1527
+ const reason = scoreReasonWithMetadata(s);
1528
+ const reasonStr = reason ? ` \u2014 ${reason}` : "";
1520
1529
  console.log(
1521
1530
  ` ${chalk.dim(name + ":")} ${scoreColor(s.score.toFixed(3))}${reasonStr}`
1522
1531
  );
@@ -2073,7 +2082,6 @@ export {
2073
2082
  jsonMatch,
2074
2083
  jsonReporter,
2075
2084
  levenshtein,
2076
- llmJudge,
2077
2085
  markdownReporter,
2078
2086
  parseRecordSelection,
2079
2087
  pickFromArray,