@deepagents/evals 0.19.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -60,6 +60,70 @@ async function fetchPage(url) {
60
60
  }
61
61
  }
62
62
 
63
+ // packages/evals/src/dataset/record-selection.ts
64
/**
 * Parses a 1-based record number from a string of decimal digits.
 *
 * @param {string} token - Text expected to contain only the digits 0-9.
 * @returns {number} The parsed integer, always >= 1.
 * @throws {Error} If the token contains anything other than digits, is
 *   smaller than 1, or is too large to be represented exactly.
 */
function parsePositiveInt(token) {
  if (!/^\d+$/.test(token)) {
    throw new Error(`Invalid record token "${token}"`);
  }
  const value = Number(token);
  if (!Number.isInteger(value) || value < 1) {
    throw new Error(`Record numbers must be >= 1. Received "${token}"`);
  }
  // Number() silently rounds digit strings beyond 2^53 - 1, which would
  // select a different record than the one the caller typed.
  if (!Number.isSafeInteger(value)) {
    throw new Error(`Record number is too large. Received "${token}"`);
  }
  return value;
}
74
/**
 * Turns a selection spec such as "1, 3-5" into a set of zero-based indexes.
 *
 * Entries are comma separated; each entry is either a single 1-based record
 * number or an inclusive "start-end" range. Blank entries are ignored.
 *
 * @param {string} spec - The raw selection text.
 * @returns {{indexes: Set<number>, normalized: string}} The zero-based
 *   indexes, plus the same selection rendered as sorted, comma-joined
 *   1-based numbers. A blank spec yields an empty set and an empty string.
 * @throws {Error} If the spec holds only separators, an entry is not a valid
 *   record number, or a range ends before it starts.
 */
function parseRecordSelection(spec) {
  const text = spec.trim();
  if (text === "") {
    return { indexes: new Set(), normalized: "" };
  }

  const tokens = [];
  for (const piece of text.split(",")) {
    const token = piece.trim();
    if (token) {
      tokens.push(token);
    }
  }
  if (tokens.length === 0) {
    throw new Error("Record selection is empty.");
  }

  const selected = new Set();
  const rangePattern = /^(\d+)\s*-\s*(\d+)$/;
  for (const token of tokens) {
    const bounds = rangePattern.exec(token);
    if (bounds === null) {
      // Single record number; stored zero-based.
      selected.add(parsePositiveInt(token) - 1);
      continue;
    }
    const first = parsePositiveInt(bounds[1]);
    const last = parsePositiveInt(bounds[2]);
    if (last < first) {
      throw new Error(
        `Invalid range "${token}". Range end must be >= range start.`
      );
    }
    for (let recordNumber = first; recordNumber <= last; recordNumber += 1) {
      selected.add(recordNumber - 1);
    }
  }

  const ascending = [...selected].sort((left, right) => left - right);
  return {
    indexes: selected,
    normalized: ascending.map((index) => String(index + 1)).join(",")
  };
}
107
/**
 * Selects the array elements whose positions appear in `indexes`.
 *
 * @param {Array} items - The array to select from.
 * @param {Set<number>} indexes - Zero-based positions to keep.
 * @returns {Array} The same `items` array when `indexes` is empty, otherwise
 *   a new array holding the selected elements in their original order.
 */
function pickFromArray(items, indexes) {
  if (indexes.size === 0) {
    return items;
  }
  const kept = [];
  items.forEach((item, position) => {
    if (indexes.has(position)) {
      kept.push(item);
    }
  });
  return kept;
}
111
/**
 * Lazily yields the records of `source` whose zero-based position is listed
 * in `indexes`.
 *
 * @param {AsyncIterable|Iterable} source - Records to filter; consumed with
 *   `for await`.
 * @param {Set<number>} indexes - Zero-based positions to keep. An empty set
 *   means "no filter": every record is yielded.
 * @yields The selected records, in source order.
 */
async function* filterRecordsByIndex(source, indexes) {
  if (indexes.size === 0) {
    for await (const item of source) {
      yield item;
    }
    return;
  }
  // Count down the matches still expected so the source is not drained once
  // every requested record has been produced. Returning from inside
  // `for await` also closes the underlying iterator.
  let remaining = indexes.size;
  let position = 0;
  for await (const item of source) {
    if (indexes.has(position)) {
      yield item;
      remaining -= 1;
      if (remaining === 0) {
        return;
      }
    }
    position += 1;
  }
}
126
+
63
127
  // packages/evals/src/dataset/index.ts
64
128
  var Dataset = class _Dataset {
65
129
  #source;
@@ -128,6 +192,22 @@ var Dataset = class _Dataset {
128
192
  }
129
193
  });
130
194
  }
195
+ pick(indexes) {
196
+ const source = this.#source;
197
+ return new _Dataset(async function* () {
198
+ if (indexes.size === 0) {
199
+ yield* source();
200
+ return;
201
+ }
202
+ let idx = 0;
203
+ for await (const item of source()) {
204
+ if (indexes.has(idx)) {
205
+ yield item;
206
+ }
207
+ idx++;
208
+ }
209
+ });
210
+ }
131
211
  async toArray() {
132
212
  const result = [];
133
213
  for await (const item of this.#source()) {
@@ -250,8 +330,10 @@ function dataset(source) {
250
330
  }
251
331
 
252
332
  // packages/evals/src/scorers/index.ts
253
- import { generateObject } from "ai";
254
- import { z } from "zod";
333
+ import {
334
+ Factuality as AutoevalsFactuality,
335
+ Levenshtein as AutoevalsLevenshtein
336
+ } from "autoevals";
255
337
  var exactMatch = async ({ output, expected }) => {
256
338
  const exp = expected == null ? "" : String(expected);
257
339
  if (output === exp) return { score: 1 };
@@ -273,32 +355,32 @@ function regex(pattern) {
273
355
  return { score: pattern.test(output) ? 1 : 0 };
274
356
  };
275
357
  }
276
- function levenshteinDistance(a, b) {
277
- if (a.length === 0) return b.length;
278
- if (b.length === 0) return a.length;
279
- if (a.length > b.length) [a, b] = [b, a];
280
- let prev = Array.from({ length: a.length + 1 }, (_, i) => i);
281
- let curr = new Array(a.length + 1);
282
- for (let j = 1; j <= b.length; j++) {
283
- curr[0] = j;
284
- for (let i = 1; i <= a.length; i++) {
285
- const cost = a[i - 1] === b[j - 1] ? 0 : 1;
286
- curr[i] = Math.min(prev[i] + 1, curr[i - 1] + 1, prev[i - 1] + cost);
358
/**
 * Coerces a scorer result into the closed interval [0, 1].
 *
 * @param {unknown} score - The raw score reported by a scorer.
 * @returns {number} 0 for anything that is not a finite number, otherwise the
 *   score clamped to [0, 1].
 */
function normalizeScore(score) {
  const isUsable = typeof score === "number" && Number.isFinite(score);
  if (!isUsable) {
    return 0;
  }
  const atLeastZero = Math.max(0, score);
  return Math.min(1, atLeastZero);
}
362
+ function reasonFromMetadata(metadata) {
363
+ if (!metadata) return void 0;
364
+ const candidates = [
365
+ metadata.reason,
366
+ metadata.rationale,
367
+ metadata.explanation
368
+ ];
369
+ for (const candidate of candidates) {
370
+ if (typeof candidate === "string" && candidate.trim().length > 0) {
371
+ return candidate;
287
372
  }
288
- [prev, curr] = [curr, prev];
289
373
  }
290
- return prev[a.length];
374
+ return void 0;
291
375
  }
292
376
  var levenshtein = async ({ output, expected }) => {
293
377
  const exp = expected == null ? "" : String(expected);
294
- if (output.length === 0 && exp.length === 0) return { score: 1 };
295
- const maxLen = Math.max(output.length, exp.length);
296
- const distance = levenshteinDistance(output, exp);
297
- const score = Math.max(0, 1 - distance / maxLen);
298
- if (score === 1) return { score };
378
+ const result = await AutoevalsLevenshtein({ output, expected: exp });
379
+ const score = normalizeScore(result.score);
299
380
  return {
300
381
  score,
301
- reason: `Levenshtein distance is ${distance} across max length ${maxLen}.`
382
+ reason: reasonFromMetadata(result.metadata),
383
+ metadata: result.metadata
302
384
  };
303
385
  };
304
386
  function deepEqual(a, b) {
@@ -332,42 +414,19 @@ var jsonMatch = async ({ output, expected }) => {
332
414
  return { score: 0, reason: "Failed to parse JSON" };
333
415
  }
334
416
  };
335
- var llmScorerSchema = z.object({
336
- score: z.number().min(0).max(1),
337
- reason: z.string()
338
- });
339
- function llmJudge(config) {
340
- return async ({ input, output, expected }) => {
341
- const { object } = await generateObject({
342
- model: config.model,
343
- schema: llmScorerSchema,
344
- prompt: `You are an expert evaluator. Grade the output based on the following criteria:
345
- ${config.criteria}
346
-
347
- Input: ${JSON.stringify(input)}
348
- Output: ${output}
349
- ${expected != null ? `Expected: ${JSON.stringify(expected)}` : ""}
350
-
351
- Return a score from 0.0 to 1.0 and a brief reason.`
352
- });
353
- return { score: object.score, reason: object.reason };
354
- };
355
- }
356
417
  function factuality(config) {
357
418
  return async ({ input, output, expected }) => {
358
- const { object } = await generateObject({
419
+ const result = await AutoevalsFactuality({
359
420
  model: config.model,
360
- schema: llmScorerSchema,
361
- prompt: `You are a factuality evaluator. Determine whether the output is factually consistent with the expected reference.
362
-
363
- Input: ${JSON.stringify(input)}
364
- Output: ${output}
365
- Expected reference: ${JSON.stringify(expected)}
366
-
367
- Score 1.0 if the output is factually consistent with the reference, 0.0 if it contradicts it. Use intermediate scores for partial consistency.
368
- Return a score from 0.0 to 1.0 and a brief reason.`
421
+ input: typeof input === "string" ? input : JSON.stringify(input),
422
+ output,
423
+ expected: expected == null ? void 0 : String(expected)
369
424
  });
370
- return { score: object.score, reason: object.reason };
425
+ return {
426
+ score: normalizeScore(result.score),
427
+ reason: reasonFromMetadata(result.metadata),
428
+ metadata: result.metadata
429
+ };
371
430
  };
372
431
  }
373
432
  function all(...scorers) {
@@ -720,6 +779,28 @@ var RunStore = class {
720
779
  totalTokensOut: totals.totalTokensOut
721
780
  };
722
781
  }
782
+ findSuiteByName(name) {
783
+ const row = this.#stmt(
784
+ "SELECT * FROM suites WHERE name = ? ORDER BY created_at DESC LIMIT 1"
785
+ ).get(name);
786
+ return row ?? void 0;
787
+ }
788
+ getLatestCompletedRun(suiteId, model) {
789
+ const sql = model ? "SELECT * FROM runs WHERE suite_id = ? AND status = ? AND model = ? ORDER BY started_at DESC LIMIT 1" : "SELECT * FROM runs WHERE suite_id = ? AND status = ? ORDER BY started_at DESC LIMIT 1";
790
+ const row = model ? this.#stmt(sql).get(suiteId, "completed", model) : this.#stmt(sql).get(suiteId, "completed");
791
+ if (!row) return void 0;
792
+ return {
793
+ id: row.id,
794
+ suite_id: row.suite_id,
795
+ name: row.name,
796
+ model: row.model,
797
+ config: row.config ? JSON.parse(row.config) : null,
798
+ started_at: row.started_at,
799
+ finished_at: row.finished_at,
800
+ status: row.status,
801
+ summary: row.summary ? JSON.parse(row.summary) : null
802
+ };
803
+ }
723
804
  listSuites() {
724
805
  const rows = this.#stmt(
725
806
  "SELECT * FROM suites ORDER BY created_at DESC"
@@ -940,7 +1021,8 @@ async function runEval(config) {
940
1021
  });
941
1022
  scores[sName] = {
942
1023
  score: clampScore(sr.score, sName),
943
- reason: sr.reason
1024
+ reason: sr.reason,
1025
+ metadata: sr.metadata
944
1026
  };
945
1027
  }
946
1028
  trialResults.push({ result, scores });
@@ -966,7 +1048,8 @@ async function runEval(config) {
966
1048
  const meanScore = trialResults.reduce((sum, t) => sum + t.scores[sName].score, 0) / trials;
967
1049
  finalScores[sName] = {
968
1050
  score: meanScore,
969
- reason: trialResults[trialResults.length - 1].scores[sName]?.reason
1051
+ reason: trialResults[trialResults.length - 1].scores[sName]?.reason,
1052
+ metadata: trialResults[trialResults.length - 1].scores[sName]?.metadata
970
1053
  };
971
1054
  }
972
1055
  } else {
@@ -983,7 +1066,8 @@ async function runEval(config) {
983
1066
  });
984
1067
  finalScores[sName] = {
985
1068
  score: clampScore(sr.score, sName),
986
- reason: sr.reason
1069
+ reason: sr.reason,
1070
+ metadata: sr.metadata
987
1071
  };
988
1072
  }
989
1073
  }
@@ -1283,26 +1367,51 @@ function createRunEndFileReporter(options) {
1283
1367
  }
1284
1368
 
1285
1369
  // packages/evals/src/reporters/console.ts
1370
// Number of character cells used to draw the progress bar.
var BAR_WIDTH = 20;

/**
 * Builds the one-line progress indicator: bar, percentage, counts, elapsed.
 *
 * @param {number} completed - Cases finished so far.
 * @param {number} total - Cases expected; 0 or less renders an empty bar.
 * @param {number} elapsedMs - Milliseconds since the run started.
 * @returns {string} The formatted progress line (no carriage return).
 */
function renderProgressBar(completed, total, elapsedMs) {
  const ratio = total > 0 ? completed / total : 0;
  // Clamp so a completed count above the announced total (or a non-finite
  // ratio) cannot produce a negative repeat() count, which throws RangeError.
  const pct = Number.isFinite(ratio) ? Math.min(1, Math.max(0, ratio)) : 0;
  const filled = Math.round(pct * BAR_WIDTH);
  const bar = "\u2593".repeat(filled) + "\u2591".repeat(BAR_WIDTH - filled);
  const pctStr = `${(pct * 100).toFixed(0)}%`;
  return ` ${bar} ${pctStr} (${completed}/${total}) ${formatDuration(elapsedMs)}`;
}
1378
/**
 * Maps a case status to its colored console label.
 *
 * @param {string} status - "pass", "error", or anything else (shown as FAIL).
 * @returns {string} The label wrapped in the matching chalk color.
 */
function statusLabel(status) {
  switch (status) {
    case "pass":
      return chalk.green("PASS");
    case "error":
      return chalk.yellow("ERROR");
    default:
      return chalk.red("FAIL");
  }
}
1286
1383
  function consoleReporter(options) {
1287
1384
  const verbosity = options?.verbosity ?? "normal";
1288
1385
  let totalCases = 0;
1289
1386
  let completed = 0;
1387
+ let startTime = 0;
1290
1388
  return {
1291
1389
  onRunStart(data) {
1292
1390
  totalCases = data.totalCases;
1293
1391
  completed = 0;
1392
+ startTime = Date.now();
1393
+ if (verbosity !== "quiet") {
1394
+ const label = data.name;
1395
+ console.log("");
1396
+ console.log(
1397
+ ` ${chalk.dim("\u2500\u2500")} ${chalk.bold(label)} ${chalk.dim("\u2500".repeat(Math.max(0, 56 - label.length)))}`
1398
+ );
1399
+ console.log(` ${chalk.dim(`Running ${data.totalCases} cases...`)}`);
1400
+ console.log("");
1401
+ }
1294
1402
  },
1295
1403
  onCaseEnd() {
1296
1404
  completed++;
1297
1405
  if (verbosity !== "quiet") {
1406
+ const elapsed = Date.now() - startTime;
1298
1407
  process.stdout.write(
1299
- `\r ${chalk.dim(`[${completed}/${totalCases}]`)}`
1408
+ `\r${renderProgressBar(completed, totalCases, elapsed)}`
1300
1409
  );
1301
1410
  }
1302
1411
  },
1303
1412
  onRunEnd(data) {
1304
1413
  if (verbosity !== "quiet") {
1305
- process.stdout.write("\r" + " ".repeat(30) + "\r");
1414
+ process.stdout.write("\r" + " ".repeat(70) + "\r");
1306
1415
  }
1307
1416
  renderSummaryTable(data);
1308
1417
  if (verbosity === "quiet") return;
@@ -1315,19 +1424,7 @@ function consoleReporter(options) {
1315
1424
  });
1316
1425
  }
1317
1426
  } else {
1318
- const failing = sorted.filter(
1319
- (c) => getCaseStatus(c, data.threshold) !== "pass"
1320
- );
1321
- if (failing.length > 0) {
1322
- console.log(chalk.dim(` Failing cases (${failing.length}):`));
1323
- console.log("");
1324
- for (const c of failing) {
1325
- renderCaseDetail(c, data.threshold, {
1326
- includeIO: true,
1327
- maxStringLength: 4e3
1328
- });
1329
- }
1330
- }
1427
+ renderFailuresByScorer(sorted, data.threshold);
1331
1428
  }
1332
1429
  }
1333
1430
  };
@@ -1340,40 +1437,69 @@ function truncateString(text, maxLength) {
1340
1437
  if (text.length <= maxLength) return text;
1341
1438
  return text.slice(0, maxLength) + "\u2026";
1342
1439
  }
1440
/**
 * Converts a rationale value into display text.
 *
 * @param {unknown} value - A string, an array (only its string entries are
 *   used), or anything else.
 * @returns {string|undefined} The trimmed string; for arrays, the non-blank
 *   trimmed string entries joined with " | "; `undefined` when nothing
 *   printable remains.
 */
function stringifyRationale(value) {
  if (Array.isArray(value)) {
    const pieces = [];
    for (const entry of value) {
      if (typeof entry !== "string") {
        continue;
      }
      const cleaned = entry.trim();
      if (cleaned) {
        pieces.push(cleaned);
      }
    }
    return pieces.length > 0 ? pieces.join(" | ") : undefined;
  }
  if (typeof value !== "string") {
    return undefined;
  }
  const cleaned = value.trim();
  return cleaned.length > 0 ? cleaned : undefined;
}
1451
/**
 * Chooses the explanation text to print next to a score.
 *
 * @param {{reason?: string, metadata?: object}} score - A scorer result.
 * @returns {string|undefined} The trimmed `reason` when it is non-empty,
 *   otherwise the text derived from `metadata.rationale`.
 */
function scoreReasonWithMetadata(score) {
  const explicit = score.reason?.trim();
  if (!explicit) {
    return stringifyRationale(score.metadata?.["rationale"]);
  }
  return explicit;
}
1343
1456
  function renderSummaryTable(data) {
1344
1457
  const { summary } = data;
1345
- const scoreStr = Object.entries(summary.meanScores).map(([name, score]) => `${name}: ${score.toFixed(3)}`).join(", ");
1458
+ const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
1346
1459
  console.log("");
1347
1460
  console.log(chalk.bold(" Summary"));
1348
1461
  console.log(chalk.dim(" " + "\u2500".repeat(60)));
1349
- console.log(` ${chalk.dim("Eval:")} ${data.name}`);
1350
- console.log(` ${chalk.dim("Model:")} ${data.model}`);
1351
- console.log(` ${chalk.dim("Cases:")} ${summary.totalCases}`);
1462
+ console.log(` ${chalk.dim("Eval:")} ${data.name}`);
1463
+ console.log(` ${chalk.dim("Model:")} ${data.model}`);
1464
+ console.log(` ${chalk.dim("Threshold:")} ${data.threshold}`);
1465
+ console.log(` ${chalk.dim("Cases:")} ${summary.totalCases}`);
1352
1466
  console.log(
1353
- ` ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))}`
1467
+ ` ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))} ${chalk.dim(`(${passRate}%)`)}`
1354
1468
  );
1355
- console.log(` ${chalk.dim("Scores:")} ${scoreStr}`);
1356
1469
  console.log(
1357
- ` ${chalk.dim("Duration:")} ${formatDuration(summary.totalLatencyMs)}`
1470
+ ` ${chalk.dim("Duration:")} ${formatDuration(summary.totalLatencyMs)}`
1358
1471
  );
1359
1472
  console.log(
1360
- ` ${chalk.dim("Tokens:")} ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
1473
+ ` ${chalk.dim("Tokens:")} ${chalk.dim("In:")} ${formatTokens(summary.totalTokensIn)} ${chalk.dim("Out:")} ${formatTokens(summary.totalTokensOut)} ${chalk.dim("Total:")} ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
1361
1474
  );
1475
+ const scoreEntries = Object.entries(summary.meanScores);
1476
+ if (scoreEntries.length > 0) {
1477
+ console.log("");
1478
+ console.log(chalk.bold(" Scores"));
1479
+ for (const [name, score] of scoreEntries) {
1480
+ const scoreColor = score >= data.threshold ? chalk.green : chalk.red;
1481
+ console.log(
1482
+ ` ${chalk.dim(name + ":")}${" ".repeat(Math.max(1, 12 - name.length))}${scoreColor(score.toFixed(3))}`
1483
+ );
1484
+ }
1485
+ }
1362
1486
  console.log(chalk.dim(" " + "\u2500".repeat(60)));
1363
1487
  console.log("");
1364
1488
  }
1365
1489
  function renderCaseDetail(c, threshold, options) {
1366
1490
  const entries = Object.entries(c.scores);
1367
- const failed = entries.some(([, s]) => s.score < threshold);
1368
- const prefix = failed ? chalk.red("FAIL") : chalk.green("PASS");
1491
+ const status = getCaseStatus(c, threshold);
1492
+ const prefix = statusLabel(status);
1369
1493
  const includeIO = options?.includeIO ?? false;
1370
1494
  const maxStringLength = options?.maxStringLength ?? 4e3;
1371
- console.log(` ${prefix} ${chalk.dim(`Case #${c.index}`)}`);
1495
+ const meta = `${chalk.dim(formatDuration(c.latencyMs))} ${chalk.dim(`${c.tokensIn}/${c.tokensOut} tokens`)}`;
1496
+ console.log(` ${prefix} ${chalk.dim(`Case #${c.index}`)} ${meta}`);
1372
1497
  const inputStr = stringifyUnknown(c.input, {
1373
1498
  space: 2,
1374
1499
  fallback: String(c.input)
1375
1500
  });
1376
- console.log(` ${chalk.dim("Input:")} ${inputStr}`);
1501
+ console.log(` ${chalk.dim("Input:")}`);
1502
+ console.log(indentBlock(truncateString(inputStr, maxStringLength), 6));
1377
1503
  if (includeIO) {
1378
1504
  console.log(` ${chalk.dim("Output:")}`);
1379
1505
  console.log(indentBlock(truncateString(c.output, maxStringLength), 6));
@@ -1388,7 +1514,8 @@ function renderCaseDetail(c, threshold, options) {
1388
1514
  }
1389
1515
  for (const [name, s] of entries) {
1390
1516
  const scoreColor = s.score >= threshold ? chalk.green : chalk.red;
1391
- const reasonStr = s.reason ? ` \u2014 ${s.reason}` : "";
1517
+ const reason = scoreReasonWithMetadata(s);
1518
+ const reasonStr = reason ? ` \u2014 ${reason}` : "";
1392
1519
  console.log(
1393
1520
  ` ${chalk.dim(name + ":")} ${scoreColor(s.score.toFixed(3))}${reasonStr}`
1394
1521
  );
@@ -1400,6 +1527,37 @@ function renderCaseDetail(c, threshold, options) {
1400
1527
  }
1401
1528
  console.log("");
1402
1529
  }
1530
/**
 * Prints the non-passing cases to the console, grouped by scorer.
 *
 * A case is listed under a scorer when its score for that scorer is below
 * `threshold`, or when the case's overall status is "error" (errored cases
 * therefore appear under every scorer).
 *
 * @param {Array} cases - Case results; each has a `scores` object keyed by
 *   scorer name.
 * @param {number} threshold - Minimum passing score.
 */
function renderFailuresByScorer(cases, threshold) {
  const scorerNames = /* @__PURE__ */ new Set();
  for (const c of cases) {
    for (const name of Object.keys(c.scores)) {
      scorerNames.add(name);
    }
  }
  // With no scorer names at all the grouping loop below has nothing to
  // iterate, which would silently hide errored cases. List them flat instead.
  if (scorerNames.size === 0) {
    const failing = cases.filter(
      (c) => getCaseStatus(c, threshold) !== "pass"
    );
    if (failing.length > 0) {
      console.log(chalk.dim(` Failing cases (${failing.length}):`));
      console.log("");
      for (const c of failing) {
        renderCaseDetail(c, threshold, {
          includeIO: true,
          maxStringLength: 4e3
        });
      }
    }
    return;
  }
  let hasFailures = false;
  for (const scorer of scorerNames) {
    const failing = cases.filter((c) => {
      const s = c.scores[scorer];
      return s && s.score < threshold || getCaseStatus(c, threshold) === "error";
    });
    if (failing.length === 0) continue;
    // Print the section heading once, before the first scorer with failures.
    if (!hasFailures) {
      console.log(chalk.dim(" Failing by scorer:"));
      console.log("");
      hasFailures = true;
    }
    console.log(
      ` ${chalk.bold(scorer)} ${chalk.dim(`(${failing.length} failures)`)}`
    );
    console.log(chalk.dim(" " + "\u2500".repeat(40)));
    for (const c of failing) {
      renderCaseDetail(c, threshold, {
        includeIO: true,
        maxStringLength: 4e3
      });
    }
  }
}
1403
1561
 
1404
1562
  // packages/evals/src/reporters/json.ts
1405
1563
  import { appendFile, mkdir as mkdir2 } from "node:fs/promises";
@@ -1441,6 +1599,7 @@ function csvReporter(options) {
1441
1599
  const scorerNames = Object.keys(data.summary.meanScores);
1442
1600
  const headerParts = [
1443
1601
  "index",
1602
+ "status",
1444
1603
  "input",
1445
1604
  "output",
1446
1605
  "expected",
@@ -1454,8 +1613,10 @@ function csvReporter(options) {
1454
1613
  }
1455
1614
  const rows = [headerParts.join(",")];
1456
1615
  for (const c of data.cases) {
1616
+ const status = getCaseStatus(c, data.threshold);
1457
1617
  const parts = [
1458
1618
  String(c.index),
1619
+ status,
1459
1620
  escapeCsv(c.input),
1460
1621
  escapeCsv(c.output),
1461
1622
  escapeCsv(c.expected),
@@ -1484,15 +1645,17 @@ function markdownReporter(options) {
1484
1645
  const { summary } = data;
1485
1646
  const scorerNames = Object.keys(summary.meanScores);
1486
1647
  const lines = [];
1648
+ const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
1487
1649
  lines.push(`# ${data.name}`);
1488
1650
  lines.push("");
1489
1651
  lines.push(`**Model:** ${data.model}`);
1652
+ lines.push(`**Threshold:** ${data.threshold}`);
1490
1653
  lines.push(
1491
- `**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail)`
1654
+ `**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail, ${passRate}%)`
1492
1655
  );
1493
1656
  lines.push(`**Duration:** ${formatDuration(summary.totalLatencyMs)}`);
1494
1657
  lines.push(
1495
- `**Tokens:** ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
1658
+ `**Tokens:** In: ${formatTokens(summary.totalTokensIn)} | Out: ${formatTokens(summary.totalTokensOut)} | Total: ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
1496
1659
  );
1497
1660
  lines.push("");
1498
1661
  lines.push("## Scores");
@@ -1511,6 +1674,7 @@ function markdownReporter(options) {
1511
1674
  "Input",
1512
1675
  ...scorerNames,
1513
1676
  "Latency",
1677
+ "Tokens",
1514
1678
  "Error"
1515
1679
  ];
1516
1680
  lines.push(`| ${caseHeader.join(" | ")} |`);
@@ -1528,7 +1692,8 @@ function markdownReporter(options) {
1528
1692
  status,
1529
1693
  input,
1530
1694
  ...scores,
1531
- `${c.latencyMs}ms`,
1695
+ formatDuration(c.latencyMs),
1696
+ `${c.tokensIn}/${c.tokensOut}`,
1532
1697
  error
1533
1698
  ];
1534
1699
  lines.push(`| ${row.join(" | ")} |`);
@@ -1553,9 +1718,10 @@ function esc(str) {
1553
1718
  function renderHtml(data) {
1554
1719
  const { summary } = data;
1555
1720
  const scorerNames = Object.keys(summary.meanScores);
1721
+ const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
1556
1722
  const caseRows = data.cases.map((c) => {
1557
1723
  const status = getCaseStatus(c, data.threshold);
1558
- const statusLabel = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";
1724
+ const statusLabel2 = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";
1559
1725
  const scoresCells = scorerNames.map((name) => {
1560
1726
  const s = c.scores[name];
1561
1727
  const score = s?.score ?? 0;
@@ -1563,13 +1729,19 @@ function renderHtml(data) {
1563
1729
  const reason = s?.reason ? ` title="${esc(s.reason)}"` : "";
1564
1730
  return `<td class="${cls}"${reason}>${score.toFixed(3)}</td>`;
1565
1731
  }).join("");
1732
+ const expectedStr = stringifyUnknown(c.expected, {
1733
+ space: 0,
1734
+ fallback: ""
1735
+ });
1566
1736
  return `<tr class="${status}">
1567
1737
  <td>${c.index}</td>
1568
- <td class="${status}">${statusLabel}</td>
1738
+ <td class="${status}">${statusLabel2}</td>
1569
1739
  <td class="text">${esc(formatInputValue(c.input).slice(0, 120))}</td>
1570
1740
  <td class="text">${esc(c.output.slice(0, 120))}</td>
1741
+ <td class="text">${esc(expectedStr.slice(0, 120))}</td>
1571
1742
  ${scoresCells}
1572
- <td>${c.latencyMs}ms</td>
1743
+ <td>${formatDuration(c.latencyMs)}</td>
1744
+ <td>${c.tokensIn}/${c.tokensOut}</td>
1573
1745
  <td class="error-text">${c.error ? esc(formatErrorValue(c.error)) : ""}</td>
1574
1746
  </tr>`;
1575
1747
  }).join("\n");
@@ -1607,11 +1779,14 @@ function renderHtml(data) {
1607
1779
  <h1>${esc(data.name)}</h1>
1608
1780
  <div class="meta">
1609
1781
  <span><strong>Model:</strong> ${esc(data.model)}</span>
1782
+ <span><strong>Threshold:</strong> ${data.threshold}</span>
1610
1783
  <span><strong>Cases:</strong> ${summary.totalCases}</span>
1611
1784
  <span><strong>Pass:</strong> ${summary.passCount}</span>
1612
- <span><strong>Fail:</strong> ${summary.failCount}</span>
1785
+ <span><strong>Fail:</strong> ${summary.failCount} (${passRate}%)</span>
1613
1786
  <span><strong>Duration:</strong> ${formatDuration(summary.totalLatencyMs)}</span>
1614
- <span><strong>Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
1787
+ <span><strong>Tokens In:</strong> ${formatTokens(summary.totalTokensIn)}</span>
1788
+ <span><strong>Tokens Out:</strong> ${formatTokens(summary.totalTokensOut)}</span>
1789
+ <span><strong>Total Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
1615
1790
  </div>
1616
1791
 
1617
1792
  <h2>Mean Scores</h2>
@@ -1628,8 +1803,10 @@ function renderHtml(data) {
1628
1803
  <th>Status</th>
1629
1804
  <th>Input</th>
1630
1805
  <th>Output</th>
1806
+ <th>Expected</th>
1631
1807
  ${scorerHeaders}
1632
1808
  <th>Latency</th>
1809
+ <th>Tokens</th>
1633
1810
  <th>Error</th>
1634
1811
  </tr>
1635
1812
  </thead>
@@ -1642,14 +1819,149 @@ function renderHtml(data) {
1642
1819
  }
1643
1820
 
1644
1821
  // packages/evals/src/evaluate/index.ts
1645
- async function evaluate(options) {
1646
- if ("models" in options) {
1647
- return evaluateEach(options);
1822
/**
 * Error raised when an asserted evaluation contains failures.
 *
 * `summary` holds whatever was passed to the constructor: a single run
 * summary (`failCount`, `totalCases`) or an array of per-model summaries.
 */
var EvalAssertionError = class extends Error {
  summary;
  constructor(summary) {
    let message;
    if (Array.isArray(summary)) {
      let runsWithFailures = 0;
      for (const run of summary) {
        if (run.failCount > 0) {
          runsWithFailures += 1;
        }
      }
      message = `Eval assertion failed: ${runsWithFailures} of ${summary.length} model runs have failures`;
    } else {
      message = `Eval assertion failed: ${summary.failCount} of ${summary.totalCases} cases failed`;
    }
    super(message);
    this.name = "EvalAssertionError";
    this.summary = summary;
  }
};
1831
+ function resolveFailedIndexes(store, suiteName, model, threshold) {
1832
+ const suite = store.findSuiteByName(suiteName);
1833
+ if (!suite) {
1834
+ console.warn(
1835
+ `No previous suite found for '${suiteName}'. Running all cases.`
1836
+ );
1837
+ return /* @__PURE__ */ new Set();
1838
+ }
1839
+ const run = store.getLatestCompletedRun(suite.id, model);
1840
+ if (!run) {
1841
+ console.warn(
1842
+ `No previous completed run found for '${suiteName}'${model ? ` [${model}]` : ""}. Running all cases.`
1843
+ );
1844
+ return /* @__PURE__ */ new Set();
1648
1845
  }
1649
- return evaluateSingle(options);
1846
+ const failingCases = store.getFailingCases(run.id, threshold);
1847
+ if (failingCases.length === 0) {
1848
+ console.warn(`No failed cases in previous run. Running all cases.`);
1849
+ return /* @__PURE__ */ new Set();
1850
+ }
1851
+ console.warn(
1852
+ `Retrying ${failingCases.length} failed cases from previous run`
1853
+ );
1854
+ return new Set(failingCases.map((c) => c.idx));
1650
1855
  }
1651
- function resolveStore(store) {
1652
- return store instanceof RunStore ? store : new RunStore(store);
1856
/**
 * Fluent, awaitable description of an evaluation run.
 *
 * Chain at most one selection method (`failed()`, `cases()`, `sample()`),
 * optionally `assert()`, then `await` the builder. The evaluation starts
 * when `then()` is invoked, and every `then()` call starts a new run.
 */
var EvalBuilder = class {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  #options;
  // Which records to evaluate; "all" until a selection method is called.
  #selection = { type: "all" };
  // When true, a run with failures rejects with EvalAssertionError.
  #shouldAssert = false;

  /**
   * @param options - Evaluation options; the presence of a `models` key
   *   switches execution to one run per model.
   */
  constructor(options) {
    this.#options = options;
  }

  /**
   * Records the selection, rejecting a second selection on the same builder.
   * @throws {Error} If a selection other than "all" is already set.
   */
  #setSelection(selection) {
    if (this.#selection.type !== "all") {
      throw new Error(
        `Cannot combine .${this.#selection.type}() with .${selection.type}()`
      );
    }
    this.#selection = selection;
    return this;
  }

  /** Re-runs only the cases that failed in the latest completed run. */
  failed() {
    return this.#setSelection({ type: "failed" });
  }

  /**
   * Restricts the run to the records named by a spec such as "1,3-5".
   * @param {string} spec - Selection text parsed by `parseRecordSelection`.
   */
  cases(spec) {
    const { indexes } = parseRecordSelection(spec);
    return this.#setSelection({ type: "cases", indexes });
  }

  /**
   * Restricts the run to a sample of `count` records.
   * @param {number} count - Integer number of records, at least 1.
   * @throws {Error} If `count` is below 1 or is not an integer.
   */
  sample(count) {
    if (count < 1) {
      throw new Error("Sample count must be >= 1");
    }
    // `NaN < 1` is false, so NaN and fractional counts need their own check.
    if (!Number.isInteger(count)) {
      throw new Error(`Sample count must be an integer. Received "${count}"`);
    }
    return this.#setSelection({ type: "sample", count });
  }

  /** Makes the awaited run reject when any case fails. */
  assert() {
    this.#shouldAssert = true;
    return this;
  }

  /**
   * Thenable hook: executes the evaluation and forwards its outcome.
   * Each call triggers a fresh execution.
   */
  then(onfulfilled, onrejected) {
    return this.#execute().then(onfulfilled, onrejected);
  }

  /** Dispatches to the multi-model or single-model path. */
  async #execute() {
    if ("models" in this.#options) {
      return this.#executeMulti();
    }
    return this.#executeSingle();
  }

  /**
   * Applies the "cases" or "sample" selection to a dataset. "all" and
   * "failed" return the dataset untouched ("failed" is resolved by the
   * execute methods, which need the store).
   */
  #applyDatasetFilter(ds) {
    switch (this.#selection.type) {
      case "all":
        return ds;
      case "cases":
        return this.#selection.indexes.size > 0 ? filterRecordsByIndex(ds, this.#selection.indexes) : ds;
      case "sample":
        return dataset(ds).sample(this.#selection.count);
      case "failed":
        return ds;
    }
  }

  /**
   * Runs a single-model evaluation.
   * @throws {EvalAssertionError} When asserting and the run has failures.
   */
  async #executeSingle() {
    const options = this.#options;
    let ds = options.dataset;
    if (this.#selection.type === "failed") {
      // NOTE(review): options.threshold is passed through as-is, while
      // evaluateSingle falls back to 0.5 when it is undefined — confirm
      // store.getFailingCases applies a matching default.
      const indexes = resolveFailedIndexes(
        options.store,
        options.name,
        options.model,
        options.threshold
      );
      // An empty set means "nothing to retry": the full dataset is run.
      if (indexes.size > 0) {
        ds = filterRecordsByIndex(ds, indexes);
      }
    } else {
      ds = this.#applyDatasetFilter(ds);
    }
    const result = await evaluateSingle({ ...options, dataset: ds });
    if (this.#shouldAssert && result.failCount > 0) {
      throw new EvalAssertionError(result);
    }
    return result;
  }

  /**
   * Runs one evaluation per entry in `options.models`.
   * @throws {EvalAssertionError} When asserting and any run has failures.
   */
  async #executeMulti() {
    const options = this.#options;
    let result;
    if (this.#selection.type === "failed") {
      // Failed cases are looked up separately for every model variant.
      const perModelIndexes = /* @__PURE__ */ new Map();
      for (const variant of options.models) {
        perModelIndexes.set(
          variant.name,
          resolveFailedIndexes(
            options.store,
            options.name,
            variant.name,
            options.threshold
          )
        );
      }
      result = await evaluateEach(options, perModelIndexes);
    } else {
      const filtered = this.#applyDatasetFilter(options.dataset);
      result = await evaluateEach({ ...options, dataset: filtered });
    }
    if (this.#shouldAssert && result.some((s) => s.failCount > 0)) {
      throw new EvalAssertionError(result);
    }
    return result;
  }
};
1960
/**
 * Entry point for running an evaluation.
 *
 * Returns a builder that can be refined with selection methods and is
 * executed when awaited. The same builder type serves single-model options
 * and multi-model options (those carrying a `models` key), so no branching
 * is needed here.
 *
 * @param {object} options - Evaluation options.
 * @returns {EvalBuilder} An awaitable builder for the run.
 * @throws {TypeError} If `options` is not an object.
 */
function evaluate(options) {
  // Fail fast on a missing options object instead of deferring the error
  // until the builder is awaited.
  const isObjectLike = options != null && (typeof options === "object" || typeof options === "function");
  if (!isObjectLike) {
    throw new TypeError("evaluate() requires an options object");
  }
  return new EvalBuilder(options);
}
1654
1966
  function wireReporters(reporters) {
1655
1967
  const emitter = new EvalEmitter();
@@ -1682,7 +1994,6 @@ async function notifyRunEnd(reporters, data) {
1682
1994
  await Promise.all(reporters.map((r) => r.onRunEnd?.(data)));
1683
1995
  }
1684
1996
  async function evaluateSingle(options) {
1685
- const store = resolveStore(options.store);
1686
1997
  const threshold = options.threshold ?? 0.5;
1687
1998
  const { emitter, cases, getRunId } = wireReporters(options.reporters);
1688
1999
  const summary = await runEval({
@@ -1691,7 +2002,7 @@ async function evaluateSingle(options) {
1691
2002
  dataset: options.dataset,
1692
2003
  task: options.task,
1693
2004
  scorers: options.scorers,
1694
- store,
2005
+ store: options.store,
1695
2006
  emitter,
1696
2007
  suiteId: options.suiteId,
1697
2008
  maxConcurrency: options.maxConcurrency,
@@ -1709,34 +2020,40 @@ async function evaluateSingle(options) {
1709
2020
  });
1710
2021
  return summary;
1711
2022
  }
1712
- async function evaluateEach(options) {
1713
- const store = resolveStore(options.store);
2023
+ async function evaluateEach(options, perModelFailedIndexes) {
1714
2024
  const items = [];
1715
2025
  for await (const item of options.dataset) {
1716
2026
  items.push(item);
1717
2027
  }
1718
- const suite = store.createSuite(options.name);
2028
+ const suite = options.store.createSuite(options.name);
1719
2029
  return Promise.all(
1720
- options.models.map(
1721
- (variant) => evaluateSingle({
2030
+ options.models.map((variant) => {
2031
+ let ds = dataset(items);
2032
+ const failedIndexes = perModelFailedIndexes?.get(variant.name);
2033
+ if (failedIndexes && failedIndexes.size > 0) {
2034
+ ds = filterRecordsByIndex(ds, failedIndexes);
2035
+ }
2036
+ return evaluateSingle({
1722
2037
  name: `${options.name} [${variant.name}]`,
1723
2038
  model: variant.name,
1724
- dataset: dataset(items),
2039
+ dataset: ds,
1725
2040
  task: (input) => options.task(input, variant),
1726
2041
  scorers: options.scorers,
1727
2042
  reporters: options.reporters,
1728
- store,
2043
+ store: options.store,
1729
2044
  suiteId: suite.id,
1730
2045
  maxConcurrency: options.maxConcurrency,
1731
2046
  timeout: options.timeout,
1732
2047
  trials: options.trials,
1733
2048
  threshold: options.threshold
1734
- })
1735
- )
2049
+ });
2050
+ })
1736
2051
  );
1737
2052
  }
1738
2053
  export {
1739
2054
  Dataset,
2055
+ EvalAssertionError,
2056
+ EvalBuilder,
1740
2057
  EvalEmitter,
1741
2058
  RunStore,
1742
2059
  all,
@@ -1748,14 +2065,16 @@ export {
1748
2065
  evaluate,
1749
2066
  exactMatch,
1750
2067
  factuality,
2068
+ filterRecordsByIndex,
1751
2069
  hf,
1752
2070
  htmlReporter,
1753
2071
  includes,
1754
2072
  jsonMatch,
1755
2073
  jsonReporter,
1756
2074
  levenshtein,
1757
- llmJudge,
1758
2075
  markdownReporter,
2076
+ parseRecordSelection,
2077
+ pickFromArray,
1759
2078
  regex,
1760
2079
  runEval,
1761
2080
  weighted