npm - @deepagents/evals - Versions diffs - 0.19.0 → 0.22.0 - Mend

@deepagents/evals 0.19.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

package/README.md +5 -4
package/dist/dataset/index.d.ts +3 -0
package/dist/dataset/index.d.ts.map +1 -1
package/dist/dataset/index.js +84 -1
package/dist/dataset/index.js.map +3 -3
package/dist/dataset/record-selection.d.ts +8 -0
package/dist/dataset/record-selection.d.ts.map +1 -0
package/dist/engine/index.d.ts.map +1 -1
package/dist/engine/index.js +6 -3
package/dist/engine/index.js.map +2 -2
package/dist/evaluate/index.d.ts +16 -3
package/dist/evaluate/index.d.ts.map +1 -1
package/dist/evaluate/index.js +225 -359
package/dist/evaluate/index.js.map +3 -3
package/dist/index.d.ts +5 -5
package/dist/index.d.ts.map +1 -1
package/dist/index.js +429 -110
package/dist/index.js.map +4 -4
package/dist/reporters/console.d.ts.map +1 -1
package/dist/reporters/csv.d.ts.map +1 -1
package/dist/reporters/html.d.ts.map +1 -1
package/dist/reporters/index.js +129 -36
package/dist/reporters/index.js.map +3 -3
package/dist/reporters/markdown.d.ts.map +1 -1
package/dist/scorers/index.d.ts +2 -6
package/dist/scorers/index.d.ts.map +1 -1
package/dist/scorers/index.js +32 -54
package/dist/scorers/index.js.map +2 -2
package/dist/store/index.d.ts +2 -0
package/dist/store/index.d.ts.map +1 -1
package/dist/store/index.js +22 -0
package/dist/store/index.js.map +2 -2
package/package.json +3 -2

package/dist/index.js CHANGED

@@ -60,6 +60,70 @@ async function fetchPage(url) {
   }
 }
+// packages/evals/src/dataset/record-selection.ts
+function parsePositiveInt(token) {
+  if (!/^\d+$/.test(token)) {
+    throw new Error(`Invalid record token "${token}"`);
+  }
+  const value = Number(token);
+  if (!Number.isInteger(value) || value < 1) {
+    throw new Error(`Record numbers must be >= 1. Received "${token}"`);
+  }
+  return value;
+}
+function parseRecordSelection(spec) {
+  const trimmed = spec.trim();
+  if (!trimmed) {
+    return { indexes: /* @__PURE__ */ new Set(), normalized: "" };
+  }
+  const indexes = /* @__PURE__ */ new Set();
+  const parts = trimmed.split(",").map((part) => part.trim()).filter(Boolean);
+  if (parts.length === 0) {
+    throw new Error("Record selection is empty.");
+  }
+  for (const part of parts) {
+    const rangeMatch = /^(\d+)\s*-\s*(\d+)$/.exec(part);
+    if (rangeMatch) {
+      const start = parsePositiveInt(rangeMatch[1]);
+      const end = parsePositiveInt(rangeMatch[2]);
+      if (end < start) {
+        throw new Error(
+          `Invalid range "${part}". Range end must be >= range start.`
+        );
+      }
+      for (let i = start; i <= end; i++) {
+        indexes.add(i - 1);
+      }
+      continue;
+    }
+    const value = parsePositiveInt(part);
+    indexes.add(value - 1);
+  }
+  return {
+    indexes,
+    normalized: Array.from(indexes).sort((a, b) => a - b).map((i) => String(i + 1)).join(",")
+  };
+}
+function pickFromArray(items, indexes) {
+  if (indexes.size === 0) return items;
+  return items.filter((_, i) => indexes.has(i));
+}
+async function* filterRecordsByIndex(source, indexes) {
+  if (indexes.size === 0) {
+    for await (const item of source) {
+      yield item;
+    }
+    return;
+  }
+  let idx = 0;
+  for await (const item of source) {
+    if (indexes.has(idx)) {
+      yield item;
+    }
+    idx++;
+  }
+}
 // packages/evals/src/dataset/index.ts
 var Dataset = class _Dataset {
   #source;
@@ -128,6 +192,22 @@ var Dataset = class _Dataset {
       }
     });
   }
+  pick(indexes) {
+    const source = this.#source;
+    return new _Dataset(async function* () {
+      if (indexes.size === 0) {
+        yield* source();
+        return;
+      }
+      let idx = 0;
+      for await (const item of source()) {
+        if (indexes.has(idx)) {
+          yield item;
+        }
+        idx++;
+      }
+    });
+  }
   async toArray() {
     const result = [];
     for await (const item of this.#source()) {
@@ -250,8 +330,10 @@ function dataset(source) {
 }
 // packages/evals/src/scorers/index.ts
-import { generateObject } from "ai";
-import { z } from "zod";
+import {
+  Factuality as AutoevalsFactuality,
+  Levenshtein as AutoevalsLevenshtein
+} from "autoevals";
 var exactMatch = async ({ output, expected }) => {
   const exp = expected == null ? "" : String(expected);
   if (output === exp) return { score: 1 };
@@ -273,32 +355,32 @@ function regex(pattern) {
     return { score: pattern.test(output) ? 1 : 0 };
   };
 }
-function levenshteinDistance(a, b) {
-  if (a.length === 0) return b.length;
-  if (b.length === 0) return a.length;
-  if (a.length > b.length) [a, b] = [b, a];
-  let prev = Array.from({ length: a.length + 1 }, (_, i) => i);
-  let curr = new Array(a.length + 1);
-  for (let j = 1; j <= b.length; j++) {
-    curr[0] = j;
-    for (let i = 1; i <= a.length; i++) {
-      const cost = a[i - 1] === b[j - 1] ? 0 : 1;
-      curr[i] = Math.min(prev[i] + 1, curr[i - 1] + 1, prev[i - 1] + cost);
+function normalizeScore(score) {
+  if (typeof score !== "number" || !Number.isFinite(score)) return 0;
+  return Math.max(0, Math.min(1, score));
+}
+function reasonFromMetadata(metadata) {
+  if (!metadata) return void 0;
+  const candidates = [
+    metadata.reason,
+    metadata.rationale,
+    metadata.explanation
+  ];
+  for (const candidate of candidates) {
+    if (typeof candidate === "string" && candidate.trim().length > 0) {
+      return candidate;
     }
-    [prev, curr] = [curr, prev];
   }
-  return prev[a.length];
+  return void 0;
 }
 var levenshtein = async ({ output, expected }) => {
   const exp = expected == null ? "" : String(expected);
-  if (output.length === 0 && exp.length === 0) return { score: 1 };
-  const maxLen = Math.max(output.length, exp.length);
-  const distance = levenshteinDistance(output, exp);
-  const score = Math.max(0, 1 - distance / maxLen);
-  if (score === 1) return { score };
+  const result = await AutoevalsLevenshtein({ output, expected: exp });
+  const score = normalizeScore(result.score);
   return {
     score,
-    reason: `Levenshtein distance is ${distance} across max length ${maxLen}.`
+    reason: reasonFromMetadata(result.metadata),
+    metadata: result.metadata
   };
 };
 function deepEqual(a, b) {
@@ -332,42 +414,19 @@ var jsonMatch = async ({ output, expected }) => {
     return { score: 0, reason: "Failed to parse JSON" };
   }
 };
-var llmScorerSchema = z.object({
-  score: z.number().min(0).max(1),
-  reason: z.string()
-});
-function llmJudge(config) {
-  return async ({ input, output, expected }) => {
-    const { object } = await generateObject({
-      model: config.model,
-      schema: llmScorerSchema,
-      prompt: `You are an expert evaluator. Grade the output based on the following criteria:
-${config.criteria}
-Input: ${JSON.stringify(input)}
-Output: ${output}
-${expected != null ? `Expected: ${JSON.stringify(expected)}` : ""}
-Return a score from 0.0 to 1.0 and a brief reason.`
-    });
-    return { score: object.score, reason: object.reason };
-  };
-}
 function factuality(config) {
   return async ({ input, output, expected }) => {
-    const { object } = await generateObject({
+    const result = await AutoevalsFactuality({
       model: config.model,
-      schema: llmScorerSchema,
-      prompt: `You are a factuality evaluator. Determine whether the output is factually consistent with the expected reference.
-Input: ${JSON.stringify(input)}
-Output: ${output}
-Expected reference: ${JSON.stringify(expected)}
-Score 1.0 if the output is factually consistent with the reference, 0.0 if it contradicts it. Use intermediate scores for partial consistency.
-Return a score from 0.0 to 1.0 and a brief reason.`
+      input: typeof input === "string" ? input : JSON.stringify(input),
+      output,
+      expected: expected == null ? void 0 : String(expected)
     });
-    return { score: object.score, reason: object.reason };
+    return {
+      score: normalizeScore(result.score),
+      reason: reasonFromMetadata(result.metadata),
+      metadata: result.metadata
+    };
   };
 }
 function all(...scorers) {
@@ -720,6 +779,28 @@ var RunStore = class {
       totalTokensOut: totals.totalTokensOut
     };
   }
+  findSuiteByName(name) {
+    const row = this.#stmt(
+      "SELECT * FROM suites WHERE name = ? ORDER BY created_at DESC LIMIT 1"
+    ).get(name);
+    return row ?? void 0;
+  }
+  getLatestCompletedRun(suiteId, model) {
+    const sql = model ? "SELECT * FROM runs WHERE suite_id = ? AND status = ? AND model = ? ORDER BY started_at DESC LIMIT 1" : "SELECT * FROM runs WHERE suite_id = ? AND status = ? ORDER BY started_at DESC LIMIT 1";
+    const row = model ? this.#stmt(sql).get(suiteId, "completed", model) : this.#stmt(sql).get(suiteId, "completed");
+    if (!row) return void 0;
+    return {
+      id: row.id,
+      suite_id: row.suite_id,
+      name: row.name,
+      model: row.model,
+      config: row.config ? JSON.parse(row.config) : null,
+      started_at: row.started_at,
+      finished_at: row.finished_at,
+      status: row.status,
+      summary: row.summary ? JSON.parse(row.summary) : null
+    };
+  }
   listSuites() {
     const rows = this.#stmt(
       "SELECT * FROM suites ORDER BY created_at DESC"
@@ -940,7 +1021,8 @@ async function runEval(config) {
               });
               scores[sName] = {
                 score: clampScore(sr.score, sName),
-                reason: sr.reason
+                reason: sr.reason,
+                metadata: sr.metadata
               };
             }
             trialResults.push({ result, scores });
@@ -966,7 +1048,8 @@ async function runEval(config) {
           const meanScore = trialResults.reduce((sum, t) => sum + t.scores[sName].score, 0) / trials;
           finalScores[sName] = {
             score: meanScore,
-            reason: trialResults[trialResults.length - 1].scores[sName]?.reason
+            reason: trialResults[trialResults.length - 1].scores[sName]?.reason,
+            metadata: trialResults[trialResults.length - 1].scores[sName]?.metadata
           };
         }
       } else {
@@ -983,7 +1066,8 @@ async function runEval(config) {
             });
             finalScores[sName] = {
               score: clampScore(sr.score, sName),
-              reason: sr.reason
+              reason: sr.reason,
+              metadata: sr.metadata
             };
           }
         }
@@ -1283,26 +1367,51 @@ function createRunEndFileReporter(options) {
 }
 // packages/evals/src/reporters/console.ts
+var BAR_WIDTH = 20;
+function renderProgressBar(completed, total, elapsedMs) {
+  const pct = total > 0 ? completed / total : 0;
+  const filled = Math.round(pct * BAR_WIDTH);
+  const bar = "\u2593".repeat(filled) + "\u2591".repeat(BAR_WIDTH - filled);
+  const pctStr = `${(pct * 100).toFixed(0)}%`;
+  return `  ${bar} ${pctStr} (${completed}/${total}) ${formatDuration(elapsedMs)}`;
+}
+function statusLabel(status) {
+  if (status === "pass") return chalk.green("PASS");
+  if (status === "error") return chalk.yellow("ERROR");
+  return chalk.red("FAIL");
+}
 function consoleReporter(options) {
   const verbosity = options?.verbosity ?? "normal";
   let totalCases = 0;
   let completed = 0;
+  let startTime = 0;
   return {
     onRunStart(data) {
       totalCases = data.totalCases;
       completed = 0;
+      startTime = Date.now();
+      if (verbosity !== "quiet") {
+        const label = data.name;
+        console.log("");
+        console.log(
+          `  ${chalk.dim("\u2500\u2500")} ${chalk.bold(label)} ${chalk.dim("\u2500".repeat(Math.max(0, 56 - label.length)))}`
+        );
+        console.log(`  ${chalk.dim(`Running ${data.totalCases} cases...`)}`);
+        console.log("");
+      }
     },
     onCaseEnd() {
       completed++;
       if (verbosity !== "quiet") {
+        const elapsed = Date.now() - startTime;
         process.stdout.write(
-          `\r  ${chalk.dim(`[${completed}/${totalCases}]`)}`
+          `\r${renderProgressBar(completed, totalCases, elapsed)}`
         );
       }
     },
     onRunEnd(data) {
       if (verbosity !== "quiet") {
-        process.stdout.write("\r" + " ".repeat(30) + "\r");
+        process.stdout.write("\r" + " ".repeat(70) + "\r");
       }
       renderSummaryTable(data);
       if (verbosity === "quiet") return;
@@ -1315,19 +1424,7 @@ function consoleReporter(options) {
           });
         }
       } else {
-        const failing = sorted.filter(
-          (c) => getCaseStatus(c, data.threshold) !== "pass"
-        );
-        if (failing.length > 0) {
-          console.log(chalk.dim(`  Failing cases (${failing.length}):`));
-          console.log("");
-          for (const c of failing) {
-            renderCaseDetail(c, data.threshold, {
-              includeIO: true,
-              maxStringLength: 4e3
-            });
-          }
-        }
+        renderFailuresByScorer(sorted, data.threshold);
       }
     }
   };
@@ -1340,40 +1437,69 @@ function truncateString(text, maxLength) {
   if (text.length <= maxLength) return text;
   return text.slice(0, maxLength) + "\u2026";
 }
+function stringifyRationale(value) {
+  if (typeof value === "string") {
+    const trimmed = value.trim();
+    return trimmed.length > 0 ? trimmed : void 0;
+  }
+  if (Array.isArray(value)) {
+    const parts = value.map((item) => typeof item === "string" ? item.trim() : "").filter(Boolean);
+    if (parts.length > 0) return parts.join(" | ");
+  }
+  return void 0;
+}
+function scoreReasonWithMetadata(score) {
+  const reason = score.reason?.trim();
+  if (reason) return reason;
+  return stringifyRationale(score.metadata?.["rationale"]);
+}
 function renderSummaryTable(data) {
   const { summary } = data;
-  const scoreStr = Object.entries(summary.meanScores).map(([name, score]) => `${name}: ${score.toFixed(3)}`).join(", ");
+  const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
   console.log("");
   console.log(chalk.bold("  Summary"));
   console.log(chalk.dim("  " + "\u2500".repeat(60)));
-  console.log(`  ${chalk.dim("Eval:")}     ${data.name}`);
-  console.log(`  ${chalk.dim("Model:")}    ${data.model}`);
-  console.log(`  ${chalk.dim("Cases:")}    ${summary.totalCases}`);
+  console.log(`  ${chalk.dim("Eval:")}      ${data.name}`);
+  console.log(`  ${chalk.dim("Model:")}     ${data.model}`);
+  console.log(`  ${chalk.dim("Threshold:")} ${data.threshold}`);
+  console.log(`  ${chalk.dim("Cases:")}     ${summary.totalCases}`);
   console.log(
-    `  ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))}`
+    `  ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))} ${chalk.dim(`(${passRate}%)`)}`
   );
-  console.log(`  ${chalk.dim("Scores:")}   ${scoreStr}`);
   console.log(
-    `  ${chalk.dim("Duration:")} ${formatDuration(summary.totalLatencyMs)}`
+    `  ${chalk.dim("Duration:")}  ${formatDuration(summary.totalLatencyMs)}`
   );
   console.log(
-    `  ${chalk.dim("Tokens:")}   ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
+    `  ${chalk.dim("Tokens:")}    ${chalk.dim("In:")} ${formatTokens(summary.totalTokensIn)}  ${chalk.dim("Out:")} ${formatTokens(summary.totalTokensOut)}  ${chalk.dim("Total:")} ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
   );
+  const scoreEntries = Object.entries(summary.meanScores);
+  if (scoreEntries.length > 0) {
+    console.log("");
+    console.log(chalk.bold("  Scores"));
+    for (const [name, score] of scoreEntries) {
+      const scoreColor = score >= data.threshold ? chalk.green : chalk.red;
+      console.log(
+        `    ${chalk.dim(name + ":")}${" ".repeat(Math.max(1, 12 - name.length))}${scoreColor(score.toFixed(3))}`
+      );
+    }
+  }
   console.log(chalk.dim("  " + "\u2500".repeat(60)));
   console.log("");
 }
 function renderCaseDetail(c, threshold, options) {
   const entries = Object.entries(c.scores);
-  const failed = entries.some(([, s]) => s.score < threshold);
-  const prefix = failed ? chalk.red("FAIL") : chalk.green("PASS");
+  const status = getCaseStatus(c, threshold);
+  const prefix = statusLabel(status);
   const includeIO = options?.includeIO ?? false;
   const maxStringLength = options?.maxStringLength ?? 4e3;
-  console.log(`  ${prefix} ${chalk.dim(`Case #${c.index}`)}`);
+  const meta = `${chalk.dim(formatDuration(c.latencyMs))}  ${chalk.dim(`${c.tokensIn}/${c.tokensOut} tokens`)}`;
+  console.log(`  ${prefix} ${chalk.dim(`Case #${c.index}`)}  ${meta}`);
   const inputStr = stringifyUnknown(c.input, {
     space: 2,
     fallback: String(c.input)
   });
-  console.log(`    ${chalk.dim("Input:")}  ${inputStr}`);
+  console.log(`    ${chalk.dim("Input:")}`);
+  console.log(indentBlock(truncateString(inputStr, maxStringLength), 6));
   if (includeIO) {
     console.log(`    ${chalk.dim("Output:")}`);
     console.log(indentBlock(truncateString(c.output, maxStringLength), 6));
@@ -1388,7 +1514,8 @@ function renderCaseDetail(c, threshold, options) {
   }
   for (const [name, s] of entries) {
     const scoreColor = s.score >= threshold ? chalk.green : chalk.red;
-    const reasonStr = s.reason ? ` \u2014 ${s.reason}` : "";
+    const reason = scoreReasonWithMetadata(s);
+    const reasonStr = reason ? ` \u2014 ${reason}` : "";
     console.log(
       `    ${chalk.dim(name + ":")} ${scoreColor(s.score.toFixed(3))}${reasonStr}`
     );
@@ -1400,6 +1527,37 @@ function renderCaseDetail(c, threshold, options) {
   }
   console.log("");
 }
+function renderFailuresByScorer(cases, threshold) {
+  const scorerNames = /* @__PURE__ */ new Set();
+  for (const c of cases) {
+    for (const name of Object.keys(c.scores)) {
+      scorerNames.add(name);
+    }
+  }
+  let hasFailures = false;
+  for (const scorer of scorerNames) {
+    const failing = cases.filter((c) => {
+      const s = c.scores[scorer];
+      return s && s.score < threshold || getCaseStatus(c, threshold) === "error";
+    });
+    if (failing.length === 0) continue;
+    if (!hasFailures) {
+      console.log(chalk.dim("  Failing by scorer:"));
+      console.log("");
+      hasFailures = true;
+    }
+    console.log(
+      `  ${chalk.bold(scorer)} ${chalk.dim(`(${failing.length} failures)`)}`
+    );
+    console.log(chalk.dim("  " + "\u2500".repeat(40)));
+    for (const c of failing) {
+      renderCaseDetail(c, threshold, {
+        includeIO: true,
+        maxStringLength: 4e3
+      });
+    }
+  }
+}
 // packages/evals/src/reporters/json.ts
 import { appendFile, mkdir as mkdir2 } from "node:fs/promises";
@@ -1441,6 +1599,7 @@ function csvReporter(options) {
       const scorerNames = Object.keys(data.summary.meanScores);
       const headerParts = [
         "index",
+        "status",
         "input",
         "output",
         "expected",
@@ -1454,8 +1613,10 @@ function csvReporter(options) {
       }
       const rows = [headerParts.join(",")];
       for (const c of data.cases) {
+        const status = getCaseStatus(c, data.threshold);
         const parts = [
           String(c.index),
+          status,
           escapeCsv(c.input),
           escapeCsv(c.output),
           escapeCsv(c.expected),
@@ -1484,15 +1645,17 @@ function markdownReporter(options) {
       const { summary } = data;
       const scorerNames = Object.keys(summary.meanScores);
       const lines = [];
+      const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
       lines.push(`# ${data.name}`);
       lines.push("");
       lines.push(`**Model:** ${data.model}`);
+      lines.push(`**Threshold:** ${data.threshold}`);
       lines.push(
-        `**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail)`
+        `**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail, ${passRate}%)`
       );
       lines.push(`**Duration:** ${formatDuration(summary.totalLatencyMs)}`);
       lines.push(
-        `**Tokens:** ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
+        `**Tokens:** In: ${formatTokens(summary.totalTokensIn)} | Out: ${formatTokens(summary.totalTokensOut)} | Total: ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
       );
       lines.push("");
       lines.push("## Scores");
@@ -1511,6 +1674,7 @@ function markdownReporter(options) {
         "Input",
         ...scorerNames,
         "Latency",
+        "Tokens",
         "Error"
       ];
       lines.push(`| ${caseHeader.join(" | ")} |`);
@@ -1528,7 +1692,8 @@ function markdownReporter(options) {
           status,
           input,
           ...scores,
-          `${c.latencyMs}ms`,
+          formatDuration(c.latencyMs),
+          `${c.tokensIn}/${c.tokensOut}`,
           error
         ];
         lines.push(`| ${row.join(" | ")} |`);
@@ -1553,9 +1718,10 @@ function esc(str) {
 function renderHtml(data) {
   const { summary } = data;
   const scorerNames = Object.keys(summary.meanScores);
+  const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
   const caseRows = data.cases.map((c) => {
     const status = getCaseStatus(c, data.threshold);
-    const statusLabel = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";
+    const statusLabel2 = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";
     const scoresCells = scorerNames.map((name) => {
       const s = c.scores[name];
       const score = s?.score ?? 0;
@@ -1563,13 +1729,19 @@ function renderHtml(data) {
       const reason = s?.reason ? ` title="${esc(s.reason)}"` : "";
       return `<td class="${cls}"${reason}>${score.toFixed(3)}</td>`;
     }).join("");
+    const expectedStr = stringifyUnknown(c.expected, {
+      space: 0,
+      fallback: ""
+    });
     return `<tr class="${status}">
         <td>${c.index}</td>
-        <td class="${status}">${statusLabel}</td>
+        <td class="${status}">${statusLabel2}</td>
         <td class="text">${esc(formatInputValue(c.input).slice(0, 120))}</td>
         <td class="text">${esc(c.output.slice(0, 120))}</td>
+        <td class="text">${esc(expectedStr.slice(0, 120))}</td>
         ${scoresCells}
-        <td>${c.latencyMs}ms</td>
+        <td>${formatDuration(c.latencyMs)}</td>
+        <td>${c.tokensIn}/${c.tokensOut}</td>
         <td class="error-text">${c.error ? esc(formatErrorValue(c.error)) : ""}</td>
       </tr>`;
   }).join("\n");
@@ -1607,11 +1779,14 @@ function renderHtml(data) {
   <h1>${esc(data.name)}</h1>
   <div class="meta">
     <span><strong>Model:</strong> ${esc(data.model)}</span>
+    <span><strong>Threshold:</strong> ${data.threshold}</span>
     <span><strong>Cases:</strong> ${summary.totalCases}</span>
     <span><strong>Pass:</strong> ${summary.passCount}</span>
-    <span><strong>Fail:</strong> ${summary.failCount}</span>
+    <span><strong>Fail:</strong> ${summary.failCount} (${passRate}%)</span>
     <span><strong>Duration:</strong> ${formatDuration(summary.totalLatencyMs)}</span>
-    <span><strong>Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
+    <span><strong>Tokens In:</strong> ${formatTokens(summary.totalTokensIn)}</span>
+    <span><strong>Tokens Out:</strong> ${formatTokens(summary.totalTokensOut)}</span>
+    <span><strong>Total Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
   </div>
   <h2>Mean Scores</h2>
@@ -1628,8 +1803,10 @@ function renderHtml(data) {
         <th>Status</th>
         <th>Input</th>
         <th>Output</th>
+        <th>Expected</th>
         ${scorerHeaders}
         <th>Latency</th>
+        <th>Tokens</th>
         <th>Error</th>
       </tr>
     </thead>
@@ -1642,14 +1819,149 @@ function renderHtml(data) {
 }
 // packages/evals/src/evaluate/index.ts
-async function evaluate(options) {
-  if ("models" in options) {
-    return evaluateEach(options);
+var EvalAssertionError = class extends Error {
+  summary;
+  constructor(summary) {
+    const msg = Array.isArray(summary) ? `Eval assertion failed: ${summary.filter((s) => s.failCount > 0).length} of ${summary.length} model runs have failures` : `Eval assertion failed: ${summary.failCount} of ${summary.totalCases} cases failed`;
+    super(msg);
+    this.name = "EvalAssertionError";
+    this.summary = summary;
+  }
+};
+function resolveFailedIndexes(store, suiteName, model, threshold) {
+  const suite = store.findSuiteByName(suiteName);
+  if (!suite) {
+    console.warn(
+      `No previous suite found for '${suiteName}'. Running all cases.`
+    );
+    return /* @__PURE__ */ new Set();
+  }
+  const run = store.getLatestCompletedRun(suite.id, model);
+  if (!run) {
+    console.warn(
+      `No previous completed run found for '${suiteName}'${model ? ` [${model}]` : ""}. Running all cases.`
+    );
+    return /* @__PURE__ */ new Set();
   }
-  return evaluateSingle(options);
+  const failingCases = store.getFailingCases(run.id, threshold);
+  if (failingCases.length === 0) {
+    console.warn(`No failed cases in previous run. Running all cases.`);
+    return /* @__PURE__ */ new Set();
+  }
+  console.warn(
+    `Retrying ${failingCases.length} failed cases from previous run`
+  );
+  return new Set(failingCases.map((c) => c.idx));
 }
-function resolveStore(store) {
-  return store instanceof RunStore ? store : new RunStore(store);
+var EvalBuilder = class {
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  #options;
+  #selection = { type: "all" };
+  #shouldAssert = false;
+  constructor(options) {
+    this.#options = options;
+  }
+  #setSelection(selection) {
+    if (this.#selection.type !== "all") {
+      throw new Error(
+        `Cannot combine .${this.#selection.type}() with .${selection.type}()`
+      );
+    }
+    this.#selection = selection;
+    return this;
+  }
+  failed() {
+    return this.#setSelection({ type: "failed" });
+  }
+  cases(spec) {
+    const { indexes } = parseRecordSelection(spec);
+    return this.#setSelection({ type: "cases", indexes });
+  }
+  sample(count) {
+    if (count < 1) {
+      throw new Error("Sample count must be >= 1");
+    }
+    return this.#setSelection({ type: "sample", count });
+  }
+  assert() {
+    this.#shouldAssert = true;
+    return this;
+  }
+  then(onfulfilled, onrejected) {
+    return this.#execute().then(onfulfilled, onrejected);
+  }
+  async #execute() {
+    if ("models" in this.#options) {
+      return this.#executeMulti();
+    }
+    return this.#executeSingle();
+  }
+  #applyDatasetFilter(ds) {
+    switch (this.#selection.type) {
+      case "all":
+        return ds;
+      case "cases":
+        return this.#selection.indexes.size > 0 ? filterRecordsByIndex(ds, this.#selection.indexes) : ds;
+      case "sample":
+        return dataset(ds).sample(this.#selection.count);
+      case "failed":
+        return ds;
+    }
+  }
+  async #executeSingle() {
+    const options = this.#options;
+    let ds = options.dataset;
+    if (this.#selection.type === "failed") {
+      const indexes = resolveFailedIndexes(
+        options.store,
+        options.name,
+        options.model,
+        options.threshold
+      );
+      if (indexes.size > 0) {
+        ds = filterRecordsByIndex(ds, indexes);
+      }
+    } else {
+      ds = this.#applyDatasetFilter(ds);
+    }
+    const result = await evaluateSingle({ ...options, dataset: ds });
+    if (this.#shouldAssert && result.failCount > 0) {
+      throw new EvalAssertionError(result);
+    }
+    return result;
+  }
+  async #executeMulti() {
+    const options = this.#options;
+    let result;
+    if (this.#selection.type === "failed") {
+      const perModelIndexes = /* @__PURE__ */ new Map();
+      for (const variant of options.models) {
+        perModelIndexes.set(
+          variant.name,
+          resolveFailedIndexes(
+            options.store,
+            options.name,
+            variant.name,
+            options.threshold
+          )
+        );
+      }
+      result = await evaluateEach(options, perModelIndexes);
+    } else {
+      const filtered = this.#applyDatasetFilter(options.dataset);
+      result = await evaluateEach({ ...options, dataset: filtered });
+    }
+    if (this.#shouldAssert && result.some((s) => s.failCount > 0)) {
+      throw new EvalAssertionError(result);
+    }
+    return result;
+  }
+};
+function evaluate(options) {
+  if ("models" in options) {
+    return new EvalBuilder(options);
+  }
+  return new EvalBuilder(options);
 }
 function wireReporters(reporters) {
   const emitter = new EvalEmitter();
@@ -1682,7 +1994,6 @@ async function notifyRunEnd(reporters, data) {
   await Promise.all(reporters.map((r) => r.onRunEnd?.(data)));
 }
 async function evaluateSingle(options) {
-  const store = resolveStore(options.store);
   const threshold = options.threshold ?? 0.5;
   const { emitter, cases, getRunId } = wireReporters(options.reporters);
   const summary = await runEval({
@@ -1691,7 +2002,7 @@ async function evaluateSingle(options) {
     dataset: options.dataset,
     task: options.task,
     scorers: options.scorers,
-    store,
+    store: options.store,
     emitter,
     suiteId: options.suiteId,
     maxConcurrency: options.maxConcurrency,
@@ -1709,34 +2020,40 @@ async function evaluateSingle(options) {
   });
   return summary;
 }
-async function evaluateEach(options) {
-  const store = resolveStore(options.store);
+async function evaluateEach(options, perModelFailedIndexes) {
   const items = [];
   for await (const item of options.dataset) {
     items.push(item);
   }
-  const suite = store.createSuite(options.name);
+  const suite = options.store.createSuite(options.name);
   return Promise.all(
-    options.models.map(
-      (variant) => evaluateSingle({
+    options.models.map((variant) => {
+      let ds = dataset(items);
+      const failedIndexes = perModelFailedIndexes?.get(variant.name);
+      if (failedIndexes && failedIndexes.size > 0) {
+        ds = filterRecordsByIndex(ds, failedIndexes);
+      }
+      return evaluateSingle({
         name: `${options.name} [${variant.name}]`,
         model: variant.name,
-        dataset: dataset(items),
+        dataset: ds,
         task: (input) => options.task(input, variant),
         scorers: options.scorers,
         reporters: options.reporters,
-        store,
+        store: options.store,
         suiteId: suite.id,
         maxConcurrency: options.maxConcurrency,
         timeout: options.timeout,
         trials: options.trials,
         threshold: options.threshold
-      })
-    )
+      });
+    })
   );
 }
 export {
   Dataset,
+  EvalAssertionError,
+  EvalBuilder,
   EvalEmitter,
   RunStore,
   all,
@@ -1748,14 +2065,16 @@ export {
   evaluate,
   exactMatch,
   factuality,
+  filterRecordsByIndex,
   hf,
   htmlReporter,
   includes,
   jsonMatch,
   jsonReporter,
   levenshtein,
-  llmJudge,
   markdownReporter,
+  parseRecordSelection,
+  pickFromArray,
   regex,
   runEval,
   weighted