npm - @deepagents/evals - Versions diffs - 0.19.0 → 0.20.0 - Mend

@deepagents/evals 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

package/dist/dataset/index.d.ts +3 -0
package/dist/dataset/index.d.ts.map +1 -1
package/dist/dataset/index.js +84 -1
package/dist/dataset/index.js.map +3 -3
package/dist/dataset/record-selection.d.ts +8 -0
package/dist/dataset/record-selection.d.ts.map +1 -0
package/dist/evaluate/index.d.ts +16 -3
package/dist/evaluate/index.d.ts.map +1 -1
package/dist/evaluate/index.js +219 -356
package/dist/evaluate/index.js.map +3 -3
package/dist/index.d.ts +4 -4
package/dist/index.d.ts.map +1 -1
package/dist/index.js +373 -52
package/dist/index.js.map +4 -4
package/dist/reporters/console.d.ts.map +1 -1
package/dist/reporters/csv.d.ts.map +1 -1
package/dist/reporters/html.d.ts.map +1 -1
package/dist/reporters/index.js +111 -35
package/dist/reporters/index.js.map +3 -3
package/dist/reporters/markdown.d.ts.map +1 -1
package/dist/store/index.d.ts +2 -0
package/dist/store/index.d.ts.map +1 -1
package/dist/store/index.js +22 -0
package/dist/store/index.js.map +2 -2
package/package.json +2 -2

package/dist/index.js (changed)

@@ -60,6 +60,70 @@ async function fetchPage(url) {
   }
 }
+// packages/evals/src/dataset/record-selection.ts
+function parsePositiveInt(token) {
+  if (!/^\d+$/.test(token)) {
+    throw new Error(`Invalid record token "${token}"`);
+  }
+  const value = Number(token);
+  if (!Number.isInteger(value) || value < 1) {
+    throw new Error(`Record numbers must be >= 1. Received "${token}"`);
+  }
+  return value;
+}
+function parseRecordSelection(spec) {
+  const trimmed = spec.trim();
+  if (!trimmed) {
+    return { indexes: /* @__PURE__ */ new Set(), normalized: "" };
+  }
+  const indexes = /* @__PURE__ */ new Set();
+  const parts = trimmed.split(",").map((part) => part.trim()).filter(Boolean);
+  if (parts.length === 0) {
+    throw new Error("Record selection is empty.");
+  }
+  for (const part of parts) {
+    const rangeMatch = /^(\d+)\s*-\s*(\d+)$/.exec(part);
+    if (rangeMatch) {
+      const start = parsePositiveInt(rangeMatch[1]);
+      const end = parsePositiveInt(rangeMatch[2]);
+      if (end < start) {
+        throw new Error(
+          `Invalid range "${part}". Range end must be >= range start.`
+        );
+      }
+      for (let i = start; i <= end; i++) {
+        indexes.add(i - 1);
+      }
+      continue;
+    }
+    const value = parsePositiveInt(part);
+    indexes.add(value - 1);
+  }
+  return {
+    indexes,
+    normalized: Array.from(indexes).sort((a, b) => a - b).map((i) => String(i + 1)).join(",")
+  };
+}
+function pickFromArray(items, indexes) {
+  if (indexes.size === 0) return items;
+  return items.filter((_, i) => indexes.has(i));
+}
+async function* filterRecordsByIndex(source, indexes) {
+  if (indexes.size === 0) {
+    for await (const item of source) {
+      yield item;
+    }
+    return;
+  }
+  let idx = 0;
+  for await (const item of source) {
+    if (indexes.has(idx)) {
+      yield item;
+    }
+    idx++;
+  }
+}
 // packages/evals/src/dataset/index.ts
 var Dataset = class _Dataset {
   #source;
@@ -128,6 +192,22 @@ var Dataset = class _Dataset {
       }
     });
   }
+  pick(indexes) {
+    const source = this.#source;
+    return new _Dataset(async function* () {
+      if (indexes.size === 0) {
+        yield* source();
+        return;
+      }
+      let idx = 0;
+      for await (const item of source()) {
+        if (indexes.has(idx)) {
+          yield item;
+        }
+        idx++;
+      }
+    });
+  }
   async toArray() {
     const result = [];
     for await (const item of this.#source()) {
@@ -720,6 +800,28 @@ var RunStore = class {
       totalTokensOut: totals.totalTokensOut
     };
   }
+  findSuiteByName(name) {
+    const row = this.#stmt(
+      "SELECT * FROM suites WHERE name = ? ORDER BY created_at DESC LIMIT 1"
+    ).get(name);
+    return row ?? void 0;
+  }
+  getLatestCompletedRun(suiteId, model) {
+    const sql = model ? "SELECT * FROM runs WHERE suite_id = ? AND status = ? AND model = ? ORDER BY started_at DESC LIMIT 1" : "SELECT * FROM runs WHERE suite_id = ? AND status = ? ORDER BY started_at DESC LIMIT 1";
+    const row = model ? this.#stmt(sql).get(suiteId, "completed", model) : this.#stmt(sql).get(suiteId, "completed");
+    if (!row) return void 0;
+    return {
+      id: row.id,
+      suite_id: row.suite_id,
+      name: row.name,
+      model: row.model,
+      config: row.config ? JSON.parse(row.config) : null,
+      started_at: row.started_at,
+      finished_at: row.finished_at,
+      status: row.status,
+      summary: row.summary ? JSON.parse(row.summary) : null
+    };
+  }
   listSuites() {
     const rows = this.#stmt(
       "SELECT * FROM suites ORDER BY created_at DESC"
@@ -1283,26 +1385,51 @@ function createRunEndFileReporter(options) {
 }
 // packages/evals/src/reporters/console.ts
+var BAR_WIDTH = 20;
+function renderProgressBar(completed, total, elapsedMs) {
+  const pct = total > 0 ? completed / total : 0;
+  const filled = Math.round(pct * BAR_WIDTH);
+  const bar = "\u2593".repeat(filled) + "\u2591".repeat(BAR_WIDTH - filled);
+  const pctStr = `${(pct * 100).toFixed(0)}%`;
+  return `  ${bar} ${pctStr} (${completed}/${total}) ${formatDuration(elapsedMs)}`;
+}
+function statusLabel(status) {
+  if (status === "pass") return chalk.green("PASS");
+  if (status === "error") return chalk.yellow("ERROR");
+  return chalk.red("FAIL");
+}
 function consoleReporter(options) {
   const verbosity = options?.verbosity ?? "normal";
   let totalCases = 0;
   let completed = 0;
+  let startTime = 0;
   return {
     onRunStart(data) {
       totalCases = data.totalCases;
       completed = 0;
+      startTime = Date.now();
+      if (verbosity !== "quiet") {
+        const label = data.name;
+        console.log("");
+        console.log(
+          `  ${chalk.dim("\u2500\u2500")} ${chalk.bold(label)} ${chalk.dim("\u2500".repeat(Math.max(0, 56 - label.length)))}`
+        );
+        console.log(`  ${chalk.dim(`Running ${data.totalCases} cases...`)}`);
+        console.log("");
+      }
     },
     onCaseEnd() {
       completed++;
       if (verbosity !== "quiet") {
+        const elapsed = Date.now() - startTime;
         process.stdout.write(
-          `\r  ${chalk.dim(`[${completed}/${totalCases}]`)}`
+          `\r${renderProgressBar(completed, totalCases, elapsed)}`
         );
       }
     },
     onRunEnd(data) {
       if (verbosity !== "quiet") {
-        process.stdout.write("\r" + " ".repeat(30) + "\r");
+        process.stdout.write("\r" + " ".repeat(70) + "\r");
       }
       renderSummaryTable(data);
       if (verbosity === "quiet") return;
@@ -1315,19 +1442,7 @@ function consoleReporter(options) {
           });
         }
       } else {
-        const failing = sorted.filter(
-          (c) => getCaseStatus(c, data.threshold) !== "pass"
-        );
-        if (failing.length > 0) {
-          console.log(chalk.dim(`  Failing cases (${failing.length}):`));
-          console.log("");
-          for (const c of failing) {
-            renderCaseDetail(c, data.threshold, {
-              includeIO: true,
-              maxStringLength: 4e3
-            });
-          }
-        }
+        renderFailuresByScorer(sorted, data.threshold);
       }
     }
   };
@@ -1342,38 +1457,51 @@ function truncateString(text, maxLength) {
 }
 function renderSummaryTable(data) {
   const { summary } = data;
-  const scoreStr = Object.entries(summary.meanScores).map(([name, score]) => `${name}: ${score.toFixed(3)}`).join(", ");
+  const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
   console.log("");
   console.log(chalk.bold("  Summary"));
   console.log(chalk.dim("  " + "\u2500".repeat(60)));
-  console.log(`  ${chalk.dim("Eval:")}     ${data.name}`);
-  console.log(`  ${chalk.dim("Model:")}    ${data.model}`);
-  console.log(`  ${chalk.dim("Cases:")}    ${summary.totalCases}`);
+  console.log(`  ${chalk.dim("Eval:")}      ${data.name}`);
+  console.log(`  ${chalk.dim("Model:")}     ${data.model}`);
+  console.log(`  ${chalk.dim("Threshold:")} ${data.threshold}`);
+  console.log(`  ${chalk.dim("Cases:")}     ${summary.totalCases}`);
   console.log(
-    `  ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))}`
+    `  ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))} ${chalk.dim(`(${passRate}%)`)}`
   );
-  console.log(`  ${chalk.dim("Scores:")}   ${scoreStr}`);
   console.log(
-    `  ${chalk.dim("Duration:")} ${formatDuration(summary.totalLatencyMs)}`
+    `  ${chalk.dim("Duration:")}  ${formatDuration(summary.totalLatencyMs)}`
   );
   console.log(
-    `  ${chalk.dim("Tokens:")}   ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
+    `  ${chalk.dim("Tokens:")}    ${chalk.dim("In:")} ${formatTokens(summary.totalTokensIn)}  ${chalk.dim("Out:")} ${formatTokens(summary.totalTokensOut)}  ${chalk.dim("Total:")} ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
   );
+  const scoreEntries = Object.entries(summary.meanScores);
+  if (scoreEntries.length > 0) {
+    console.log("");
+    console.log(chalk.bold("  Scores"));
+    for (const [name, score] of scoreEntries) {
+      const scoreColor = score >= data.threshold ? chalk.green : chalk.red;
+      console.log(
+        `    ${chalk.dim(name + ":")}${" ".repeat(Math.max(1, 12 - name.length))}${scoreColor(score.toFixed(3))}`
+      );
+    }
+  }
   console.log(chalk.dim("  " + "\u2500".repeat(60)));
   console.log("");
 }
 function renderCaseDetail(c, threshold, options) {
   const entries = Object.entries(c.scores);
-  const failed = entries.some(([, s]) => s.score < threshold);
-  const prefix = failed ? chalk.red("FAIL") : chalk.green("PASS");
+  const status = getCaseStatus(c, threshold);
+  const prefix = statusLabel(status);
   const includeIO = options?.includeIO ?? false;
   const maxStringLength = options?.maxStringLength ?? 4e3;
-  console.log(`  ${prefix} ${chalk.dim(`Case #${c.index}`)}`);
+  const meta = `${chalk.dim(formatDuration(c.latencyMs))}  ${chalk.dim(`${c.tokensIn}/${c.tokensOut} tokens`)}`;
+  console.log(`  ${prefix} ${chalk.dim(`Case #${c.index}`)}  ${meta}`);
   const inputStr = stringifyUnknown(c.input, {
     space: 2,
     fallback: String(c.input)
   });
-  console.log(`    ${chalk.dim("Input:")}  ${inputStr}`);
+  console.log(`    ${chalk.dim("Input:")}`);
+  console.log(indentBlock(truncateString(inputStr, maxStringLength), 6));
   if (includeIO) {
     console.log(`    ${chalk.dim("Output:")}`);
     console.log(indentBlock(truncateString(c.output, maxStringLength), 6));
@@ -1400,6 +1528,37 @@ function renderCaseDetail(c, threshold, options) {
   }
   console.log("");
 }
+function renderFailuresByScorer(cases, threshold) {
+  const scorerNames = /* @__PURE__ */ new Set();
+  for (const c of cases) {
+    for (const name of Object.keys(c.scores)) {
+      scorerNames.add(name);
+    }
+  }
+  let hasFailures = false;
+  for (const scorer of scorerNames) {
+    const failing = cases.filter((c) => {
+      const s = c.scores[scorer];
+      return s && s.score < threshold || getCaseStatus(c, threshold) === "error";
+    });
+    if (failing.length === 0) continue;
+    if (!hasFailures) {
+      console.log(chalk.dim("  Failing by scorer:"));
+      console.log("");
+      hasFailures = true;
+    }
+    console.log(
+      `  ${chalk.bold(scorer)} ${chalk.dim(`(${failing.length} failures)`)}`
+    );
+    console.log(chalk.dim("  " + "\u2500".repeat(40)));
+    for (const c of failing) {
+      renderCaseDetail(c, threshold, {
+        includeIO: true,
+        maxStringLength: 4e3
+      });
+    }
+  }
+}
 // packages/evals/src/reporters/json.ts
 import { appendFile, mkdir as mkdir2 } from "node:fs/promises";
@@ -1441,6 +1600,7 @@ function csvReporter(options) {
       const scorerNames = Object.keys(data.summary.meanScores);
       const headerParts = [
         "index",
+        "status",
         "input",
         "output",
         "expected",
@@ -1454,8 +1614,10 @@ function csvReporter(options) {
       }
       const rows = [headerParts.join(",")];
       for (const c of data.cases) {
+        const status = getCaseStatus(c, data.threshold);
         const parts = [
           String(c.index),
+          status,
           escapeCsv(c.input),
           escapeCsv(c.output),
           escapeCsv(c.expected),
@@ -1484,15 +1646,17 @@ function markdownReporter(options) {
       const { summary } = data;
       const scorerNames = Object.keys(summary.meanScores);
       const lines = [];
+      const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
       lines.push(`# ${data.name}`);
       lines.push("");
       lines.push(`**Model:** ${data.model}`);
+      lines.push(`**Threshold:** ${data.threshold}`);
       lines.push(
-        `**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail)`
+        `**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail, ${passRate}%)`
       );
       lines.push(`**Duration:** ${formatDuration(summary.totalLatencyMs)}`);
       lines.push(
-        `**Tokens:** ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
+        `**Tokens:** In: ${formatTokens(summary.totalTokensIn)} | Out: ${formatTokens(summary.totalTokensOut)} | Total: ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
       );
       lines.push("");
       lines.push("## Scores");
@@ -1511,6 +1675,7 @@ function markdownReporter(options) {
         "Input",
         ...scorerNames,
         "Latency",
+        "Tokens",
         "Error"
       ];
       lines.push(`| ${caseHeader.join(" | ")} |`);
@@ -1528,7 +1693,8 @@ function markdownReporter(options) {
           status,
           input,
           ...scores,
-          `${c.latencyMs}ms`,
+          formatDuration(c.latencyMs),
+          `${c.tokensIn}/${c.tokensOut}`,
           error
         ];
         lines.push(`| ${row.join(" | ")} |`);
@@ -1553,9 +1719,10 @@ function esc(str) {
 function renderHtml(data) {
   const { summary } = data;
   const scorerNames = Object.keys(summary.meanScores);
+  const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
   const caseRows = data.cases.map((c) => {
     const status = getCaseStatus(c, data.threshold);
-    const statusLabel = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";
+    const statusLabel2 = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";
     const scoresCells = scorerNames.map((name) => {
       const s = c.scores[name];
       const score = s?.score ?? 0;
@@ -1563,13 +1730,19 @@ function renderHtml(data) {
       const reason = s?.reason ? ` title="${esc(s.reason)}"` : "";
       return `<td class="${cls}"${reason}>${score.toFixed(3)}</td>`;
     }).join("");
+    const expectedStr = stringifyUnknown(c.expected, {
+      space: 0,
+      fallback: ""
+    });
     return `<tr class="${status}">
         <td>${c.index}</td>
-        <td class="${status}">${statusLabel}</td>
+        <td class="${status}">${statusLabel2}</td>
         <td class="text">${esc(formatInputValue(c.input).slice(0, 120))}</td>
         <td class="text">${esc(c.output.slice(0, 120))}</td>
+        <td class="text">${esc(expectedStr.slice(0, 120))}</td>
         ${scoresCells}
-        <td>${c.latencyMs}ms</td>
+        <td>${formatDuration(c.latencyMs)}</td>
+        <td>${c.tokensIn}/${c.tokensOut}</td>
         <td class="error-text">${c.error ? esc(formatErrorValue(c.error)) : ""}</td>
       </tr>`;
   }).join("\n");
@@ -1607,11 +1780,14 @@ function renderHtml(data) {
   <h1>${esc(data.name)}</h1>
   <div class="meta">
     <span><strong>Model:</strong> ${esc(data.model)}</span>
+    <span><strong>Threshold:</strong> ${data.threshold}</span>
     <span><strong>Cases:</strong> ${summary.totalCases}</span>
     <span><strong>Pass:</strong> ${summary.passCount}</span>
-    <span><strong>Fail:</strong> ${summary.failCount}</span>
+    <span><strong>Fail:</strong> ${summary.failCount} (${passRate}%)</span>
     <span><strong>Duration:</strong> ${formatDuration(summary.totalLatencyMs)}</span>
-    <span><strong>Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
+    <span><strong>Tokens In:</strong> ${formatTokens(summary.totalTokensIn)}</span>
+    <span><strong>Tokens Out:</strong> ${formatTokens(summary.totalTokensOut)}</span>
+    <span><strong>Total Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
   </div>
   <h2>Mean Scores</h2>
@@ -1628,8 +1804,10 @@ function renderHtml(data) {
         <th>Status</th>
         <th>Input</th>
         <th>Output</th>
+        <th>Expected</th>
         ${scorerHeaders}
         <th>Latency</th>
+        <th>Tokens</th>
         <th>Error</th>
       </tr>
     </thead>
@@ -1642,14 +1820,149 @@ function renderHtml(data) {
 }
 // packages/evals/src/evaluate/index.ts
-async function evaluate(options) {
-  if ("models" in options) {
-    return evaluateEach(options);
+var EvalAssertionError = class extends Error {
+  summary;
+  constructor(summary) {
+    const msg = Array.isArray(summary) ? `Eval assertion failed: ${summary.filter((s) => s.failCount > 0).length} of ${summary.length} model runs have failures` : `Eval assertion failed: ${summary.failCount} of ${summary.totalCases} cases failed`;
+    super(msg);
+    this.name = "EvalAssertionError";
+    this.summary = summary;
+  }
+};
+function resolveFailedIndexes(store, suiteName, model, threshold) {
+  const suite = store.findSuiteByName(suiteName);
+  if (!suite) {
+    console.warn(
+      `No previous suite found for '${suiteName}'. Running all cases.`
+    );
+    return /* @__PURE__ */ new Set();
+  }
+  const run = store.getLatestCompletedRun(suite.id, model);
+  if (!run) {
+    console.warn(
+      `No previous completed run found for '${suiteName}'${model ? ` [${model}]` : ""}. Running all cases.`
+    );
+    return /* @__PURE__ */ new Set();
+  }
+  const failingCases = store.getFailingCases(run.id, threshold);
+  if (failingCases.length === 0) {
+    console.warn(`No failed cases in previous run. Running all cases.`);
+    return /* @__PURE__ */ new Set();
   }
-  return evaluateSingle(options);
+  console.warn(
+    `Retrying ${failingCases.length} failed cases from previous run`
+  );
+  return new Set(failingCases.map((c) => c.idx));
 }
-function resolveStore(store) {
-  return store instanceof RunStore ? store : new RunStore(store);
+var EvalBuilder = class {
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  #options;
+  #selection = { type: "all" };
+  #shouldAssert = false;
+  constructor(options) {
+    this.#options = options;
+  }
+  #setSelection(selection) {
+    if (this.#selection.type !== "all") {
+      throw new Error(
+        `Cannot combine .${this.#selection.type}() with .${selection.type}()`
+      );
+    }
+    this.#selection = selection;
+    return this;
+  }
+  failed() {
+    return this.#setSelection({ type: "failed" });
+  }
+  cases(spec) {
+    const { indexes } = parseRecordSelection(spec);
+    return this.#setSelection({ type: "cases", indexes });
+  }
+  sample(count) {
+    if (count < 1) {
+      throw new Error("Sample count must be >= 1");
+    }
+    return this.#setSelection({ type: "sample", count });
+  }
+  assert() {
+    this.#shouldAssert = true;
+    return this;
+  }
+  then(onfulfilled, onrejected) {
+    return this.#execute().then(onfulfilled, onrejected);
+  }
+  async #execute() {
+    if ("models" in this.#options) {
+      return this.#executeMulti();
+    }
+    return this.#executeSingle();
+  }
+  #applyDatasetFilter(ds) {
+    switch (this.#selection.type) {
+      case "all":
+        return ds;
+      case "cases":
+        return this.#selection.indexes.size > 0 ? filterRecordsByIndex(ds, this.#selection.indexes) : ds;
+      case "sample":
+        return dataset(ds).sample(this.#selection.count);
+      case "failed":
+        return ds;
+    }
+  }
+  async #executeSingle() {
+    const options = this.#options;
+    let ds = options.dataset;
+    if (this.#selection.type === "failed") {
+      const indexes = resolveFailedIndexes(
+        options.store,
+        options.name,
+        options.model,
+        options.threshold
+      );
+      if (indexes.size > 0) {
+        ds = filterRecordsByIndex(ds, indexes);
+      }
+    } else {
+      ds = this.#applyDatasetFilter(ds);
+    }
+    const result = await evaluateSingle({ ...options, dataset: ds });
+    if (this.#shouldAssert && result.failCount > 0) {
+      throw new EvalAssertionError(result);
+    }
+    return result;
+  }
+  async #executeMulti() {
+    const options = this.#options;
+    let result;
+    if (this.#selection.type === "failed") {
+      const perModelIndexes = /* @__PURE__ */ new Map();
+      for (const variant of options.models) {
+        perModelIndexes.set(
+          variant.name,
+          resolveFailedIndexes(
+            options.store,
+            options.name,
+            variant.name,
+            options.threshold
+          )
+        );
+      }
+      result = await evaluateEach(options, perModelIndexes);
+    } else {
+      const filtered = this.#applyDatasetFilter(options.dataset);
+      result = await evaluateEach({ ...options, dataset: filtered });
+    }
+    if (this.#shouldAssert && result.some((s) => s.failCount > 0)) {
+      throw new EvalAssertionError(result);
+    }
+    return result;
+  }
+};
+function evaluate(options) {
+  if ("models" in options) {
+    return new EvalBuilder(options);
+  }
+  return new EvalBuilder(options);
 }
 function wireReporters(reporters) {
   const emitter = new EvalEmitter();
@@ -1682,7 +1995,6 @@ async function notifyRunEnd(reporters, data) {
   await Promise.all(reporters.map((r) => r.onRunEnd?.(data)));
 }
 async function evaluateSingle(options) {
-  const store = resolveStore(options.store);
   const threshold = options.threshold ?? 0.5;
   const { emitter, cases, getRunId } = wireReporters(options.reporters);
   const summary = await runEval({
@@ -1691,7 +2003,7 @@ async function evaluateSingle(options) {
     dataset: options.dataset,
     task: options.task,
     scorers: options.scorers,
-    store,
+    store: options.store,
     emitter,
     suiteId: options.suiteId,
     maxConcurrency: options.maxConcurrency,
@@ -1709,34 +2021,40 @@ async function evaluateSingle(options) {
   });
   return summary;
 }
-async function evaluateEach(options) {
-  const store = resolveStore(options.store);
+async function evaluateEach(options, perModelFailedIndexes) {
   const items = [];
   for await (const item of options.dataset) {
     items.push(item);
   }
-  const suite = store.createSuite(options.name);
+  const suite = options.store.createSuite(options.name);
   return Promise.all(
-    options.models.map(
-      (variant) => evaluateSingle({
+    options.models.map((variant) => {
+      let ds = dataset(items);
+      const failedIndexes = perModelFailedIndexes?.get(variant.name);
+      if (failedIndexes && failedIndexes.size > 0) {
+        ds = filterRecordsByIndex(ds, failedIndexes);
+      }
+      return evaluateSingle({
         name: `${options.name} [${variant.name}]`,
         model: variant.name,
-        dataset: dataset(items),
+        dataset: ds,
         task: (input) => options.task(input, variant),
         scorers: options.scorers,
         reporters: options.reporters,
-        store,
+        store: options.store,
         suiteId: suite.id,
         maxConcurrency: options.maxConcurrency,
         timeout: options.timeout,
         trials: options.trials,
         threshold: options.threshold
-      })
-    )
+      });
+    })
   );
 }
 export {
   Dataset,
+  EvalAssertionError,
+  EvalBuilder,
   EvalEmitter,
   RunStore,
   all,
@@ -1748,6 +2066,7 @@ export {
   evaluate,
   exactMatch,
   factuality,
+  filterRecordsByIndex,
   hf,
   htmlReporter,
   includes,
@@ -1756,6 +2075,8 @@ export {
   levenshtein,
   llmJudge,
   markdownReporter,
+  parseRecordSelection,
+  pickFromArray,
   regex,
   runEval,
   weighted