@deepagents/evals 0.19.0 → 0.20.0

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -60,6 +60,70 @@ async function fetchPage(url) {
60
60
  }
61
61
  }
62
62
 
63
+ // packages/evals/src/dataset/record-selection.ts
64
+ function parsePositiveInt(token) {
65
+ if (!/^\d+$/.test(token)) {
66
+ throw new Error(`Invalid record token "${token}"`);
67
+ }
68
+ const value = Number(token);
69
+ if (!Number.isInteger(value) || value < 1) {
70
+ throw new Error(`Record numbers must be >= 1. Received "${token}"`);
71
+ }
72
+ return value;
73
+ }
74
+ function parseRecordSelection(spec) {
75
+ const trimmed = spec.trim();
76
+ if (!trimmed) {
77
+ return { indexes: /* @__PURE__ */ new Set(), normalized: "" };
78
+ }
79
+ const indexes = /* @__PURE__ */ new Set();
80
+ const parts = trimmed.split(",").map((part) => part.trim()).filter(Boolean);
81
+ if (parts.length === 0) {
82
+ throw new Error("Record selection is empty.");
83
+ }
84
+ for (const part of parts) {
85
+ const rangeMatch = /^(\d+)\s*-\s*(\d+)$/.exec(part);
86
+ if (rangeMatch) {
87
+ const start = parsePositiveInt(rangeMatch[1]);
88
+ const end = parsePositiveInt(rangeMatch[2]);
89
+ if (end < start) {
90
+ throw new Error(
91
+ `Invalid range "${part}". Range end must be >= range start.`
92
+ );
93
+ }
94
+ for (let i = start; i <= end; i++) {
95
+ indexes.add(i - 1);
96
+ }
97
+ continue;
98
+ }
99
+ const value = parsePositiveInt(part);
100
+ indexes.add(value - 1);
101
+ }
102
+ return {
103
+ indexes,
104
+ normalized: Array.from(indexes).sort((a, b) => a - b).map((i) => String(i + 1)).join(",")
105
+ };
106
+ }
107
+ function pickFromArray(items, indexes) {
108
+ if (indexes.size === 0) return items;
109
+ return items.filter((_, i) => indexes.has(i));
110
+ }
111
+ async function* filterRecordsByIndex(source, indexes) {
112
+ if (indexes.size === 0) {
113
+ for await (const item of source) {
114
+ yield item;
115
+ }
116
+ return;
117
+ }
118
+ let idx = 0;
119
+ for await (const item of source) {
120
+ if (indexes.has(idx)) {
121
+ yield item;
122
+ }
123
+ idx++;
124
+ }
125
+ }
126
+
63
127
  // packages/evals/src/dataset/index.ts
64
128
  var Dataset = class _Dataset {
65
129
  #source;
@@ -128,6 +192,22 @@ var Dataset = class _Dataset {
128
192
  }
129
193
  });
130
194
  }
195
+ pick(indexes) {
196
+ const source = this.#source;
197
+ return new _Dataset(async function* () {
198
+ if (indexes.size === 0) {
199
+ yield* source();
200
+ return;
201
+ }
202
+ let idx = 0;
203
+ for await (const item of source()) {
204
+ if (indexes.has(idx)) {
205
+ yield item;
206
+ }
207
+ idx++;
208
+ }
209
+ });
210
+ }
131
211
  async toArray() {
132
212
  const result = [];
133
213
  for await (const item of this.#source()) {
@@ -720,6 +800,28 @@ var RunStore = class {
720
800
  totalTokensOut: totals.totalTokensOut
721
801
  };
722
802
  }
803
+ findSuiteByName(name) {
804
+ const row = this.#stmt(
805
+ "SELECT * FROM suites WHERE name = ? ORDER BY created_at DESC LIMIT 1"
806
+ ).get(name);
807
+ return row ?? void 0;
808
+ }
809
+ getLatestCompletedRun(suiteId, model) {
810
+ const sql = model ? "SELECT * FROM runs WHERE suite_id = ? AND status = ? AND model = ? ORDER BY started_at DESC LIMIT 1" : "SELECT * FROM runs WHERE suite_id = ? AND status = ? ORDER BY started_at DESC LIMIT 1";
811
+ const row = model ? this.#stmt(sql).get(suiteId, "completed", model) : this.#stmt(sql).get(suiteId, "completed");
812
+ if (!row) return void 0;
813
+ return {
814
+ id: row.id,
815
+ suite_id: row.suite_id,
816
+ name: row.name,
817
+ model: row.model,
818
+ config: row.config ? JSON.parse(row.config) : null,
819
+ started_at: row.started_at,
820
+ finished_at: row.finished_at,
821
+ status: row.status,
822
+ summary: row.summary ? JSON.parse(row.summary) : null
823
+ };
824
+ }
723
825
  listSuites() {
724
826
  const rows = this.#stmt(
725
827
  "SELECT * FROM suites ORDER BY created_at DESC"
@@ -1283,26 +1385,51 @@ function createRunEndFileReporter(options) {
1283
1385
  }
1284
1386
 
1285
1387
  // packages/evals/src/reporters/console.ts
1388
+ var BAR_WIDTH = 20;
1389
+ function renderProgressBar(completed, total, elapsedMs) {
1390
+ const pct = total > 0 ? completed / total : 0;
1391
+ const filled = Math.round(pct * BAR_WIDTH);
1392
+ const bar = "\u2593".repeat(filled) + "\u2591".repeat(BAR_WIDTH - filled);
1393
+ const pctStr = `${(pct * 100).toFixed(0)}%`;
1394
+ return ` ${bar} ${pctStr} (${completed}/${total}) ${formatDuration(elapsedMs)}`;
1395
+ }
1396
+ function statusLabel(status) {
1397
+ if (status === "pass") return chalk.green("PASS");
1398
+ if (status === "error") return chalk.yellow("ERROR");
1399
+ return chalk.red("FAIL");
1400
+ }
1286
1401
  function consoleReporter(options) {
1287
1402
  const verbosity = options?.verbosity ?? "normal";
1288
1403
  let totalCases = 0;
1289
1404
  let completed = 0;
1405
+ let startTime = 0;
1290
1406
  return {
1291
1407
  onRunStart(data) {
1292
1408
  totalCases = data.totalCases;
1293
1409
  completed = 0;
1410
+ startTime = Date.now();
1411
+ if (verbosity !== "quiet") {
1412
+ const label = data.name;
1413
+ console.log("");
1414
+ console.log(
1415
+ ` ${chalk.dim("\u2500\u2500")} ${chalk.bold(label)} ${chalk.dim("\u2500".repeat(Math.max(0, 56 - label.length)))}`
1416
+ );
1417
+ console.log(` ${chalk.dim(`Running ${data.totalCases} cases...`)}`);
1418
+ console.log("");
1419
+ }
1294
1420
  },
1295
1421
  onCaseEnd() {
1296
1422
  completed++;
1297
1423
  if (verbosity !== "quiet") {
1424
+ const elapsed = Date.now() - startTime;
1298
1425
  process.stdout.write(
1299
- `\r ${chalk.dim(`[${completed}/${totalCases}]`)}`
1426
+ `\r${renderProgressBar(completed, totalCases, elapsed)}`
1300
1427
  );
1301
1428
  }
1302
1429
  },
1303
1430
  onRunEnd(data) {
1304
1431
  if (verbosity !== "quiet") {
1305
- process.stdout.write("\r" + " ".repeat(30) + "\r");
1432
+ process.stdout.write("\r" + " ".repeat(70) + "\r");
1306
1433
  }
1307
1434
  renderSummaryTable(data);
1308
1435
  if (verbosity === "quiet") return;
@@ -1315,19 +1442,7 @@ function consoleReporter(options) {
1315
1442
  });
1316
1443
  }
1317
1444
  } else {
1318
- const failing = sorted.filter(
1319
- (c) => getCaseStatus(c, data.threshold) !== "pass"
1320
- );
1321
- if (failing.length > 0) {
1322
- console.log(chalk.dim(` Failing cases (${failing.length}):`));
1323
- console.log("");
1324
- for (const c of failing) {
1325
- renderCaseDetail(c, data.threshold, {
1326
- includeIO: true,
1327
- maxStringLength: 4e3
1328
- });
1329
- }
1330
- }
1445
+ renderFailuresByScorer(sorted, data.threshold);
1331
1446
  }
1332
1447
  }
1333
1448
  };
@@ -1342,38 +1457,51 @@ function truncateString(text, maxLength) {
1342
1457
  }
1343
1458
  function renderSummaryTable(data) {
1344
1459
  const { summary } = data;
1345
- const scoreStr = Object.entries(summary.meanScores).map(([name, score]) => `${name}: ${score.toFixed(3)}`).join(", ");
1460
+ const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
1346
1461
  console.log("");
1347
1462
  console.log(chalk.bold(" Summary"));
1348
1463
  console.log(chalk.dim(" " + "\u2500".repeat(60)));
1349
- console.log(` ${chalk.dim("Eval:")} ${data.name}`);
1350
- console.log(` ${chalk.dim("Model:")} ${data.model}`);
1351
- console.log(` ${chalk.dim("Cases:")} ${summary.totalCases}`);
1464
+ console.log(` ${chalk.dim("Eval:")} ${data.name}`);
1465
+ console.log(` ${chalk.dim("Model:")} ${data.model}`);
1466
+ console.log(` ${chalk.dim("Threshold:")} ${data.threshold}`);
1467
+ console.log(` ${chalk.dim("Cases:")} ${summary.totalCases}`);
1352
1468
  console.log(
1353
- ` ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))}`
1469
+ ` ${chalk.dim("Pass/Fail:")} ${chalk.green(String(summary.passCount))} / ${chalk.red(String(summary.failCount))} ${chalk.dim(`(${passRate}%)`)}`
1354
1470
  );
1355
- console.log(` ${chalk.dim("Scores:")} ${scoreStr}`);
1356
1471
  console.log(
1357
- ` ${chalk.dim("Duration:")} ${formatDuration(summary.totalLatencyMs)}`
1472
+ ` ${chalk.dim("Duration:")} ${formatDuration(summary.totalLatencyMs)}`
1358
1473
  );
1359
1474
  console.log(
1360
- ` ${chalk.dim("Tokens:")} ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
1475
+ ` ${chalk.dim("Tokens:")} ${chalk.dim("In:")} ${formatTokens(summary.totalTokensIn)} ${chalk.dim("Out:")} ${formatTokens(summary.totalTokensOut)} ${chalk.dim("Total:")} ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
1361
1476
  );
1477
+ const scoreEntries = Object.entries(summary.meanScores);
1478
+ if (scoreEntries.length > 0) {
1479
+ console.log("");
1480
+ console.log(chalk.bold(" Scores"));
1481
+ for (const [name, score] of scoreEntries) {
1482
+ const scoreColor = score >= data.threshold ? chalk.green : chalk.red;
1483
+ console.log(
1484
+ ` ${chalk.dim(name + ":")}${" ".repeat(Math.max(1, 12 - name.length))}${scoreColor(score.toFixed(3))}`
1485
+ );
1486
+ }
1487
+ }
1362
1488
  console.log(chalk.dim(" " + "\u2500".repeat(60)));
1363
1489
  console.log("");
1364
1490
  }
1365
1491
  function renderCaseDetail(c, threshold, options) {
1366
1492
  const entries = Object.entries(c.scores);
1367
- const failed = entries.some(([, s]) => s.score < threshold);
1368
- const prefix = failed ? chalk.red("FAIL") : chalk.green("PASS");
1493
+ const status = getCaseStatus(c, threshold);
1494
+ const prefix = statusLabel(status);
1369
1495
  const includeIO = options?.includeIO ?? false;
1370
1496
  const maxStringLength = options?.maxStringLength ?? 4e3;
1371
- console.log(` ${prefix} ${chalk.dim(`Case #${c.index}`)}`);
1497
+ const meta = `${chalk.dim(formatDuration(c.latencyMs))} ${chalk.dim(`${c.tokensIn}/${c.tokensOut} tokens`)}`;
1498
+ console.log(` ${prefix} ${chalk.dim(`Case #${c.index}`)} ${meta}`);
1372
1499
  const inputStr = stringifyUnknown(c.input, {
1373
1500
  space: 2,
1374
1501
  fallback: String(c.input)
1375
1502
  });
1376
- console.log(` ${chalk.dim("Input:")} ${inputStr}`);
1503
+ console.log(` ${chalk.dim("Input:")}`);
1504
+ console.log(indentBlock(truncateString(inputStr, maxStringLength), 6));
1377
1505
  if (includeIO) {
1378
1506
  console.log(` ${chalk.dim("Output:")}`);
1379
1507
  console.log(indentBlock(truncateString(c.output, maxStringLength), 6));
@@ -1400,6 +1528,37 @@ function renderCaseDetail(c, threshold, options) {
1400
1528
  }
1401
1529
  console.log("");
1402
1530
  }
1531
+ function renderFailuresByScorer(cases, threshold) {
1532
+ const scorerNames = /* @__PURE__ */ new Set();
1533
+ for (const c of cases) {
1534
+ for (const name of Object.keys(c.scores)) {
1535
+ scorerNames.add(name);
1536
+ }
1537
+ }
1538
+ let hasFailures = false;
1539
+ for (const scorer of scorerNames) {
1540
+ const failing = cases.filter((c) => {
1541
+ const s = c.scores[scorer];
1542
+ return s && s.score < threshold || getCaseStatus(c, threshold) === "error";
1543
+ });
1544
+ if (failing.length === 0) continue;
1545
+ if (!hasFailures) {
1546
+ console.log(chalk.dim(" Failing by scorer:"));
1547
+ console.log("");
1548
+ hasFailures = true;
1549
+ }
1550
+ console.log(
1551
+ ` ${chalk.bold(scorer)} ${chalk.dim(`(${failing.length} failures)`)}`
1552
+ );
1553
+ console.log(chalk.dim(" " + "\u2500".repeat(40)));
1554
+ for (const c of failing) {
1555
+ renderCaseDetail(c, threshold, {
1556
+ includeIO: true,
1557
+ maxStringLength: 4e3
1558
+ });
1559
+ }
1560
+ }
1561
+ }
1403
1562
 
1404
1563
  // packages/evals/src/reporters/json.ts
1405
1564
  import { appendFile, mkdir as mkdir2 } from "node:fs/promises";
@@ -1441,6 +1600,7 @@ function csvReporter(options) {
1441
1600
  const scorerNames = Object.keys(data.summary.meanScores);
1442
1601
  const headerParts = [
1443
1602
  "index",
1603
+ "status",
1444
1604
  "input",
1445
1605
  "output",
1446
1606
  "expected",
@@ -1454,8 +1614,10 @@ function csvReporter(options) {
1454
1614
  }
1455
1615
  const rows = [headerParts.join(",")];
1456
1616
  for (const c of data.cases) {
1617
+ const status = getCaseStatus(c, data.threshold);
1457
1618
  const parts = [
1458
1619
  String(c.index),
1620
+ status,
1459
1621
  escapeCsv(c.input),
1460
1622
  escapeCsv(c.output),
1461
1623
  escapeCsv(c.expected),
@@ -1484,15 +1646,17 @@ function markdownReporter(options) {
1484
1646
  const { summary } = data;
1485
1647
  const scorerNames = Object.keys(summary.meanScores);
1486
1648
  const lines = [];
1649
+ const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
1487
1650
  lines.push(`# ${data.name}`);
1488
1651
  lines.push("");
1489
1652
  lines.push(`**Model:** ${data.model}`);
1653
+ lines.push(`**Threshold:** ${data.threshold}`);
1490
1654
  lines.push(
1491
- `**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail)`
1655
+ `**Cases:** ${summary.totalCases} (${summary.passCount} pass, ${summary.failCount} fail, ${passRate}%)`
1492
1656
  );
1493
1657
  lines.push(`**Duration:** ${formatDuration(summary.totalLatencyMs)}`);
1494
1658
  lines.push(
1495
- `**Tokens:** ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
1659
+ `**Tokens:** In: ${formatTokens(summary.totalTokensIn)} | Out: ${formatTokens(summary.totalTokensOut)} | Total: ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}`
1496
1660
  );
1497
1661
  lines.push("");
1498
1662
  lines.push("## Scores");
@@ -1511,6 +1675,7 @@ function markdownReporter(options) {
1511
1675
  "Input",
1512
1676
  ...scorerNames,
1513
1677
  "Latency",
1678
+ "Tokens",
1514
1679
  "Error"
1515
1680
  ];
1516
1681
  lines.push(`| ${caseHeader.join(" | ")} |`);
@@ -1528,7 +1693,8 @@ function markdownReporter(options) {
1528
1693
  status,
1529
1694
  input,
1530
1695
  ...scores,
1531
- `${c.latencyMs}ms`,
1696
+ formatDuration(c.latencyMs),
1697
+ `${c.tokensIn}/${c.tokensOut}`,
1532
1698
  error
1533
1699
  ];
1534
1700
  lines.push(`| ${row.join(" | ")} |`);
@@ -1553,9 +1719,10 @@ function esc(str) {
1553
1719
  function renderHtml(data) {
1554
1720
  const { summary } = data;
1555
1721
  const scorerNames = Object.keys(summary.meanScores);
1722
+ const passRate = summary.totalCases > 0 ? (summary.passCount / summary.totalCases * 100).toFixed(1) : "0.0";
1556
1723
  const caseRows = data.cases.map((c) => {
1557
1724
  const status = getCaseStatus(c, data.threshold);
1558
- const statusLabel = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";
1725
+ const statusLabel2 = status === "error" ? "ERROR" : status === "pass" ? "PASS" : "FAIL";
1559
1726
  const scoresCells = scorerNames.map((name) => {
1560
1727
  const s = c.scores[name];
1561
1728
  const score = s?.score ?? 0;
@@ -1563,13 +1730,19 @@ function renderHtml(data) {
1563
1730
  const reason = s?.reason ? ` title="${esc(s.reason)}"` : "";
1564
1731
  return `<td class="${cls}"${reason}>${score.toFixed(3)}</td>`;
1565
1732
  }).join("");
1733
+ const expectedStr = stringifyUnknown(c.expected, {
1734
+ space: 0,
1735
+ fallback: ""
1736
+ });
1566
1737
  return `<tr class="${status}">
1567
1738
  <td>${c.index}</td>
1568
- <td class="${status}">${statusLabel}</td>
1739
+ <td class="${status}">${statusLabel2}</td>
1569
1740
  <td class="text">${esc(formatInputValue(c.input).slice(0, 120))}</td>
1570
1741
  <td class="text">${esc(c.output.slice(0, 120))}</td>
1742
+ <td class="text">${esc(expectedStr.slice(0, 120))}</td>
1571
1743
  ${scoresCells}
1572
- <td>${c.latencyMs}ms</td>
1744
+ <td>${formatDuration(c.latencyMs)}</td>
1745
+ <td>${c.tokensIn}/${c.tokensOut}</td>
1573
1746
  <td class="error-text">${c.error ? esc(formatErrorValue(c.error)) : ""}</td>
1574
1747
  </tr>`;
1575
1748
  }).join("\n");
@@ -1607,11 +1780,14 @@ function renderHtml(data) {
1607
1780
  <h1>${esc(data.name)}</h1>
1608
1781
  <div class="meta">
1609
1782
  <span><strong>Model:</strong> ${esc(data.model)}</span>
1783
+ <span><strong>Threshold:</strong> ${data.threshold}</span>
1610
1784
  <span><strong>Cases:</strong> ${summary.totalCases}</span>
1611
1785
  <span><strong>Pass:</strong> ${summary.passCount}</span>
1612
- <span><strong>Fail:</strong> ${summary.failCount}</span>
1786
+ <span><strong>Fail:</strong> ${summary.failCount} (${passRate}%)</span>
1613
1787
  <span><strong>Duration:</strong> ${formatDuration(summary.totalLatencyMs)}</span>
1614
- <span><strong>Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
1788
+ <span><strong>Tokens In:</strong> ${formatTokens(summary.totalTokensIn)}</span>
1789
+ <span><strong>Tokens Out:</strong> ${formatTokens(summary.totalTokensOut)}</span>
1790
+ <span><strong>Total Tokens:</strong> ${formatTokens(summary.totalTokensIn + summary.totalTokensOut)}</span>
1615
1791
  </div>
1616
1792
 
1617
1793
  <h2>Mean Scores</h2>
@@ -1628,8 +1804,10 @@ function renderHtml(data) {
1628
1804
  <th>Status</th>
1629
1805
  <th>Input</th>
1630
1806
  <th>Output</th>
1807
+ <th>Expected</th>
1631
1808
  ${scorerHeaders}
1632
1809
  <th>Latency</th>
1810
+ <th>Tokens</th>
1633
1811
  <th>Error</th>
1634
1812
  </tr>
1635
1813
  </thead>
@@ -1642,14 +1820,149 @@ function renderHtml(data) {
1642
1820
  }
1643
1821
 
1644
1822
  // packages/evals/src/evaluate/index.ts
1645
- async function evaluate(options) {
1646
- if ("models" in options) {
1647
- return evaluateEach(options);
1823
+ var EvalAssertionError = class extends Error {
1824
+ summary;
1825
+ constructor(summary) {
1826
+ const msg = Array.isArray(summary) ? `Eval assertion failed: ${summary.filter((s) => s.failCount > 0).length} of ${summary.length} model runs have failures` : `Eval assertion failed: ${summary.failCount} of ${summary.totalCases} cases failed`;
1827
+ super(msg);
1828
+ this.name = "EvalAssertionError";
1829
+ this.summary = summary;
1830
+ }
1831
+ };
1832
+ function resolveFailedIndexes(store, suiteName, model, threshold) {
1833
+ const suite = store.findSuiteByName(suiteName);
1834
+ if (!suite) {
1835
+ console.warn(
1836
+ `No previous suite found for '${suiteName}'. Running all cases.`
1837
+ );
1838
+ return /* @__PURE__ */ new Set();
1839
+ }
1840
+ const run = store.getLatestCompletedRun(suite.id, model);
1841
+ if (!run) {
1842
+ console.warn(
1843
+ `No previous completed run found for '${suiteName}'${model ? ` [${model}]` : ""}. Running all cases.`
1844
+ );
1845
+ return /* @__PURE__ */ new Set();
1846
+ }
1847
+ const failingCases = store.getFailingCases(run.id, threshold);
1848
+ if (failingCases.length === 0) {
1849
+ console.warn(`No failed cases in previous run. Running all cases.`);
1850
+ return /* @__PURE__ */ new Set();
1648
1851
  }
1649
- return evaluateSingle(options);
1852
+ console.warn(
1853
+ `Retrying ${failingCases.length} failed cases from previous run`
1854
+ );
1855
+ return new Set(failingCases.map((c) => c.idx));
1650
1856
  }
1651
- function resolveStore(store) {
1652
- return store instanceof RunStore ? store : new RunStore(store);
1857
+ var EvalBuilder = class {
1858
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
1859
+ #options;
1860
+ #selection = { type: "all" };
1861
+ #shouldAssert = false;
1862
+ constructor(options) {
1863
+ this.#options = options;
1864
+ }
1865
+ #setSelection(selection) {
1866
+ if (this.#selection.type !== "all") {
1867
+ throw new Error(
1868
+ `Cannot combine .${this.#selection.type}() with .${selection.type}()`
1869
+ );
1870
+ }
1871
+ this.#selection = selection;
1872
+ return this;
1873
+ }
1874
+ failed() {
1875
+ return this.#setSelection({ type: "failed" });
1876
+ }
1877
+ cases(spec) {
1878
+ const { indexes } = parseRecordSelection(spec);
1879
+ return this.#setSelection({ type: "cases", indexes });
1880
+ }
1881
+ sample(count) {
1882
+ if (count < 1) {
1883
+ throw new Error("Sample count must be >= 1");
1884
+ }
1885
+ return this.#setSelection({ type: "sample", count });
1886
+ }
1887
+ assert() {
1888
+ this.#shouldAssert = true;
1889
+ return this;
1890
+ }
1891
+ then(onfulfilled, onrejected) {
1892
+ return this.#execute().then(onfulfilled, onrejected);
1893
+ }
1894
+ async #execute() {
1895
+ if ("models" in this.#options) {
1896
+ return this.#executeMulti();
1897
+ }
1898
+ return this.#executeSingle();
1899
+ }
1900
+ #applyDatasetFilter(ds) {
1901
+ switch (this.#selection.type) {
1902
+ case "all":
1903
+ return ds;
1904
+ case "cases":
1905
+ return this.#selection.indexes.size > 0 ? filterRecordsByIndex(ds, this.#selection.indexes) : ds;
1906
+ case "sample":
1907
+ return dataset(ds).sample(this.#selection.count);
1908
+ case "failed":
1909
+ return ds;
1910
+ }
1911
+ }
1912
+ async #executeSingle() {
1913
+ const options = this.#options;
1914
+ let ds = options.dataset;
1915
+ if (this.#selection.type === "failed") {
1916
+ const indexes = resolveFailedIndexes(
1917
+ options.store,
1918
+ options.name,
1919
+ options.model,
1920
+ options.threshold
1921
+ );
1922
+ if (indexes.size > 0) {
1923
+ ds = filterRecordsByIndex(ds, indexes);
1924
+ }
1925
+ } else {
1926
+ ds = this.#applyDatasetFilter(ds);
1927
+ }
1928
+ const result = await evaluateSingle({ ...options, dataset: ds });
1929
+ if (this.#shouldAssert && result.failCount > 0) {
1930
+ throw new EvalAssertionError(result);
1931
+ }
1932
+ return result;
1933
+ }
1934
+ async #executeMulti() {
1935
+ const options = this.#options;
1936
+ let result;
1937
+ if (this.#selection.type === "failed") {
1938
+ const perModelIndexes = /* @__PURE__ */ new Map();
1939
+ for (const variant of options.models) {
1940
+ perModelIndexes.set(
1941
+ variant.name,
1942
+ resolveFailedIndexes(
1943
+ options.store,
1944
+ options.name,
1945
+ variant.name,
1946
+ options.threshold
1947
+ )
1948
+ );
1949
+ }
1950
+ result = await evaluateEach(options, perModelIndexes);
1951
+ } else {
1952
+ const filtered = this.#applyDatasetFilter(options.dataset);
1953
+ result = await evaluateEach({ ...options, dataset: filtered });
1954
+ }
1955
+ if (this.#shouldAssert && result.some((s) => s.failCount > 0)) {
1956
+ throw new EvalAssertionError(result);
1957
+ }
1958
+ return result;
1959
+ }
1960
+ };
1961
+ function evaluate(options) {
1962
+ if ("models" in options) {
1963
+ return new EvalBuilder(options);
1964
+ }
1965
+ return new EvalBuilder(options);
1653
1966
  }
1654
1967
  function wireReporters(reporters) {
1655
1968
  const emitter = new EvalEmitter();
@@ -1682,7 +1995,6 @@ async function notifyRunEnd(reporters, data) {
1682
1995
  await Promise.all(reporters.map((r) => r.onRunEnd?.(data)));
1683
1996
  }
1684
1997
  async function evaluateSingle(options) {
1685
- const store = resolveStore(options.store);
1686
1998
  const threshold = options.threshold ?? 0.5;
1687
1999
  const { emitter, cases, getRunId } = wireReporters(options.reporters);
1688
2000
  const summary = await runEval({
@@ -1691,7 +2003,7 @@ async function evaluateSingle(options) {
1691
2003
  dataset: options.dataset,
1692
2004
  task: options.task,
1693
2005
  scorers: options.scorers,
1694
- store,
2006
+ store: options.store,
1695
2007
  emitter,
1696
2008
  suiteId: options.suiteId,
1697
2009
  maxConcurrency: options.maxConcurrency,
@@ -1709,34 +2021,40 @@ async function evaluateSingle(options) {
1709
2021
  });
1710
2022
  return summary;
1711
2023
  }
1712
- async function evaluateEach(options) {
1713
- const store = resolveStore(options.store);
2024
+ async function evaluateEach(options, perModelFailedIndexes) {
1714
2025
  const items = [];
1715
2026
  for await (const item of options.dataset) {
1716
2027
  items.push(item);
1717
2028
  }
1718
- const suite = store.createSuite(options.name);
2029
+ const suite = options.store.createSuite(options.name);
1719
2030
  return Promise.all(
1720
- options.models.map(
1721
- (variant) => evaluateSingle({
2031
+ options.models.map((variant) => {
2032
+ let ds = dataset(items);
2033
+ const failedIndexes = perModelFailedIndexes?.get(variant.name);
2034
+ if (failedIndexes && failedIndexes.size > 0) {
2035
+ ds = filterRecordsByIndex(ds, failedIndexes);
2036
+ }
2037
+ return evaluateSingle({
1722
2038
  name: `${options.name} [${variant.name}]`,
1723
2039
  model: variant.name,
1724
- dataset: dataset(items),
2040
+ dataset: ds,
1725
2041
  task: (input) => options.task(input, variant),
1726
2042
  scorers: options.scorers,
1727
2043
  reporters: options.reporters,
1728
- store,
2044
+ store: options.store,
1729
2045
  suiteId: suite.id,
1730
2046
  maxConcurrency: options.maxConcurrency,
1731
2047
  timeout: options.timeout,
1732
2048
  trials: options.trials,
1733
2049
  threshold: options.threshold
1734
- })
1735
- )
2050
+ });
2051
+ })
1736
2052
  );
1737
2053
  }
1738
2054
  export {
1739
2055
  Dataset,
2056
+ EvalAssertionError,
2057
+ EvalBuilder,
1740
2058
  EvalEmitter,
1741
2059
  RunStore,
1742
2060
  all,
@@ -1748,6 +2066,7 @@ export {
1748
2066
  evaluate,
1749
2067
  exactMatch,
1750
2068
  factuality,
2069
+ filterRecordsByIndex,
1751
2070
  hf,
1752
2071
  htmlReporter,
1753
2072
  includes,
@@ -1756,6 +2075,8 @@ export {
1756
2075
  levenshtein,
1757
2076
  llmJudge,
1758
2077
  markdownReporter,
2078
+ parseRecordSelection,
2079
+ pickFromArray,
1759
2080
  regex,
1760
2081
  runEval,
1761
2082
  weighted