@deepagents/evals 0.19.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,68 @@ import { createReadStream } from "node:fs";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { extname } from "node:path";
5
5
  import { createInterface } from "node:readline";
6
+
7
+ // packages/evals/src/dataset/record-selection.ts
8
+ function parsePositiveInt(token) {
9
+ if (!/^\d+$/.test(token)) {
10
+ throw new Error(`Invalid record token "${token}"`);
11
+ }
12
+ const value = Number(token);
13
+ if (!Number.isInteger(value) || value < 1) {
14
+ throw new Error(`Record numbers must be >= 1. Received "${token}"`);
15
+ }
16
+ return value;
17
+ }
18
+ function parseRecordSelection(spec) {
19
+ const trimmed = spec.trim();
20
+ if (!trimmed) {
21
+ return { indexes: /* @__PURE__ */ new Set(), normalized: "" };
22
+ }
23
+ const indexes = /* @__PURE__ */ new Set();
24
+ const parts = trimmed.split(",").map((part) => part.trim()).filter(Boolean);
25
+ if (parts.length === 0) {
26
+ throw new Error("Record selection is empty.");
27
+ }
28
+ for (const part of parts) {
29
+ const rangeMatch = /^(\d+)\s*-\s*(\d+)$/.exec(part);
30
+ if (rangeMatch) {
31
+ const start = parsePositiveInt(rangeMatch[1]);
32
+ const end = parsePositiveInt(rangeMatch[2]);
33
+ if (end < start) {
34
+ throw new Error(
35
+ `Invalid range "${part}". Range end must be >= range start.`
36
+ );
37
+ }
38
+ for (let i = start; i <= end; i++) {
39
+ indexes.add(i - 1);
40
+ }
41
+ continue;
42
+ }
43
+ const value = parsePositiveInt(part);
44
+ indexes.add(value - 1);
45
+ }
46
+ return {
47
+ indexes,
48
+ normalized: Array.from(indexes).sort((a, b) => a - b).map((i) => String(i + 1)).join(",")
49
+ };
50
+ }
51
+ async function* filterRecordsByIndex(source, indexes) {
52
+ if (indexes.size === 0) {
53
+ for await (const item of source) {
54
+ yield item;
55
+ }
56
+ return;
57
+ }
58
+ let idx = 0;
59
+ for await (const item of source) {
60
+ if (indexes.has(idx)) {
61
+ yield item;
62
+ }
63
+ idx++;
64
+ }
65
+ }
66
+
67
+ // packages/evals/src/dataset/index.ts
6
68
  var Dataset = class _Dataset {
7
69
  #source;
8
70
  constructor(source) {
@@ -70,6 +132,22 @@ var Dataset = class _Dataset {
70
132
  }
71
133
  });
72
134
  }
135
+ pick(indexes) {
136
+ const source = this.#source;
137
+ return new _Dataset(async function* () {
138
+ if (indexes.size === 0) {
139
+ yield* source();
140
+ return;
141
+ }
142
+ let idx = 0;
143
+ for await (const item of source()) {
144
+ if (indexes.has(idx)) {
145
+ yield item;
146
+ }
147
+ idx++;
148
+ }
149
+ });
150
+ }
73
151
  async toArray() {
74
152
  const result = [];
75
153
  for await (const item of this.#source()) {
@@ -361,7 +439,8 @@ async function runEval(config) {
361
439
  });
362
440
  scores[sName] = {
363
441
  score: clampScore(sr.score, sName),
364
- reason: sr.reason
442
+ reason: sr.reason,
443
+ metadata: sr.metadata
365
444
  };
366
445
  }
367
446
  trialResults.push({ result, scores });
@@ -387,7 +466,8 @@ async function runEval(config) {
387
466
  const meanScore = trialResults.reduce((sum, t) => sum + t.scores[sName].score, 0) / trials;
388
467
  finalScores[sName] = {
389
468
  score: meanScore,
390
- reason: trialResults[trialResults.length - 1].scores[sName]?.reason
469
+ reason: trialResults[trialResults.length - 1].scores[sName]?.reason,
470
+ metadata: trialResults[trialResults.length - 1].scores[sName]?.metadata
391
471
  };
392
472
  }
393
473
  } else {
@@ -404,7 +484,8 @@ async function runEval(config) {
404
484
  });
405
485
  finalScores[sName] = {
406
486
  score: clampScore(sr.score, sName),
407
- reason: sr.reason
487
+ reason: sr.reason,
488
+ metadata: sr.metadata
408
489
  };
409
490
  }
410
491
  }
@@ -520,372 +601,152 @@ function computeSummary(cases, scorerNames, threshold) {
520
601
  }
521
602
 
522
603
  // packages/evals/src/store/index.ts
523
- import { mkdirSync } from "node:fs";
524
- import { dirname } from "node:path";
525
604
  import { DatabaseSync } from "node:sqlite";
526
605
 
527
- // packages/evals/src/store/ddl.sqlite.sql
528
- var ddl_sqlite_default = "PRAGMA journal_mode = WAL;\nPRAGMA synchronous = NORMAL;\nPRAGMA foreign_keys = ON;\n\nCREATE TABLE IF NOT EXISTS suites (\n id TEXT PRIMARY KEY,\n name TEXT NOT NULL,\n created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000)\n);\n\nCREATE TABLE IF NOT EXISTS runs (\n id TEXT PRIMARY KEY,\n suite_id TEXT NOT NULL,\n name TEXT NOT NULL,\n model TEXT NOT NULL,\n config TEXT,\n started_at INTEGER NOT NULL,\n finished_at INTEGER,\n status TEXT NOT NULL DEFAULT 'running' CHECK(status IN ('running', 'completed', 'failed')),\n summary TEXT,\n FOREIGN KEY (suite_id) REFERENCES suites(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_runs_suite_id ON runs(suite_id);\nCREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at);\n\nCREATE TABLE IF NOT EXISTS cases (\n id TEXT PRIMARY KEY,\n run_id TEXT NOT NULL,\n idx INTEGER NOT NULL,\n input TEXT NOT NULL,\n output TEXT,\n expected TEXT,\n latency_ms INTEGER,\n tokens_in INTEGER,\n tokens_out INTEGER,\n error TEXT,\n FOREIGN KEY (run_id) REFERENCES runs(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_cases_run_id ON cases(run_id);\n\nCREATE TABLE IF NOT EXISTS scores (\n id TEXT PRIMARY KEY,\n case_id TEXT NOT NULL,\n scorer_name TEXT NOT NULL,\n score REAL NOT NULL,\n reason TEXT,\n FOREIGN KEY (case_id) REFERENCES cases(id) ON DELETE CASCADE\n);\n\nCREATE INDEX IF NOT EXISTS idx_scores_case_id ON scores(case_id);\n\nCREATE TABLE IF NOT EXISTS prompts (\n id TEXT PRIMARY KEY,\n name TEXT NOT NULL UNIQUE,\n content TEXT NOT NULL,\n created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000)\n);\n\nCREATE INDEX IF NOT EXISTS idx_prompts_created_at ON prompts(created_at);\n";
529
-
530
- // packages/evals/src/store/index.ts
531
- var RunStore = class {
532
- #db;
533
- #statements = /* @__PURE__ */ new Map();
534
- #stmt(sql) {
535
- let stmt = this.#statements.get(sql);
536
- if (!stmt) {
537
- stmt = this.#db.prepare(sql);
538
- this.#statements.set(sql, stmt);
539
- }
540
- return stmt;
541
- }
542
- #transaction(fn) {
543
- this.#db.exec("BEGIN TRANSACTION");
544
- try {
545
- const result = fn();
546
- this.#db.exec("COMMIT");
547
- return result;
548
- } catch (error) {
549
- this.#db.exec("ROLLBACK");
550
- throw error;
551
- }
606
+ // packages/evals/src/evaluate/index.ts
607
+ var EvalAssertionError = class extends Error {
608
+ summary;
609
+ constructor(summary) {
610
+ const msg = Array.isArray(summary) ? `Eval assertion failed: ${summary.filter((s) => s.failCount > 0).length} of ${summary.length} model runs have failures` : `Eval assertion failed: ${summary.failCount} of ${summary.totalCases} cases failed`;
611
+ super(msg);
612
+ this.name = "EvalAssertionError";
613
+ this.summary = summary;
552
614
  }
553
- constructor(pathOrDb) {
554
- if (pathOrDb instanceof DatabaseSync) {
555
- this.#db = pathOrDb;
556
- } else {
557
- const dbPath = pathOrDb ?? ".evals/store.db";
558
- mkdirSync(dirname(dbPath), { recursive: true });
559
- this.#db = new DatabaseSync(dbPath);
560
- }
561
- this.#db.exec(ddl_sqlite_default);
562
- this.#migrateRunsTableToSuiteRequired();
563
- this.#migratePromptsTableIfNeeded();
564
- this.#db.exec(
565
- "CREATE INDEX IF NOT EXISTS idx_prompts_name_version ON prompts(name, version DESC)"
615
+ };
616
+ function resolveFailedIndexes(store, suiteName, model, threshold) {
617
+ const suite = store.findSuiteByName(suiteName);
618
+ if (!suite) {
619
+ console.warn(
620
+ `No previous suite found for '${suiteName}'. Running all cases.`
566
621
  );
622
+ return /* @__PURE__ */ new Set();
567
623
  }
568
- #migratePromptsTableIfNeeded() {
569
- const columns = this.#stmt("PRAGMA table_info(prompts)").all();
570
- if (columns.length === 0) return;
571
- if (columns.some((column) => column.name === "version")) return;
572
- this.#transaction(() => {
573
- this.#db.exec("ALTER TABLE prompts RENAME TO prompts_legacy");
574
- this.#db.exec(`
575
- CREATE TABLE prompts (
576
- id TEXT PRIMARY KEY,
577
- name TEXT NOT NULL,
578
- version INTEGER NOT NULL,
579
- content TEXT NOT NULL,
580
- created_at INTEGER NOT NULL DEFAULT (unixepoch() * 1000),
581
- UNIQUE(name, version)
582
- )
583
- `);
584
- this.#db.exec(`
585
- INSERT INTO prompts (id, name, version, content, created_at)
586
- SELECT id, name, 1, content, created_at
587
- FROM prompts_legacy
588
- `);
589
- this.#db.exec("DROP TABLE prompts_legacy");
590
- this.#db.exec(
591
- "CREATE INDEX IF NOT EXISTS idx_prompts_created_at ON prompts(created_at)"
592
- );
593
- this.#db.exec(
594
- "CREATE INDEX IF NOT EXISTS idx_prompts_name_version ON prompts(name, version DESC)"
595
- );
596
- });
597
- }
598
- #migrateRunsTableToSuiteRequired() {
599
- const runColumns = this.#stmt("PRAGMA table_info(runs)").all();
600
- if (runColumns.length === 0) return;
601
- const suiteColumn = runColumns.find((column) => column.name === "suite_id");
602
- const hasNonNullSuite = suiteColumn?.notnull === 1;
603
- const runForeignKeys = this.#stmt(
604
- "PRAGMA foreign_key_list(runs)"
605
- ).all();
606
- const suiteForeignKey = runForeignKeys.find(
607
- (fk) => fk.from === "suite_id" && fk.table === "suites"
608
- );
609
- const hasCascadeDelete = suiteForeignKey?.on_delete === "CASCADE";
610
- if (hasNonNullSuite && hasCascadeDelete) return;
611
- this.#statements.clear();
612
- this.#transaction(() => {
613
- this.#db.exec(`
614
- CREATE TABLE runs_next (
615
- id TEXT PRIMARY KEY,
616
- suite_id TEXT NOT NULL,
617
- name TEXT NOT NULL,
618
- model TEXT NOT NULL,
619
- config TEXT,
620
- started_at INTEGER NOT NULL,
621
- finished_at INTEGER,
622
- status TEXT NOT NULL DEFAULT 'running' CHECK(status IN ('running', 'completed', 'failed')),
623
- summary TEXT,
624
- FOREIGN KEY (suite_id) REFERENCES suites(id) ON DELETE CASCADE
625
- )
626
- `);
627
- this.#db.exec("DELETE FROM runs WHERE suite_id IS NULL");
628
- this.#db.exec(`
629
- INSERT INTO runs_next (id, suite_id, name, model, config, started_at, finished_at, status, summary)
630
- SELECT r.id, r.suite_id, r.name, r.model, r.config, r.started_at, r.finished_at, r.status, r.summary
631
- FROM runs r
632
- JOIN suites s ON s.id = r.suite_id
633
- `);
634
- this.#db.exec("DROP TABLE runs");
635
- this.#db.exec("ALTER TABLE runs_next RENAME TO runs");
636
- this.#db.exec(
637
- "CREATE INDEX IF NOT EXISTS idx_runs_suite_id ON runs(suite_id)"
638
- );
639
- this.#db.exec(
640
- "CREATE INDEX IF NOT EXISTS idx_runs_started_at ON runs(started_at)"
641
- );
642
- });
643
- this.#statements.clear();
644
- }
645
- createSuite(name) {
646
- const id = crypto.randomUUID();
647
- const now = Date.now();
648
- this.#stmt(
649
- "INSERT INTO suites (id, name, created_at) VALUES (?, ?, ?)"
650
- ).run(id, name, now);
651
- return { id, name, created_at: now };
652
- }
653
- createRun(run) {
654
- const id = crypto.randomUUID();
655
- const now = Date.now();
656
- this.#stmt(
657
- "INSERT INTO runs (id, suite_id, name, model, config, started_at) VALUES (?, ?, ?, ?, ?, ?)"
658
- ).run(
659
- id,
660
- run.suite_id,
661
- run.name,
662
- run.model,
663
- run.config ? JSON.stringify(run.config) : null,
664
- now
624
+ const run = store.getLatestCompletedRun(suite.id, model);
625
+ if (!run) {
626
+ console.warn(
627
+ `No previous completed run found for '${suiteName}'${model ? ` [${model}]` : ""}. Running all cases.`
665
628
  );
666
- return id;
629
+ return /* @__PURE__ */ new Set();
667
630
  }
668
- finishRun(runId, status, summary) {
669
- this.#stmt(
670
- "UPDATE runs SET finished_at = ?, status = ?, summary = ? WHERE id = ?"
671
- ).run(Date.now(), status, summary ? JSON.stringify(summary) : null, runId);
631
+ const failingCases = store.getFailingCases(run.id, threshold);
632
+ if (failingCases.length === 0) {
633
+ console.warn(`No failed cases in previous run. Running all cases.`);
634
+ return /* @__PURE__ */ new Set();
672
635
  }
673
- saveCases(cases) {
674
- this.#transaction(() => {
675
- const stmt = this.#stmt(
676
- "INSERT INTO cases (id, run_id, idx, input, output, expected, latency_ms, tokens_in, tokens_out, error) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
636
+ console.warn(
637
+ `Retrying ${failingCases.length} failed cases from previous run`
638
+ );
639
+ return new Set(failingCases.map((c) => c.idx));
640
+ }
641
+ var EvalBuilder = class {
642
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
643
+ #options;
644
+ #selection = { type: "all" };
645
+ #shouldAssert = false;
646
+ constructor(options) {
647
+ this.#options = options;
648
+ }
649
+ #setSelection(selection) {
650
+ if (this.#selection.type !== "all") {
651
+ throw new Error(
652
+ `Cannot combine .${this.#selection.type}() with .${selection.type}()`
677
653
  );
678
- for (const c of cases) {
679
- stmt.run(
680
- c.id,
681
- c.run_id,
682
- c.idx,
683
- JSON.stringify(c.input),
684
- c.output,
685
- c.expected != null ? JSON.stringify(c.expected) : null,
686
- c.latency_ms,
687
- c.tokens_in,
688
- c.tokens_out,
689
- c.error ?? null
690
- );
691
- }
692
- });
654
+ }
655
+ this.#selection = selection;
656
+ return this;
693
657
  }
694
- saveScores(scores) {
695
- this.#transaction(() => {
696
- const stmt = this.#stmt(
697
- "INSERT INTO scores (id, case_id, scorer_name, score, reason) VALUES (?, ?, ?, ?, ?)"
698
- );
699
- for (const s of scores) {
700
- stmt.run(s.id, s.case_id, s.scorer_name, s.score, s.reason ?? null);
701
- }
702
- });
658
+ failed() {
659
+ return this.#setSelection({ type: "failed" });
703
660
  }
704
- getRun(runId) {
705
- const row = this.#stmt("SELECT * FROM runs WHERE id = ?").get(runId);
706
- if (!row) return void 0;
707
- return {
708
- id: row.id,
709
- suite_id: row.suite_id,
710
- name: row.name,
711
- model: row.model,
712
- config: row.config ? JSON.parse(row.config) : null,
713
- started_at: row.started_at,
714
- finished_at: row.finished_at,
715
- status: row.status,
716
- summary: row.summary ? JSON.parse(row.summary) : null
717
- };
661
+ cases(spec) {
662
+ const { indexes } = parseRecordSelection(spec);
663
+ return this.#setSelection({ type: "cases", indexes });
718
664
  }
719
- listRuns(suiteId) {
720
- const sql = suiteId ? "SELECT * FROM runs WHERE suite_id = ? ORDER BY started_at" : "SELECT * FROM runs ORDER BY started_at";
721
- const rows = suiteId ? this.#stmt(sql).all(suiteId) : this.#stmt(sql).all();
722
- return rows.map((row) => ({
723
- id: row.id,
724
- suite_id: row.suite_id,
725
- name: row.name,
726
- model: row.model,
727
- config: row.config ? JSON.parse(row.config) : null,
728
- started_at: row.started_at,
729
- finished_at: row.finished_at,
730
- status: row.status,
731
- summary: row.summary ? JSON.parse(row.summary) : null
732
- }));
733
- }
734
- getCases(runId) {
735
- const rows = this.#stmt(
736
- "SELECT * FROM cases WHERE run_id = ? ORDER BY idx"
737
- ).all(runId);
738
- return rows.map((row) => ({
739
- id: row.id,
740
- run_id: row.run_id,
741
- idx: row.idx,
742
- input: JSON.parse(row.input),
743
- output: row.output,
744
- expected: row.expected ? JSON.parse(row.expected) : null,
745
- latency_ms: row.latency_ms,
746
- tokens_in: row.tokens_in,
747
- tokens_out: row.tokens_out,
748
- error: row.error
749
- }));
750
- }
751
- getFailingCases(runId, threshold = 0.5) {
752
- const rows = this.#stmt(
753
- `SELECT c.*, s.scorer_name, s.score, s.reason as score_reason
754
- FROM cases c
755
- JOIN scores s ON s.case_id = c.id
756
- WHERE c.run_id = ? AND s.score < ?
757
- ORDER BY c.idx`
758
- ).all(runId, threshold);
759
- const caseMap = /* @__PURE__ */ new Map();
760
- for (const row of rows) {
761
- let c = caseMap.get(row.id);
762
- if (!c) {
763
- c = {
764
- id: row.id,
765
- run_id: row.run_id,
766
- idx: row.idx,
767
- input: JSON.parse(row.input),
768
- output: row.output,
769
- expected: row.expected ? JSON.parse(row.expected) : null,
770
- latency_ms: row.latency_ms,
771
- tokens_in: row.tokens_in,
772
- tokens_out: row.tokens_out,
773
- error: row.error,
774
- scores: []
775
- };
776
- caseMap.set(row.id, c);
777
- }
778
- c.scores.push({
779
- scorer_name: row.scorer_name,
780
- score: row.score,
781
- reason: row.score_reason
782
- });
665
+ sample(count) {
666
+ if (count < 1) {
667
+ throw new Error("Sample count must be >= 1");
783
668
  }
784
- return Array.from(caseMap.values());
785
- }
786
- getRunSummary(runId, threshold = 0.5) {
787
- const totals = this.#stmt(
788
- `SELECT
789
- COUNT(DISTINCT c.id) as totalCases,
790
- COALESCE(SUM(c.latency_ms), 0) as totalLatencyMs,
791
- COALESCE(SUM(c.tokens_in), 0) as totalTokensIn,
792
- COALESCE(SUM(c.tokens_out), 0) as totalTokensOut
793
- FROM cases c WHERE c.run_id = ?`
794
- ).get(runId);
795
- const scorerMeans = this.#stmt(
796
- `SELECT s.scorer_name, AVG(s.score) as meanScore
797
- FROM scores s
798
- JOIN cases c ON c.id = s.case_id
799
- WHERE c.run_id = ?
800
- GROUP BY s.scorer_name`
801
- ).all(runId);
802
- const meanScores = {};
803
- for (const row of scorerMeans) {
804
- meanScores[row.scorer_name] = row.meanScore;
669
+ return this.#setSelection({ type: "sample", count });
670
+ }
671
+ assert() {
672
+ this.#shouldAssert = true;
673
+ return this;
674
+ }
675
+ then(onfulfilled, onrejected) {
676
+ return this.#execute().then(onfulfilled, onrejected);
677
+ }
678
+ async #execute() {
679
+ if ("models" in this.#options) {
680
+ return this.#executeMulti();
805
681
  }
806
- const passFail = this.#stmt(
807
- `SELECT c.id,
808
- MIN(s.score) as minScore
809
- FROM cases c
810
- JOIN scores s ON s.case_id = c.id
811
- WHERE c.run_id = ?
812
- GROUP BY c.id`
813
- ).all(runId);
814
- let passCount = 0;
815
- let failCount = 0;
816
- for (const row of passFail) {
817
- if (row.minScore >= threshold) passCount++;
818
- else failCount++;
682
+ return this.#executeSingle();
683
+ }
684
+ #applyDatasetFilter(ds) {
685
+ switch (this.#selection.type) {
686
+ case "all":
687
+ return ds;
688
+ case "cases":
689
+ return this.#selection.indexes.size > 0 ? filterRecordsByIndex(ds, this.#selection.indexes) : ds;
690
+ case "sample":
691
+ return dataset(ds).sample(this.#selection.count);
692
+ case "failed":
693
+ return ds;
819
694
  }
820
- return {
821
- totalCases: totals.totalCases,
822
- passCount,
823
- failCount,
824
- meanScores,
825
- totalLatencyMs: totals.totalLatencyMs,
826
- totalTokensIn: totals.totalTokensIn,
827
- totalTokensOut: totals.totalTokensOut
828
- };
829
695
  }
830
- listSuites() {
831
- const rows = this.#stmt(
832
- "SELECT * FROM suites ORDER BY created_at DESC"
833
- ).all();
834
- return rows.map((row) => ({
835
- id: row.id,
836
- name: row.name,
837
- created_at: row.created_at
838
- }));
839
- }
840
- createPrompt(name, content) {
841
- const id = crypto.randomUUID();
842
- const now = Date.now();
843
- const latest = this.#stmt(
844
- "SELECT MAX(version) as latestVersion FROM prompts WHERE name = ?"
845
- ).get(name);
846
- const version = (latest?.latestVersion ?? 0) + 1;
847
- this.#stmt(
848
- "INSERT INTO prompts (id, name, version, content, created_at) VALUES (?, ?, ?, ?, ?)"
849
- ).run(id, name, version, content, now);
850
- return { id, name, version, content, created_at: now };
851
- }
852
- listPrompts() {
853
- const rows = this.#stmt(
854
- "SELECT * FROM prompts ORDER BY name COLLATE NOCASE ASC, version DESC"
855
- ).all();
856
- return rows.map((row) => ({
857
- id: row.id,
858
- name: row.name,
859
- version: row.version,
860
- content: row.content,
861
- created_at: row.created_at
862
- }));
863
- }
864
- getPrompt(id) {
865
- const row = this.#stmt("SELECT * FROM prompts WHERE id = ?").get(id);
866
- if (!row) return void 0;
867
- return {
868
- id: row.id,
869
- name: row.name,
870
- version: row.version,
871
- content: row.content,
872
- created_at: row.created_at
873
- };
696
+ async #executeSingle() {
697
+ const options = this.#options;
698
+ let ds = options.dataset;
699
+ if (this.#selection.type === "failed") {
700
+ const indexes = resolveFailedIndexes(
701
+ options.store,
702
+ options.name,
703
+ options.model,
704
+ options.threshold
705
+ );
706
+ if (indexes.size > 0) {
707
+ ds = filterRecordsByIndex(ds, indexes);
708
+ }
709
+ } else {
710
+ ds = this.#applyDatasetFilter(ds);
711
+ }
712
+ const result = await evaluateSingle({ ...options, dataset: ds });
713
+ if (this.#shouldAssert && result.failCount > 0) {
714
+ throw new EvalAssertionError(result);
715
+ }
716
+ return result;
874
717
  }
875
- deletePrompt(id) {
876
- this.#stmt("DELETE FROM prompts WHERE id = ?").run(id);
718
+ async #executeMulti() {
719
+ const options = this.#options;
720
+ let result;
721
+ if (this.#selection.type === "failed") {
722
+ const perModelIndexes = /* @__PURE__ */ new Map();
723
+ for (const variant of options.models) {
724
+ perModelIndexes.set(
725
+ variant.name,
726
+ resolveFailedIndexes(
727
+ options.store,
728
+ options.name,
729
+ variant.name,
730
+ options.threshold
731
+ )
732
+ );
733
+ }
734
+ result = await evaluateEach(options, perModelIndexes);
735
+ } else {
736
+ const filtered = this.#applyDatasetFilter(options.dataset);
737
+ result = await evaluateEach({ ...options, dataset: filtered });
738
+ }
739
+ if (this.#shouldAssert && result.some((s) => s.failCount > 0)) {
740
+ throw new EvalAssertionError(result);
741
+ }
742
+ return result;
877
743
  }
878
744
  };
879
-
880
- // packages/evals/src/evaluate/index.ts
881
- async function evaluate(options) {
745
+ function evaluate(options) {
882
746
  if ("models" in options) {
883
- return evaluateEach(options);
747
+ return new EvalBuilder(options);
884
748
  }
885
- return evaluateSingle(options);
886
- }
887
- function resolveStore(store) {
888
- return store instanceof RunStore ? store : new RunStore(store);
749
+ return new EvalBuilder(options);
889
750
  }
890
751
  function wireReporters(reporters) {
891
752
  const emitter = new EvalEmitter();
@@ -918,7 +779,6 @@ async function notifyRunEnd(reporters, data) {
918
779
  await Promise.all(reporters.map((r) => r.onRunEnd?.(data)));
919
780
  }
920
781
  async function evaluateSingle(options) {
921
- const store = resolveStore(options.store);
922
782
  const threshold = options.threshold ?? 0.5;
923
783
  const { emitter, cases, getRunId } = wireReporters(options.reporters);
924
784
  const summary = await runEval({
@@ -927,7 +787,7 @@ async function evaluateSingle(options) {
927
787
  dataset: options.dataset,
928
788
  task: options.task,
929
789
  scorers: options.scorers,
930
- store,
790
+ store: options.store,
931
791
  emitter,
932
792
  suiteId: options.suiteId,
933
793
  maxConcurrency: options.maxConcurrency,
@@ -945,33 +805,39 @@ async function evaluateSingle(options) {
945
805
  });
946
806
  return summary;
947
807
  }
948
- async function evaluateEach(options) {
949
- const store = resolveStore(options.store);
808
+ async function evaluateEach(options, perModelFailedIndexes) {
950
809
  const items = [];
951
810
  for await (const item of options.dataset) {
952
811
  items.push(item);
953
812
  }
954
- const suite = store.createSuite(options.name);
813
+ const suite = options.store.createSuite(options.name);
955
814
  return Promise.all(
956
- options.models.map(
957
- (variant) => evaluateSingle({
815
+ options.models.map((variant) => {
816
+ let ds = dataset(items);
817
+ const failedIndexes = perModelFailedIndexes?.get(variant.name);
818
+ if (failedIndexes && failedIndexes.size > 0) {
819
+ ds = filterRecordsByIndex(ds, failedIndexes);
820
+ }
821
+ return evaluateSingle({
958
822
  name: `${options.name} [${variant.name}]`,
959
823
  model: variant.name,
960
- dataset: dataset(items),
824
+ dataset: ds,
961
825
  task: (input) => options.task(input, variant),
962
826
  scorers: options.scorers,
963
827
  reporters: options.reporters,
964
- store,
828
+ store: options.store,
965
829
  suiteId: suite.id,
966
830
  maxConcurrency: options.maxConcurrency,
967
831
  timeout: options.timeout,
968
832
  trials: options.trials,
969
833
  threshold: options.threshold
970
- })
971
- )
834
+ });
835
+ })
972
836
  );
973
837
  }
974
838
  export {
839
+ EvalAssertionError,
840
+ EvalBuilder,
975
841
  evaluate
976
842
  };
977
843
  //# sourceMappingURL=index.js.map