@deepagents/evals 0.19.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -4
- package/dist/dataset/index.d.ts +3 -0
- package/dist/dataset/index.d.ts.map +1 -1
- package/dist/dataset/index.js +84 -1
- package/dist/dataset/index.js.map +3 -3
- package/dist/dataset/record-selection.d.ts +8 -0
- package/dist/dataset/record-selection.d.ts.map +1 -0
- package/dist/engine/index.d.ts.map +1 -1
- package/dist/engine/index.js +6 -3
- package/dist/engine/index.js.map +2 -2
- package/dist/evaluate/index.d.ts +16 -3
- package/dist/evaluate/index.d.ts.map +1 -1
- package/dist/evaluate/index.js +225 -359
- package/dist/evaluate/index.js.map +3 -3
- package/dist/index.d.ts +5 -5
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +429 -110
- package/dist/index.js.map +4 -4
- package/dist/reporters/console.d.ts.map +1 -1
- package/dist/reporters/csv.d.ts.map +1 -1
- package/dist/reporters/html.d.ts.map +1 -1
- package/dist/reporters/index.js +129 -36
- package/dist/reporters/index.js.map +3 -3
- package/dist/reporters/markdown.d.ts.map +1 -1
- package/dist/scorers/index.d.ts +2 -6
- package/dist/scorers/index.d.ts.map +1 -1
- package/dist/scorers/index.js +32 -54
- package/dist/scorers/index.js.map +2 -2
- package/dist/store/index.d.ts +2 -0
- package/dist/store/index.d.ts.map +1 -1
- package/dist/store/index.js +22 -0
- package/dist/store/index.js.map +2 -2
- package/package.json +3 -2
package/dist/evaluate/index.js
CHANGED
|
@@ -3,6 +3,68 @@ import { createReadStream } from "node:fs";
|
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { extname } from "node:path";
|
|
5
5
|
import { createInterface } from "node:readline";
|
|
6
|
+
|
|
7
|
+
// packages/evals/src/dataset/record-selection.ts
|
|
8
|
+
/**
 * Parse a single 1-based record number from a selection token.
 *
 * @param {string} token - Candidate token; must consist solely of decimal digits.
 * @returns {number} The parsed record number (>= 1 and exactly representable).
 * @throws {Error} If the token contains anything but digits, is zero, or is
 *   too large to be represented exactly as a JavaScript number.
 */
function parsePositiveInt(token) {
  if (!/^\d+$/.test(token)) {
    throw new Error(`Invalid record token "${token}"`);
  }
  const value = Number(token);
  if (value < 1) {
    throw new Error(`Record numbers must be >= 1. Received "${token}"`);
  }
  // Digit strings above 2^53 - 1 silently round to a different integer (or to
  // Infinity), which would select the wrong record; reject them explicitly.
  if (!Number.isSafeInteger(value)) {
    throw new Error(`Record number is too large. Received "${token}"`);
  }
  return value;
}
|
|
18
|
+
/**
 * Turn a comma-separated record selection such as "1,3-5" into a set of
 * zero-based indexes plus a canonical string form.
 *
 * Tokens are 1-based record numbers or inclusive "start-end" ranges; empty
 * tokens between commas are ignored. A blank spec yields an empty selection.
 *
 * @param {string} spec - The raw selection string.
 * @returns {{ indexes: Set<number>, normalized: string }} Zero-based indexes
 *   and the sorted, de-duplicated 1-based numbers joined with ",".
 * @throws {Error} If the spec holds only separators, a token is invalid, or a
 *   range ends before it starts.
 */
function parseRecordSelection(spec) {
  const text = spec.trim();
  if (text === "") {
    return { indexes: /* @__PURE__ */ new Set(), normalized: "" };
  }

  const indexes = /* @__PURE__ */ new Set();

  const tokens = [];
  for (const raw of text.split(",")) {
    const token = raw.trim();
    if (token) {
      tokens.push(token);
    }
  }
  if (tokens.length === 0) {
    throw new Error("Record selection is empty.");
  }

  const rangePattern = /^(\d+)\s*-\s*(\d+)$/;
  for (const token of tokens) {
    const bounds = rangePattern.exec(token);
    if (bounds === null) {
      // Plain record number: store it zero-based.
      indexes.add(parsePositiveInt(token) - 1);
      continue;
    }
    const first = parsePositiveInt(bounds[1]);
    const last = parsePositiveInt(bounds[2]);
    if (last < first) {
      throw new Error(
        `Invalid range "${token}". Range end must be >= range start.`
      );
    }
    // Walk the inclusive 1-based range directly in zero-based terms.
    for (let zeroBased = first - 1; zeroBased < last; zeroBased += 1) {
      indexes.add(zeroBased);
    }
  }

  const ascending = [...indexes].sort((a, b) => a - b);
  const oneBased = ascending.map((index) => String(index + 1));
  return {
    indexes,
    normalized: oneBased.join(",")
  };
}
|
|
51
|
+
/**
 * Yield only the records of `source` whose zero-based position is in `indexes`.
 *
 * An empty `indexes` set means "no filter": every record is passed through.
 * Otherwise iteration stops as soon as every selected position has been
 * yielded, so the rest of the source is not read needlessly.
 *
 * @param {AsyncIterable<unknown> | Iterable<unknown>} source - Records to filter.
 * @param {Set<number>} indexes - Zero-based positions to keep.
 * @returns {AsyncGenerator<unknown>} The selected records in source order.
 */
async function* filterRecordsByIndex(source, indexes) {
  if (indexes.size === 0) {
    for await (const item of source) {
      yield item;
    }
    return;
  }
  // Each position is visited once, so at most `indexes.size` records match.
  let remaining = indexes.size;
  let idx = 0;
  for await (const item of source) {
    if (indexes.has(idx)) {
      yield item;
      remaining -= 1;
      if (remaining === 0) {
        // Leaving the loop here also lets the source iterator clean up.
        return;
      }
    }
    idx++;
  }
}
|
|
66
|
+
|
|
67
|
+
// packages/evals/src/dataset/index.ts
|
|
6
68
|
var Dataset = class _Dataset {
|
|
7
69
|
#source;
|
|
8
70
|
constructor(source) {
|
|
@@ -70,6 +132,22 @@ var Dataset = class _Dataset {
|
|
|
70
132
|
}
|
|
71
133
|
});
|
|
72
134
|
}
|
|
135
|
+
pick(indexes) {
|
|
136
|
+
const source = this.#source;
|
|
137
|
+
return new _Dataset(async function* () {
|
|
138
|
+
if (indexes.size === 0) {
|
|
139
|
+
yield* source();
|
|
140
|
+
return;
|
|
141
|
+
}
|
|
142
|
+
let idx = 0;
|
|
143
|
+
for await (const item of source()) {
|
|
144
|
+
if (indexes.has(idx)) {
|
|
145
|
+
yield item;
|
|
146
|
+
}
|
|
147
|
+
idx++;
|
|
148
|
+
}
|
|
149
|
+
});
|
|
150
|
+
}
|
|
73
151
|
async toArray() {
|
|
74
152
|
const result = [];
|
|
75
153
|
for await (const item of this.#source()) {
|
|
@@ -361,7 +439,8 @@ async function runEval(config) {
|
|
|
361
439
|
});
|
|
362
440
|
scores[sName] = {
|
|
363
441
|
score: clampScore(sr.score, sName),
|
|
364
|
-
reason: sr.reason
|
|
442
|
+
reason: sr.reason,
|
|
443
|
+
metadata: sr.metadata
|
|
365
444
|
};
|
|
366
445
|
}
|
|
367
446
|
trialResults.push({ result, scores });
|
|
@@ -387,7 +466,8 @@ async function runEval(config) {
|
|
|
387
466
|
const meanScore = trialResults.reduce((sum, t) => sum + t.scores[sName].score, 0) / trials;
|
|
388
467
|
finalScores[sName] = {
|
|
389
468
|
score: meanScore,
|
|
390
|
-
reason: trialResults[trialResults.length - 1].scores[sName]?.reason
|
|
469
|
+
reason: trialResults[trialResults.length - 1].scores[sName]?.reason,
|
|
470
|
+
metadata: trialResults[trialResults.length - 1].scores[sName]?.metadata
|
|
391
471
|
};
|
|
392
472
|
}
|
|
393
473
|
} else {
|
|
@@ -404,7 +484,8 @@ async function runEval(config) {
|
|
|
404
484
|
});
|
|
405
485
|
finalScores[sName] = {
|
|
406
486
|
score: clampScore(sr.score, sName),
|
|
407
|
-
reason: sr.reason
|
|
487
|
+
reason: sr.reason,
|
|
488
|
+
metadata: sr.metadata
|
|
408
489
|
};
|
|
409
490
|
}
|
|
410
491
|
}
|
|
@@ -520,372 +601,152 @@ function computeSummary(cases, scorerNames, threshold) {
|
|
|
520
601
|
}
|
|
521
602
|
|
|
522
603
|
// packages/evals/src/store/index.ts
|
|
523
|
-
import { mkdirSync } from "node:fs";
|
|
524
|
-
import { dirname } from "node:path";
|
|
525
604
|
import { DatabaseSync } from "node:sqlite";
|
|
526
605
|
|
|
527
|
-
// packages/evals/src/
|
|
528
|
-
var
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
let stmt = this.#statements.get(sql);
|
|
536
|
-
if (!stmt) {
|
|
537
|
-
stmt = this.#db.prepare(sql);
|
|
538
|
-
this.#statements.set(sql, stmt);
|
|
539
|
-
}
|
|
540
|
-
return stmt;
|
|
541
|
-
}
|
|
542
|
-
#transaction(fn) {
|
|
543
|
-
this.#db.exec("BEGIN TRANSACTION");
|
|
544
|
-
try {
|
|
545
|
-
const result = fn();
|
|
546
|
-
this.#db.exec("COMMIT");
|
|
547
|
-
return result;
|
|
548
|
-
} catch (error) {
|
|
549
|
-
this.#db.exec("ROLLBACK");
|
|
550
|
-
throw error;
|
|
551
|
-
}
|
|
606
|
+
// packages/evals/src/evaluate/index.ts
|
|
607
|
+
/**
 * Error raised when an asserted evaluation finishes with failures.
 *
 * `summary` is either one run summary (with `failCount` and `totalCases`) or
 * an array of summaries, one per model run; it is kept on the instance so
 * callers can inspect the results.
 */
var EvalAssertionError = class extends Error {
  summary;
  constructor(summary) {
    let message;
    if (Array.isArray(summary)) {
      // Multi-model result: report how many runs had at least one failure.
      const runsWithFailures = summary.filter((entry) => entry.failCount > 0);
      message = `Eval assertion failed: ${runsWithFailures.length} of ${summary.length} model runs have failures`;
    } else {
      message = `Eval assertion failed: ${summary.failCount} of ${summary.totalCases} cases failed`;
    }
    super(message);
    this.name = "EvalAssertionError";
    this.summary = summary;
  }
};
|
|
616
|
+
/**
 * Look up which case indexes failed in the latest completed run of a suite.
 *
 * Returns an empty set (after logging a warning) when there is no suite with
 * that name, no completed run for it, or no failing case in that run; callers
 * in this file treat an empty set as "run all cases".
 *
 * @param {object} store - Run store providing `findSuiteByName`,
 *   `getLatestCompletedRun` and `getFailingCases`.
 * @param {string} suiteName - Name of the suite to look up.
 * @param {string | undefined} model - Model passed to the run lookup; also
 *   shown in the warning when set.
 * @param {number | undefined} threshold - Passed through to `getFailingCases`.
 * @returns {Set<number>} The `idx` values of the failing cases, or an empty set.
 */
function resolveFailedIndexes(store, suiteName, model, threshold) {
  // Shared exit for every "nothing to retry" situation.
  const runEverything = (notice) => {
    console.warn(notice);
    return /* @__PURE__ */ new Set();
  };

  const previousSuite = store.findSuiteByName(suiteName);
  if (!previousSuite) {
    return runEverything(
      `No previous suite found for '${suiteName}'. Running all cases.`
    );
  }

  const previousRun = store.getLatestCompletedRun(previousSuite.id, model);
  if (!previousRun) {
    const modelTag = model ? ` [${model}]` : "";
    return runEverything(
      `No previous completed run found for '${suiteName}'${modelTag}. Running all cases.`
    );
  }

  const failures = store.getFailingCases(previousRun.id, threshold);
  if (failures.length === 0) {
    return runEverything(`No failed cases in previous run. Running all cases.`);
  }

  console.warn(
    `Retrying ${failures.length} failed cases from previous run`
  );
  const retryIndexes = new Set();
  for (const failure of failures) {
    retryIndexes.add(failure.idx);
  }
  return retryIndexes;
}
|
|
641
|
+
/**
 * Chainable, awaitable builder around an evaluation.
 *
 * Selection methods (`failed`, `cases`, `sample`) narrow which dataset records
 * are evaluated; only one of them may be used per builder. `assert()` makes the
 * evaluation reject with `EvalAssertionError` when any case fails. The builder
 * is a thenable: the evaluation starts when `then` is called (for example by
 * `await`), and every `then` call starts a new execution.
 */
var EvalBuilder = class {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  #options;
  #selection = { type: "all" };
  #shouldAssert = false;

  /**
   * @param {object} options - Evaluation options; the presence of a `models`
   *   key selects the multi-model path.
   */
  constructor(options) {
    this.#options = options;
  }

  /**
   * Record the selection, rejecting a second selection on the same builder.
   * @throws {Error} If a selection other than "all" is already set.
   */
  #setSelection(selection) {
    if (this.#selection.type !== "all") {
      throw new Error(
        `Cannot combine .${this.#selection.type}() with .${selection.type}()`
      );
    }
    this.#selection = selection;
    return this;
  }

  /** Re-run only the cases that failed in the latest completed run. */
  failed() {
    return this.#setSelection({ type: "failed" });
  }

  /**
   * Run only the records named by a selection spec such as "1,3-5".
   * @param {string} spec - Selection string handed to `parseRecordSelection`.
   */
  cases(spec) {
    const { indexes } = parseRecordSelection(spec);
    return this.#setSelection({ type: "cases", indexes });
  }

  /**
   * Run a sample of `count` records.
   * @param {number} count - Number of records to sample; an integer >= 1.
   * @throws {Error} If `count` is below 1 or is not an integer.
   */
  sample(count) {
    if (count < 1) {
      throw new Error("Sample count must be >= 1");
    }
    // `count < 1` is false for NaN, and fractions or numeric strings also slip
    // past it; reject them here instead of handing them to the sampler.
    if (!Number.isInteger(count)) {
      throw new Error(`Sample count must be an integer. Received ${count}`);
    }
    return this.#setSelection({ type: "sample", count });
  }

  /** Make the evaluation reject with `EvalAssertionError` if any case fails. */
  assert() {
    this.#shouldAssert = true;
    return this;
  }

  /** Thenable hook: starts the evaluation and chains the given handlers. */
  then(onfulfilled, onrejected) {
    return this.#execute().then(onfulfilled, onrejected);
  }

  /** Dispatch to the multi-model or single-model path. */
  async #execute() {
    if ("models" in this.#options) {
      return this.#executeMulti();
    }
    return this.#executeSingle();
  }

  /**
   * Apply the "cases" or "sample" selection to a dataset. The "failed"
   * selection is resolved by the execute methods, so it passes through here.
   */
  #applyDatasetFilter(ds) {
    switch (this.#selection.type) {
      case "all":
        return ds;
      case "cases":
        return this.#selection.indexes.size > 0 ? filterRecordsByIndex(ds, this.#selection.indexes) : ds;
      case "sample":
        return dataset(ds).sample(this.#selection.count);
      case "failed":
        return ds;
    }
  }

  /** Run one evaluation, narrowing the dataset per the current selection. */
  async #executeSingle() {
    const options = this.#options;
    let ds = options.dataset;
    if (this.#selection.type === "failed") {
      const indexes = resolveFailedIndexes(
        options.store,
        options.name,
        options.model,
        options.threshold
      );
      // An empty set means nothing to retry: keep the full dataset.
      if (indexes.size > 0) {
        ds = filterRecordsByIndex(ds, indexes);
      }
    } else {
      ds = this.#applyDatasetFilter(ds);
    }
    const result = await evaluateSingle({ ...options, dataset: ds });
    if (this.#shouldAssert && result.failCount > 0) {
      throw new EvalAssertionError(result);
    }
    return result;
  }

  /** Run one evaluation per model variant and return all summaries. */
  async #executeMulti() {
    const options = this.#options;
    let result;
    if (this.#selection.type === "failed") {
      // Failed cases are looked up separately for every model variant.
      const perModelIndexes = /* @__PURE__ */ new Map();
      for (const variant of options.models) {
        perModelIndexes.set(
          variant.name,
          resolveFailedIndexes(
            options.store,
            options.name,
            variant.name,
            options.threshold
          )
        );
      }
      result = await evaluateEach(options, perModelIndexes);
    } else {
      const filtered = this.#applyDatasetFilter(options.dataset);
      result = await evaluateEach({ ...options, dataset: filtered });
    }
    if (this.#shouldAssert && result.some((s) => s.failCount > 0)) {
      throw new EvalAssertionError(result);
    }
    return result;
  }
};
|
|
879
|
-
|
|
880
|
-
// packages/evals/src/evaluate/index.ts
|
|
881
|
-
async function evaluate(options) {
|
|
745
|
+
/**
 * Create an awaitable evaluation builder for the given options.
 *
 * Single-model and multi-model options both yield an `EvalBuilder`; the
 * builder decides which path to run when it is executed.
 *
 * @param {object} options - Evaluation options.
 * @returns {EvalBuilder} A builder that runs the evaluation when awaited.
 * @throws {TypeError} If `options` is not an object.
 */
function evaluate(options) {
  // Previously an `"models" in options` test guarded two identical branches;
  // its only effect was this TypeError for non-object input, made explicit here.
  if (options === null || typeof options !== "object" && typeof options !== "function") {
    throw new TypeError("evaluate() requires an options object");
  }
  return new EvalBuilder(options);
}
|
|
890
751
|
function wireReporters(reporters) {
|
|
891
752
|
const emitter = new EvalEmitter();
|
|
@@ -918,7 +779,6 @@ async function notifyRunEnd(reporters, data) {
|
|
|
918
779
|
await Promise.all(reporters.map((r) => r.onRunEnd?.(data)));
|
|
919
780
|
}
|
|
920
781
|
async function evaluateSingle(options) {
|
|
921
|
-
const store = resolveStore(options.store);
|
|
922
782
|
const threshold = options.threshold ?? 0.5;
|
|
923
783
|
const { emitter, cases, getRunId } = wireReporters(options.reporters);
|
|
924
784
|
const summary = await runEval({
|
|
@@ -927,7 +787,7 @@ async function evaluateSingle(options) {
|
|
|
927
787
|
dataset: options.dataset,
|
|
928
788
|
task: options.task,
|
|
929
789
|
scorers: options.scorers,
|
|
930
|
-
store,
|
|
790
|
+
store: options.store,
|
|
931
791
|
emitter,
|
|
932
792
|
suiteId: options.suiteId,
|
|
933
793
|
maxConcurrency: options.maxConcurrency,
|
|
@@ -945,33 +805,39 @@ async function evaluateSingle(options) {
|
|
|
945
805
|
});
|
|
946
806
|
return summary;
|
|
947
807
|
}
|
|
948
|
-
async function evaluateEach(options) {
|
|
949
|
-
const store = resolveStore(options.store);
|
|
808
|
+
/**
 * Evaluate the same dataset once per model variant, under one shared suite.
 *
 * The dataset is read fully into memory first so every variant sees the same
 * records. When `perModelFailedIndexes` holds a non-empty set for a variant,
 * that variant only runs the records at those positions.
 *
 * @param {object} options - Multi-model options (`name`, `dataset`, `models`,
 *   `task`, `scorers`, `reporters`, `store`, plus run settings).
 * @param {Map<string, Set<number>>} [perModelFailedIndexes] - Optional record
 *   positions to run, keyed by variant name.
 * @returns {Promise<Array>} The `evaluateSingle` results, in `models` order.
 */
async function evaluateEach(options, perModelFailedIndexes) {
  const records = [];
  for await (const record of options.dataset) {
    records.push(record);
  }

  const suite = options.store.createSuite(options.name);

  // Builds and starts the evaluation for one model variant.
  const runVariant = (variant) => {
    const everything = dataset(records);
    const retryIndexes = perModelFailedIndexes?.get(variant.name);
    const hasRetries = Boolean(retryIndexes) && retryIndexes.size > 0;
    const variantDataset = hasRetries ? filterRecordsByIndex(everything, retryIndexes) : everything;
    return evaluateSingle({
      name: `${options.name} [${variant.name}]`,
      model: variant.name,
      dataset: variantDataset,
      task: (input) => options.task(input, variant),
      scorers: options.scorers,
      reporters: options.reporters,
      store: options.store,
      suiteId: suite.id,
      maxConcurrency: options.maxConcurrency,
      timeout: options.timeout,
      trials: options.trials,
      threshold: options.threshold
    });
  };

  return Promise.all(options.models.map((variant) => runVariant(variant)));
}
|
|
974
838
|
export {
|
|
839
|
+
EvalAssertionError,
|
|
840
|
+
EvalBuilder,
|
|
975
841
|
evaluate
|
|
976
842
|
};
|
|
977
843
|
//# sourceMappingURL=index.js.map
|