@deepagents/evals 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +218 -0
  2. package/dist/comparison/index.d.ts +41 -0
  3. package/dist/comparison/index.d.ts.map +1 -0
  4. package/dist/comparison/index.js +106 -0
  5. package/dist/comparison/index.js.map +7 -0
  6. package/dist/dataset/hf.d.ts +16 -0
  7. package/dist/dataset/hf.d.ts.map +1 -0
  8. package/dist/dataset/index.d.ts +17 -0
  9. package/dist/dataset/index.d.ts.map +1 -0
  10. package/dist/dataset/index.js +256 -0
  11. package/dist/dataset/index.js.map +7 -0
  12. package/dist/engine/index.d.ts +67 -0
  13. package/dist/engine/index.d.ts.map +1 -0
  14. package/dist/engine/index.js +332 -0
  15. package/dist/engine/index.js.map +7 -0
  16. package/dist/evaluate/index.d.ts +47 -0
  17. package/dist/evaluate/index.d.ts.map +1 -0
  18. package/dist/evaluate/index.js +977 -0
  19. package/dist/evaluate/index.js.map +7 -0
  20. package/dist/index.d.ts +15 -0
  21. package/dist/index.d.ts.map +1 -0
  22. package/dist/index.js +1763 -0
  23. package/dist/index.js.map +7 -0
  24. package/dist/reporters/console.d.ts +6 -0
  25. package/dist/reporters/console.d.ts.map +1 -0
  26. package/dist/reporters/csv.d.ts +6 -0
  27. package/dist/reporters/csv.d.ts.map +1 -0
  28. package/dist/reporters/format.d.ts +12 -0
  29. package/dist/reporters/format.d.ts.map +1 -0
  30. package/dist/reporters/html.d.ts +6 -0
  31. package/dist/reporters/html.d.ts.map +1 -0
  32. package/dist/reporters/index.d.ts +12 -0
  33. package/dist/reporters/index.d.ts.map +1 -0
  34. package/dist/reporters/index.js +447 -0
  35. package/dist/reporters/index.js.map +7 -0
  36. package/dist/reporters/json.d.ts +7 -0
  37. package/dist/reporters/json.d.ts.map +1 -0
  38. package/dist/reporters/markdown.d.ts +6 -0
  39. package/dist/reporters/markdown.d.ts.map +1 -0
  40. package/dist/reporters/shared.d.ts +11 -0
  41. package/dist/reporters/shared.d.ts.map +1 -0
  42. package/dist/reporters/types.d.ts +35 -0
  43. package/dist/reporters/types.d.ts.map +1 -0
  44. package/dist/scorers/index.d.ts +30 -0
  45. package/dist/scorers/index.d.ts.map +1 -0
  46. package/dist/scorers/index.js +175 -0
  47. package/dist/scorers/index.js.map +7 -0
  48. package/dist/store/index.d.ts +103 -0
  49. package/dist/store/index.d.ts.map +1 -0
  50. package/dist/store/index.js +361 -0
  51. package/dist/store/index.js.map +7 -0
  52. package/package.json +99 -0
package/README.md ADDED
@@ -0,0 +1,218 @@
1
+ # @deepagents/evals
2
+
3
+ A general-purpose LLM evaluation framework with dataset loading, scoring, run persistence, model comparison, and console reporting.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install @deepagents/evals
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```typescript
14
+ import { dataset, evaluate, exactMatch } from '@deepagents/evals';
15
+
16
+ const summary = await evaluate({
17
+ name: 'my-eval',
18
+ model: 'gpt-4o',
19
+ dataset: dataset([
20
+ { input: 'What is 2+2?', expected: '4' },
21
+ { input: 'What is 3+3?', expected: '6' },
22
+ ]),
23
+ task: async (item) => {
24
+ const response = await callMyLLM(item.input);
25
+ return { output: response };
26
+ },
27
+ scorers: { exact: exactMatch },
28
+ });
29
+ ```
30
+
31
+ ## Modules
32
+
33
+ The package is organized into subpath exports for granular imports:
34
+
35
+ | Import | Description |
36
+ | ------------------------------ | --------------------------------------- |
37
+ | `@deepagents/evals` | Top-level convenience API (`evaluate`) |
38
+ | `@deepagents/evals/dataset` | Dataset loading and transforms |
39
+ | `@deepagents/evals/scorers` | Scorer functions and combinators |
40
+ | `@deepagents/evals/store` | SQLite run persistence |
41
+ | `@deepagents/evals/engine` | Eval engine with concurrency and events |
42
+ | `@deepagents/evals/comparison` | Run diffing and regression detection |
43
+ | `@deepagents/evals/reporters` | Console reporter |
44
+
45
+ ## Dataset
46
+
47
+ Load data from inline arrays or local files (JSON, JSONL, CSV):
48
+
49
+ ```typescript
50
+ import { dataset } from '@deepagents/evals/dataset';
51
+
52
+ // Inline array
53
+ const ds = dataset([{ input: 'hello', expected: 'world' }]);
54
+
55
+ // From file
56
+ const dsJson = dataset('./data/questions.json');
57
+ const dsJsonl = dataset('./data/questions.jsonl');
58
+ const dsCsv = dataset('./data/questions.csv');
59
+ ```
60
+
61
+ ### Transforms
62
+
63
+ Chainable, lazy transforms on datasets:
64
+
65
+ ```typescript
66
+ dataset('./large-dataset.jsonl')
67
+ .filter((row) => row.difficulty === 'hard')
68
+ .map((row) => ({ input: row.question, expected: row.answer }))
69
+ .shuffle()
70
+ .limit(100);
71
+ ```
72
+
73
+ | Transform | Behavior |
74
+ | ------------ | -------------------------------------------- |
75
+ | `map(fn)` | Lazy — transforms each element |
76
+ | `filter(fn)` | Lazy — excludes non-matching elements |
77
+ | `limit(n)` | Lazy — caps output at n elements |
78
+ | `shuffle()` | Eager — buffers all, randomizes order |
79
+ | `sample(n)` | Eager — buffers all, picks n random elements |
80
+ | `toArray()` | Consumes into a plain array |
81
+
82
+ ## Scorers
83
+
84
+ All scorers return `{ score: number (0..1), reason?: string }`.
85
+
86
+ ### Deterministic Scorers
87
+
88
+ ```typescript
89
+ import {
90
+ exactMatch,
91
+ includes,
92
+ jsonMatch,
93
+ levenshtein,
94
+ regex,
95
+ } from '@deepagents/evals/scorers';
96
+ ```
97
+
98
+ | Scorer | Description |
99
+ | ---------------- | ----------------------------------- |
100
+ | `exactMatch` | Strict string equality |
101
+ | `includes` | Substring check |
102
+ | `regex(pattern)` | RegExp test |
103
+ | `levenshtein` | Normalized edit distance similarity |
104
+ | `jsonMatch` | Deep JSON structural equality |
105
+
106
+ ### LLM-Based Scorers
107
+
108
+ ```typescript
109
+ import { factuality, llmJudge } from '@deepagents/evals/scorers';
110
+
111
+ const judge = llmJudge({ model: myModel, criteria: 'Is the answer helpful?' });
112
+ const fact = factuality({ model: myModel });
113
+ ```
114
+
115
+ ### Combinators
116
+
117
+ ```typescript
118
+ import { all, any, weighted } from '@deepagents/evals/scorers';
119
+
120
+ // Weakest-link (minimum score)
121
+ const strict = all(exactMatch, includes);
122
+
123
+ // Best-of (maximum score)
124
+ const lenient = any(exactMatch, includes);
125
+
126
+ // Weighted average
127
+ const balanced = weighted({
128
+ accuracy: { scorer: exactMatch, weight: 2 },
129
+ style: { scorer: llmJudge({ model, criteria: '...' }), weight: 1 },
130
+ });
131
+ ```
132
+
133
+ ## Run Store
134
+
135
+ SQLite-backed persistence for runs, cases, and scores:
136
+
137
+ ```typescript
138
+ import { RunStore } from '@deepagents/evals/store';
139
+
140
+ const store = new RunStore('.evals/store.db');
141
+
142
+ // Create a suite for grouping runs
143
+ const suite = store.createSuite('text2sql-accuracy');
144
+
145
+ // Query results
146
+ const runs = store.listRuns(suite.id);
147
+ const failing = store.getFailingCases(runId, 0.5);
148
+ const summary = store.getRunSummary(runId);
149
+ ```
150
+
151
+ ## Engine
152
+
153
+ The engine orchestrates dataset iteration, task execution, scoring, and persistence:
154
+
155
+ ```typescript
156
+ import { EvalEmitter, runEval } from '@deepagents/evals/engine';
157
+
158
+ const emitter = new EvalEmitter();
159
+ emitter.on('case:scored', (data) => console.log(data.index, data.scores));
160
+
161
+ const summary = await runEval({
162
+ name: 'my-eval',
163
+ model: 'gpt-4o',
164
+ dataset: ds,
165
+ task: myTask,
166
+ scorers: { exact: exactMatch },
167
+ store,
168
+ emitter,
169
+ maxConcurrency: 10,
170
+ timeout: 30_000,
171
+ trials: 1,
172
+ threshold: 0.5,
173
+ });
174
+ ```
175
+
176
+ ### Events
177
+
178
+ | Event | Payload | When |
179
+ | ------------- | ------------------------------------- | ----------------------------------------- |
180
+ | `run:start` | `{ runId, totalCases, name, model }` | Run begins |
181
+ | `case:start` | `{ runId, index, input }` | Case execution starts |
182
+ | `case:scored` | `{ runId, index, scores, latencyMs }` | Case scored (always fires, even on error) |
183
+ | `case:error` | `{ runId, index, error }` | Task threw an error |
184
+ | `run:end` | `{ runId, summary }` | All cases complete |
185
+
186
+ ## Comparison
187
+
188
+ Compare two runs case-by-case to detect improvements and regressions:
189
+
190
+ ```typescript
191
+ import { compareRuns } from '@deepagents/evals/comparison';
192
+
193
+ const result = compareRuns(store, baselineRunId, candidateRunId, {
194
+ tolerance: 0.01,
195
+ regressionThreshold: 0.05,
196
+ });
197
+
198
+ console.log(result.regression.regressed); // true if any scorer regressed
199
+ console.log(result.scorerSummaries); // per-scorer mean deltas and counts
200
+ console.log(result.costDelta); // latency and token differences
201
+ ```
202
+
203
+ ## Console Reporter
204
+
205
+ Subscribe to engine events for terminal output:
206
+
207
+ ```typescript
208
+ import { consoleReporter } from '@deepagents/evals/reporters';
209
+
210
+ consoleReporter(emitter, {
211
+ verbosity: 'normal', // 'quiet' | 'normal' | 'verbose'
212
+ threshold: 0.5,
213
+ });
214
+ ```
215
+
216
+ ## License
217
+
218
+ MIT
@@ -0,0 +1,41 @@
1
+ import type { RunStore } from '../store/index.ts';
2
+ export type ChangeType = 'improved' | 'regressed' | 'unchanged';
3
+ export interface CaseDiff {
4
+ index: number;
5
+ scorerDeltas: Record<string, {
6
+ baseline: number;
7
+ candidate: number;
8
+ delta: number;
9
+ change: ChangeType;
10
+ }>;
11
+ }
12
+ export interface ScorerSummary {
13
+ meanDelta: number;
14
+ improvedCount: number;
15
+ regressedCount: number;
16
+ unchangedCount: number;
17
+ }
18
+ export interface CostDelta {
19
+ latencyDeltaMs: number;
20
+ tokenInDelta: number;
21
+ tokenOutDelta: number;
22
+ }
23
+ export interface ComparisonResult {
24
+ caseDiffs: CaseDiff[];
25
+ scorerSummaries: Record<string, ScorerSummary>;
26
+ costDelta: CostDelta;
27
+ totalCasesCompared: number;
28
+ regression: {
29
+ regressed: boolean;
30
+ details: Record<string, {
31
+ meanDelta: number;
32
+ exceeds: boolean;
33
+ }>;
34
+ };
35
+ }
36
+ export interface CompareOptions {
37
+ tolerance?: number;
38
+ regressionThreshold?: number;
39
+ }
40
+ export declare function compareRuns(store: RunStore, baselineRunId: string, candidateRunId: string, options?: CompareOptions): ComparisonResult;
41
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/comparison/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAkB,QAAQ,EAAc,MAAM,mBAAmB,CAAC;AAE9E,MAAM,MAAM,UAAU,GAAG,UAAU,GAAG,WAAW,GAAG,WAAW,CAAC;AAEhE,MAAM,WAAW,QAAQ;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,EAAE,MAAM,CAClB,MAAM,EACN;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,UAAU,CAAA;KAAE,CAC3E,CAAC;CACH;AAED,MAAM,WAAW,aAAa;IAC5B,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,WAAW,SAAS;IACxB,cAAc,EAAE,MAAM,CAAC;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,gBAAgB;IAC/B,SAAS,EAAE,QAAQ,EAAE,CAAC;IACtB,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,aAAa,CAAC,CAAC;IAC/C,SAAS,EAAE,SAAS,CAAC;IACrB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,UAAU,EAAE;QACV,SAAS,EAAE,OAAO,CAAC;QACnB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE;YAAE,SAAS,EAAE,MAAM,CAAC;YAAC,OAAO,EAAE,OAAO,CAAA;SAAE,CAAC,CAAC;KAClE,CAAC;CACH;AAED,MAAM,WAAW,cAAc;IAC7B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AA+BD,wBAAgB,WAAW,CACzB,KAAK,EAAE,QAAQ,EACf,aAAa,EAAE,MAAM,EACrB,cAAc,EAAE,MAAM,EACtB,OAAO,CAAC,EAAE,cAAc,GACvB,gBAAgB,CA0GlB"}
@@ -0,0 +1,106 @@
1
+ // packages/evals/src/comparison/index.ts
2
/**
 * Classify a score delta relative to a tolerance band.
 *
 * @param {number} delta - candidate score minus baseline score.
 * @param {number} tolerance - half-width of the "unchanged" band.
 * @returns {"improved"|"regressed"|"unchanged"}
 */
function categorize(delta, tolerance) {
  // Outside the tolerance band: sign of the delta decides the direction.
  if (!(Math.abs(delta) <= tolerance)) {
    return delta > 0 ? "improved" : "regressed";
  }
  return "unchanged";
}
6
/**
 * Index a list of scored cases by their case index.
 *
 * @param {Array<{idx: number, scores: Array<{scorer_name: string, score: number}>}>} cases
 * @returns {Map<number, Record<string, number>>} case idx -> { scorerName: score }.
 */
function buildScoreMap(cases) {
  const byIdx = new Map();
  for (const entry of cases) {
    byIdx.set(
      entry.idx,
      Object.fromEntries(entry.scores.map((s) => [s.scorer_name, s.score]))
    );
  }
  return byIdx;
}
17
/**
 * Fetch every case of a run, attaching its scores when available.
 *
 * Uses getFailingCases with an Infinity threshold to retrieve the scored
 * variant of every case; cases without a scored record get an empty scores
 * array so downstream code can iterate uniformly.
 *
 * @param {RunStore} store
 * @param {string} runId
 * @returns {CaseWithScores[]}
 */
function getAllCasesWithScores(store, runId) {
  const allCases = store.getCases(runId);
  // Infinity threshold => "failing" filter matches every scored case.
  const scored = new Map();
  for (const c of store.getFailingCases(runId, Infinity)) {
    scored.set(c.id, c);
  }
  return allCases.map((c) => scored.get(c.id) ?? { ...c, scores: [] });
}
23
/**
 * Compare two persisted runs case-by-case.
 *
 * @param {RunStore} store - Store holding both runs.
 * @param {string} baselineRunId - Reference run.
 * @param {string} candidateRunId - Run evaluated against the baseline.
 * @param {CompareOptions} [options] - `tolerance` (default 0.01): delta band
 *   counted as unchanged; `regressionThreshold` (default 0.05): mean-delta
 *   drop that flags a scorer as regressed.
 * @returns {ComparisonResult} per-case diffs, per-scorer summaries, cost
 *   deltas, and an overall regression verdict.
 */
function compareRuns(store, baselineRunId, candidateRunId, options) {
  const tolerance = options?.tolerance ?? 0.01;
  const regressionThreshold = options?.regressionThreshold ?? 0.05;

  const baseCases = getAllCasesWithScores(store, baselineRunId);
  const candCases = getAllCasesWithScores(store, candidateRunId);
  if (baseCases.length !== candCases.length) {
    console.warn(
      `Run case count mismatch: baseline=${baseCases.length}, candidate=${candCases.length}. Comparing intersection only.`
    );
  }

  const baseByIdx = buildScoreMap(baseCases);
  const candByIdx = buildScoreMap(candCases);

  // Union of scorer names seen in either run.
  const scorerNames = new Set();
  for (const perCase of baseByIdx.values()) {
    for (const name of Object.keys(perCase)) scorerNames.add(name);
  }
  for (const perCase of candByIdx.values()) {
    for (const name of Object.keys(perCase)) scorerNames.add(name);
  }

  // Only case indices present in both runs are compared, in ascending order.
  const sharedIndices = [...baseByIdx.keys()]
    .filter((idx) => candByIdx.has(idx))
    .sort((a, b) => a - b);

  const deltasByScorer = {};
  const countsByScorer = {};
  for (const name of scorerNames) {
    deltasByScorer[name] = [];
    countsByScorer[name] = { improved: 0, regressed: 0, unchanged: 0 };
  }

  const caseDiffs = sharedIndices.map((idx) => {
    const base = baseByIdx.get(idx);
    const cand = candByIdx.get(idx);
    const perScorer = {};
    for (const name of scorerNames) {
      // A scorer missing on one side counts as 0 for that side.
      const baseline = base[name] ?? 0;
      const candidate = cand[name] ?? 0;
      const delta = candidate - baseline;
      const change = categorize(delta, tolerance);
      perScorer[name] = { baseline, candidate, delta, change };
      deltasByScorer[name].push(delta);
      countsByScorer[name][change]++;
    }
    return { index: idx, scorerDeltas: perScorer };
  });

  const scorerSummaries = {};
  for (const name of scorerNames) {
    const deltas = deltasByScorer[name];
    scorerSummaries[name] = {
      meanDelta:
        deltas.length > 0
          ? deltas.reduce((acc, d) => acc + d, 0) / deltas.length
          : 0,
      improvedCount: countsByScorer[name].improved,
      regressedCount: countsByScorer[name].regressed,
      unchangedCount: countsByScorer[name].unchanged
    };
  }

  const baseSummary = store.getRunSummary(baselineRunId);
  const candSummary = store.getRunSummary(candidateRunId);
  const costDelta = {
    latencyDeltaMs: candSummary.totalLatencyMs - baseSummary.totalLatencyMs,
    tokenInDelta: candSummary.totalTokensIn - baseSummary.totalTokensIn,
    tokenOutDelta: candSummary.totalTokensOut - baseSummary.totalTokensOut
  };

  // A scorer regresses when its mean delta drops past the threshold.
  let anyRegressed = false;
  const details = {};
  for (const [name, summary] of Object.entries(scorerSummaries)) {
    const exceeds = summary.meanDelta < -regressionThreshold;
    details[name] = { meanDelta: summary.meanDelta, exceeds };
    if (exceeds) anyRegressed = true;
  }

  return {
    caseDiffs,
    scorerSummaries,
    costDelta,
    totalCasesCompared: sharedIndices.length,
    regression: { regressed: anyRegressed, details }
  };
}
103
+ export {
104
+ compareRuns
105
+ };
106
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,7 @@
1
+ {
2
+ "version": 3,
3
+ "sources": ["../../src/comparison/index.ts"],
4
+ "sourcesContent": ["import type { CaseWithScores, RunStore, RunSummary } from '../store/index.ts';\n\nexport type ChangeType = 'improved' | 'regressed' | 'unchanged';\n\nexport interface CaseDiff {\n index: number;\n scorerDeltas: Record<\n string,\n { baseline: number; candidate: number; delta: number; change: ChangeType }\n >;\n}\n\nexport interface ScorerSummary {\n meanDelta: number;\n improvedCount: number;\n regressedCount: number;\n unchangedCount: number;\n}\n\nexport interface CostDelta {\n latencyDeltaMs: number;\n tokenInDelta: number;\n tokenOutDelta: number;\n}\n\nexport interface ComparisonResult {\n caseDiffs: CaseDiff[];\n scorerSummaries: Record<string, ScorerSummary>;\n costDelta: CostDelta;\n totalCasesCompared: number;\n regression: {\n regressed: boolean;\n details: Record<string, { meanDelta: number; exceeds: boolean }>;\n };\n}\n\nexport interface CompareOptions {\n tolerance?: number;\n regressionThreshold?: number;\n}\n\nfunction categorize(delta: number, tolerance: number): ChangeType {\n if (Math.abs(delta) <= tolerance) return 'unchanged';\n return delta > 0 ? 'improved' : 'regressed';\n}\n\nfunction buildScoreMap(\n cases: CaseWithScores[],\n): Map<number, Record<string, number>> {\n const map = new Map<number, Record<string, number>>();\n for (const c of cases) {\n const scores: Record<string, number> = {};\n for (const s of c.scores) {\n scores[s.scorer_name] = s.score;\n }\n map.set(c.idx, scores);\n }\n return map;\n}\n\nfunction getAllCasesWithScores(\n store: RunStore,\n runId: string,\n): CaseWithScores[] {\n const cases = store.getCases(runId);\n const withScores = store.getFailingCases(runId, Infinity);\n const scoredMap = new Map(withScores.map((c) => [c.id, c]));\n return cases.map((c) => scoredMap.get(c.id) ?? 
{ ...c, scores: [] });\n}\n\nexport function compareRuns(\n store: RunStore,\n baselineRunId: string,\n candidateRunId: string,\n options?: CompareOptions,\n): ComparisonResult {\n const tolerance = options?.tolerance ?? 0.01;\n const regressionThreshold = options?.regressionThreshold ?? 0.05;\n\n const baselineCases = getAllCasesWithScores(store, baselineRunId);\n const candidateCases = getAllCasesWithScores(store, candidateRunId);\n\n if (baselineCases.length !== candidateCases.length) {\n console.warn(\n `Run case count mismatch: baseline=${baselineCases.length}, candidate=${candidateCases.length}. Comparing intersection only.`,\n );\n }\n\n const baselineMap = buildScoreMap(baselineCases);\n const candidateMap = buildScoreMap(candidateCases);\n\n const allScorerNames = new Set<string>();\n for (const scores of baselineMap.values()) {\n for (const name of Object.keys(scores)) allScorerNames.add(name);\n }\n for (const scores of candidateMap.values()) {\n for (const name of Object.keys(scores)) allScorerNames.add(name);\n }\n\n const commonIndices = [...baselineMap.keys()].filter((idx) =>\n candidateMap.has(idx),\n );\n commonIndices.sort((a, b) => a - b);\n\n const caseDiffs: CaseDiff[] = [];\n const scorerDeltas: Record<string, number[]> = {};\n const scorerCounts: Record<\n string,\n { improved: number; regressed: number; unchanged: number }\n > = {};\n\n for (const name of allScorerNames) {\n scorerDeltas[name] = [];\n scorerCounts[name] = { improved: 0, regressed: 0, unchanged: 0 };\n }\n\n for (const idx of commonIndices) {\n const baseScores = baselineMap.get(idx)!;\n const candScores = candidateMap.get(idx)!;\n const diff: CaseDiff = { index: idx, scorerDeltas: {} };\n\n for (const name of allScorerNames) {\n const baseline = baseScores[name] ?? 0;\n const candidate = candScores[name] ?? 
0;\n const delta = candidate - baseline;\n const change = categorize(delta, tolerance);\n\n diff.scorerDeltas[name] = { baseline, candidate, delta, change };\n scorerDeltas[name]!.push(delta);\n\n if (change === 'improved') scorerCounts[name]!.improved++;\n else if (change === 'regressed') scorerCounts[name]!.regressed++;\n else scorerCounts[name]!.unchanged++;\n }\n\n caseDiffs.push(diff);\n }\n\n const scorerSummaries: Record<string, ScorerSummary> = {};\n for (const name of allScorerNames) {\n const deltas = scorerDeltas[name]!;\n const meanDelta =\n deltas.length > 0 ? deltas.reduce((a, b) => a + b, 0) / deltas.length : 0;\n scorerSummaries[name] = {\n meanDelta,\n improvedCount: scorerCounts[name]!.improved,\n regressedCount: scorerCounts[name]!.regressed,\n unchangedCount: scorerCounts[name]!.unchanged,\n };\n }\n\n const baselineSummary = store.getRunSummary(baselineRunId);\n const candidateSummary = store.getRunSummary(candidateRunId);\n\n const costDelta: CostDelta = {\n latencyDeltaMs:\n candidateSummary.totalLatencyMs - baselineSummary.totalLatencyMs,\n tokenInDelta:\n candidateSummary.totalTokensIn - baselineSummary.totalTokensIn,\n tokenOutDelta:\n candidateSummary.totalTokensOut - baselineSummary.totalTokensOut,\n };\n\n const regressionDetails: Record<\n string,\n { meanDelta: number; exceeds: boolean }\n > = {};\n let anyRegressed = false;\n for (const [name, summary] of Object.entries(scorerSummaries)) {\n const exceeds = summary.meanDelta < -regressionThreshold;\n regressionDetails[name] = { meanDelta: summary.meanDelta, exceeds };\n if (exceeds) anyRegressed = true;\n }\n\n return {\n caseDiffs,\n scorerSummaries,\n costDelta,\n totalCasesCompared: commonIndices.length,\n regression: { regressed: anyRegressed, details: regressionDetails },\n };\n}\n"],
5
+ "mappings": ";AAyCA,SAAS,WAAW,OAAe,WAA+B;AAChE,MAAI,KAAK,IAAI,KAAK,KAAK,UAAW,QAAO;AACzC,SAAO,QAAQ,IAAI,aAAa;AAClC;AAEA,SAAS,cACP,OACqC;AACrC,QAAM,MAAM,oBAAI,IAAoC;AACpD,aAAW,KAAK,OAAO;AACrB,UAAM,SAAiC,CAAC;AACxC,eAAW,KAAK,EAAE,QAAQ;AACxB,aAAO,EAAE,WAAW,IAAI,EAAE;AAAA,IAC5B;AACA,QAAI,IAAI,EAAE,KAAK,MAAM;AAAA,EACvB;AACA,SAAO;AACT;AAEA,SAAS,sBACP,OACA,OACkB;AAClB,QAAM,QAAQ,MAAM,SAAS,KAAK;AAClC,QAAM,aAAa,MAAM,gBAAgB,OAAO,QAAQ;AACxD,QAAM,YAAY,IAAI,IAAI,WAAW,IAAI,CAAC,MAAM,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;AAC1D,SAAO,MAAM,IAAI,CAAC,MAAM,UAAU,IAAI,EAAE,EAAE,KAAK,EAAE,GAAG,GAAG,QAAQ,CAAC,EAAE,CAAC;AACrE;AAEO,SAAS,YACd,OACA,eACA,gBACA,SACkB;AAClB,QAAM,YAAY,SAAS,aAAa;AACxC,QAAM,sBAAsB,SAAS,uBAAuB;AAE5D,QAAM,gBAAgB,sBAAsB,OAAO,aAAa;AAChE,QAAM,iBAAiB,sBAAsB,OAAO,cAAc;AAElE,MAAI,cAAc,WAAW,eAAe,QAAQ;AAClD,YAAQ;AAAA,MACN,qCAAqC,cAAc,MAAM,eAAe,eAAe,MAAM;AAAA,IAC/F;AAAA,EACF;AAEA,QAAM,cAAc,cAAc,aAAa;AAC/C,QAAM,eAAe,cAAc,cAAc;AAEjD,QAAM,iBAAiB,oBAAI,IAAY;AACvC,aAAW,UAAU,YAAY,OAAO,GAAG;AACzC,eAAW,QAAQ,OAAO,KAAK,MAAM,EAAG,gBAAe,IAAI,IAAI;AAAA,EACjE;AACA,aAAW,UAAU,aAAa,OAAO,GAAG;AAC1C,eAAW,QAAQ,OAAO,KAAK,MAAM,EAAG,gBAAe,IAAI,IAAI;AAAA,EACjE;AAEA,QAAM,gBAAgB,CAAC,GAAG,YAAY,KAAK,CAAC,EAAE;AAAA,IAAO,CAAC,QACpD,aAAa,IAAI,GAAG;AAAA,EACtB;AACA,gBAAc,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAElC,QAAM,YAAwB,CAAC;AAC/B,QAAM,eAAyC,CAAC;AAChD,QAAM,eAGF,CAAC;AAEL,aAAW,QAAQ,gBAAgB;AACjC,iBAAa,IAAI,IAAI,CAAC;AACtB,iBAAa,IAAI,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,WAAW,EAAE;AAAA,EACjE;AAEA,aAAW,OAAO,eAAe;AAC/B,UAAM,aAAa,YAAY,IAAI,GAAG;AACtC,UAAM,aAAa,aAAa,IAAI,GAAG;AACvC,UAAM,OAAiB,EAAE,OAAO,KAAK,cAAc,CAAC,EAAE;AAEtD,eAAW,QAAQ,gBAAgB;AACjC,YAAM,WAAW,WAAW,IAAI,KAAK;AACrC,YAAM,YAAY,WAAW,IAAI,KAAK;AACtC,YAAM,QAAQ,YAAY;AAC1B,YAAM,SAAS,WAAW,OAAO,SAAS;AAE1C,WAAK,aAAa,IAAI,IAAI,EAAE,UAAU,WAAW,OAAO,OAAO;AAC/D,mBAAa,IAAI,EAAG,KAAK,KAAK;AAE9B,UAAI,WAAW,WAAY,cAAa,IAAI,EAAG;AAAA,eACtC,WAAW,YAAa,cAAa,IAAI,EAAG;AAAA,UAChD,cAAa,IAAI,EAAG;AAAA,IAC3B;AAEA,cAAU,KAAK,IAAI;AAAA,EACrB;AAEA,QAAM,kBAAiD,CAAC;AACxD,aAAW,QAAQ,gBAAgB;AACjC,UAAM,SA
AS,aAAa,IAAI;AAChC,UAAM,YACJ,OAAO,SAAS,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO,SAAS;AAC1E,oBAAgB,IAAI,IAAI;AAAA,MACtB;AAAA,MACA,eAAe,aAAa,IAAI,EAAG;AAAA,MACnC,gBAAgB,aAAa,IAAI,EAAG;AAAA,MACpC,gBAAgB,aAAa,IAAI,EAAG;AAAA,IACtC;AAAA,EACF;AAEA,QAAM,kBAAkB,MAAM,cAAc,aAAa;AACzD,QAAM,mBAAmB,MAAM,cAAc,cAAc;AAE3D,QAAM,YAAuB;AAAA,IAC3B,gBACE,iBAAiB,iBAAiB,gBAAgB;AAAA,IACpD,cACE,iBAAiB,gBAAgB,gBAAgB;AAAA,IACnD,eACE,iBAAiB,iBAAiB,gBAAgB;AAAA,EACtD;AAEA,QAAM,oBAGF,CAAC;AACL,MAAI,eAAe;AACnB,aAAW,CAAC,MAAM,OAAO,KAAK,OAAO,QAAQ,eAAe,GAAG;AAC7D,UAAM,UAAU,QAAQ,YAAY,CAAC;AACrC,sBAAkB,IAAI,IAAI,EAAE,WAAW,QAAQ,WAAW,QAAQ;AAClE,QAAI,QAAS,gBAAe;AAAA,EAC9B;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA,oBAAoB,cAAc;AAAA,IAClC,YAAY,EAAE,WAAW,cAAc,SAAS,kBAAkB;AAAA,EACpE;AACF;",
6
+ "names": []
7
+ }
@@ -0,0 +1,16 @@
1
+ export interface HfOptions {
2
+ dataset: string;
3
+ config: string;
4
+ split: string;
5
+ rows?: number;
6
+ }
7
+ export declare function hf<T = Record<string, unknown>>(options: HfOptions): AsyncIterable<T>;
8
+ export declare function fetchHfRows(options: {
9
+ dataset: string;
10
+ config: string;
11
+ split: string;
12
+ }, offset: number, length: number): Promise<{
13
+ rows: Record<string, unknown>[];
14
+ total: number;
15
+ }>;
16
+ //# sourceMappingURL=hf.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hf.d.ts","sourceRoot":"","sources":["../../src/dataset/hf.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,SAAS;IACxB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAUD,wBAAgB,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAC5C,OAAO,EAAE,SAAS,GACjB,aAAa,CAAC,CAAC,CAAC,CAMlB;AA2CD,wBAAsB,WAAW,CAC/B,OAAO,EAAE;IAAE,OAAO,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,EAC3D,MAAM,EAAE,MAAM,EACd,MAAM,EAAE,MAAM,GACb,OAAO,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,CAa7D"}
@@ -0,0 +1,17 @@
1
+ export { hf } from './hf.ts';
2
+ export type { HfOptions } from './hf.ts';
3
+ export type TransformFn<T, U> = (item: T) => U;
4
+ export type PredicateFn<T> = (item: T) => boolean;
5
+ export declare class Dataset<T> implements AsyncIterable<T> {
6
+ #private;
7
+ constructor(source: () => AsyncIterable<T>);
8
+ map<U>(fn: TransformFn<T, U>): Dataset<U>;
9
+ filter(fn: PredicateFn<T>): Dataset<T>;
10
+ limit(n: number): Dataset<T>;
11
+ shuffle(): Dataset<T>;
12
+ sample(n: number): Dataset<T>;
13
+ toArray(): Promise<T[]>;
14
+ [Symbol.asyncIterator](): AsyncIterator<T>;
15
+ }
16
+ export declare function dataset<T>(source: T[] | string | AsyncIterable<T>): Dataset<T>;
17
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/dataset/index.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,EAAE,EAAE,MAAM,SAAS,CAAC;AAC7B,YAAY,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAEzC,MAAM,MAAM,WAAW,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC;AAC/C,MAAM,MAAM,WAAW,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,OAAO,CAAC;AAElD,qBAAa,OAAO,CAAC,CAAC,CAAE,YAAW,aAAa,CAAC,CAAC,CAAC;;gBAGrC,MAAM,EAAE,MAAM,aAAa,CAAC,CAAC,CAAC;IAI1C,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;IASzC,MAAM,CAAC,EAAE,EAAE,WAAW,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;IAStC,KAAK,CAAC,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC;IAY5B,OAAO,IAAI,OAAO,CAAC,CAAC,CAAC;IAiBrB,MAAM,CAAC,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC;IAoBvB,OAAO,IAAI,OAAO,CAAC,CAAC,EAAE,CAAC;IAQ7B,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,aAAa,CAAC,CAAC,CAAC;CAG3C;AA+FD,wBAAgB,OAAO,CAAC,CAAC,EACvB,MAAM,EAAE,CAAC,EAAE,GAAG,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,GACtC,OAAO,CAAC,CAAC,CAAC,CAwBZ"}
@@ -0,0 +1,256 @@
1
+ // packages/evals/src/dataset/index.ts
2
+ import { createReadStream } from "node:fs";
3
+ import { readFile } from "node:fs/promises";
4
+ import { extname } from "node:path";
5
+ import { createInterface } from "node:readline";
6
+
7
+ // packages/evals/src/dataset/hf.ts
8
+ // Rows endpoint of the HuggingFace datasets-server API.
+ var HF_BASE_URL = "https://datasets-server.huggingface.co/rows";
9
+ // Number of rows requested per paginated API call.
+ var PAGE_SIZE = 100;
10
/**
 * Wrap a HuggingFace dataset query as an async iterable of rows.
 *
 * Each `for await` over the result starts a fresh pagination pass.
 *
 * @param {HfOptions} options - dataset, config, split, and optional row cap.
 * @returns {AsyncIterable<object>}
 */
function hf(options) {
  return {
    [Symbol.asyncIterator]: () => paginate(options)
  };
}
17
/**
 * Stream rows from the datasets-server, page by page.
 *
 * Stops when the requested row cap is reached, a page comes back empty or
 * short, or the reported total row count is exhausted.
 *
 * @param {HfOptions} options
 * @yields {object} one dataset row per iteration.
 */
async function* paginate(options) {
  const { dataset: dataset2, config, split, rows } = options;
  const maxRows = rows ?? Infinity;
  let offset = 0;
  let emitted = 0;
  while (emitted < maxRows) {
    // Never request more than the remaining cap (when one exists).
    const want =
      maxRows === Infinity ? PAGE_SIZE : Math.min(PAGE_SIZE, maxRows - emitted);
    const page = await fetchPage(buildUrl(dataset2, config, split, offset, want));
    if (page.rows.length === 0) return;
    for (const entry of page.rows) {
      yield entry.row;
      emitted += 1;
      if (emitted >= maxRows) return;
    }
    offset += page.rows.length;
    // A short page or reaching the server-reported total means we're done.
    if (page.rows.length < want || offset >= page.num_rows_total) return;
  }
}
36
/**
 * Build a datasets-server /rows request URL.
 *
 * @param {string} dataset2 - dataset repo name.
 * @param {string} config - dataset configuration.
 * @param {string} split - dataset split.
 * @param {number} offset - first row index.
 * @param {number} length - number of rows requested.
 * @returns {string} fully encoded URL string.
 */
function buildUrl(dataset2, config, split, offset, length) {
  const endpoint = new URL(HF_BASE_URL);
  const params = {
    dataset: dataset2,
    config,
    split,
    offset: String(offset),
    length: String(length)
  };
  for (const [key, value] of Object.entries(params)) {
    endpoint.searchParams.set(key, value);
  }
  return endpoint.toString();
}
45
/**
 * GET one page from the datasets-server and parse it as JSON.
 *
 * @param {string} url - fully built /rows URL.
 * @returns {Promise<object>} parsed page payload.
 * @throws {Error} on a non-OK HTTP status (body or statusText included) or
 *   when the body is not valid JSON (first 200 chars included).
 */
async function fetchPage(url) {
  const response = await fetch(url);
  if (!response.ok) {
    // Body read is best-effort; fall back to the status text.
    const body = await response.text().catch(() => "");
    throw new Error(
      `HuggingFace API error ${response.status}: ${body || response.statusText}`
    );
  }
  const payload = await response.text();
  try {
    return JSON.parse(payload);
  } catch {
    throw new Error(
      `HuggingFace API returned non-JSON response from ${url}: ${payload.slice(0, 200)}`
    );
  }
}
62
+
63
+ // packages/evals/src/dataset/index.ts
64
/**
 * Lazily transformable async-iterable dataset.
 *
 * Wraps a thunk that produces a fresh async iterable on every call, so one
 * Dataset instance can be consumed multiple times. map/filter/limit are lazy;
 * shuffle/sample buffer the whole stream first.
 */
var Dataset = class Dataset {
  // Thunk producing a new async iterable per consumption.
  #src;
  constructor(source) {
    this.#src = source;
  }
  /** Lazily transform each element with fn. */
  map(fn) {
    const src = this.#src;
    return new Dataset(async function* () {
      for await (const value of src()) {
        yield fn(value);
      }
    });
  }
  /** Lazily keep only elements for which fn returns true. */
  filter(fn) {
    const src = this.#src;
    return new Dataset(async function* () {
      for await (const value of src()) {
        if (fn(value)) yield value;
      }
    });
  }
  /** Lazily cap the stream at n elements (checked before each yield). */
  limit(n) {
    const src = this.#src;
    return new Dataset(async function* () {
      let taken = 0;
      for await (const value of src()) {
        if (taken >= n) return;
        yield value;
        taken += 1;
      }
    });
  }
  /** Eager: buffer everything, then yield in Fisher-Yates random order. */
  shuffle() {
    const src = this.#src;
    return new Dataset(async function* () {
      const buffer = [];
      for await (const value of src()) {
        buffer.push(value);
      }
      for (let i = buffer.length - 1; i > 0; i--) {
        const j = Math.floor(Math.random() * (i + 1));
        [buffer[i], buffer[j]] = [buffer[j], buffer[i]];
      }
      yield* buffer;
    });
  }
  /** Eager: buffer everything, then yield n uniformly sampled elements. */
  sample(n) {
    const src = this.#src;
    return new Dataset(async function* () {
      const buffer = [];
      for await (const value of src()) {
        buffer.push(value);
      }
      const take = Math.min(Math.max(0, n), buffer.length);
      // Partial Fisher-Yates: randomize only the last `take` slots, each
      // drawing uniformly from the untouched prefix plus itself.
      for (let i = buffer.length - 1; i >= buffer.length - take; i--) {
        const j = Math.floor(Math.random() * (i + 1));
        [buffer[i], buffer[j]] = [buffer[j], buffer[i]];
      }
      yield* buffer.slice(buffer.length - take);
    });
  }
  /** Consume the dataset into a plain array. */
  async toArray() {
    const out = [];
    for await (const value of this.#src()) {
      out.push(value);
    }
    return out;
  }
  [Symbol.asyncIterator]() {
    return this.#src()[Symbol.asyncIterator]();
  }
};
142
/**
 * Split one CSV line into fields.
 *
 * Supports double-quoted fields (quotes must open at the start of a field)
 * and `""` as an escaped quote inside a quoted field. A quote appearing after
 * other field content is kept literally.
 *
 * @param {string} line - a single line with no trailing newline.
 * @returns {string[]} field values, including empty ones.
 */
function parseCSVLine(line) {
  const out = [];
  let field = "";
  let quoted = false;
  let i = 0;
  while (i < line.length) {
    const ch = line[i];
    if (quoted) {
      if (ch !== '"') {
        field += ch;
      } else if (line[i + 1] === '"') {
        // Doubled quote inside a quoted field escapes a literal quote.
        field += '"';
        i += 1;
      } else {
        quoted = false;
      }
    } else if (ch === '"' && field === "") {
      // Quoting only begins when the field buffer is still empty.
      quoted = true;
    } else if (ch === ",") {
      out.push(field);
      field = "";
    } else {
      field += ch;
    }
    i += 1;
  }
  out.push(field);
  return out;
}
173
/**
 * Build a source thunk that reads a JSON array file and yields its elements.
 *
 * @param {string} filePath - path to a file containing a top-level JSON array.
 * @returns {() => AsyncGenerator} source suitable for a Dataset.
 * @throws {Error} (on iteration) when the file's top-level value is not an array.
 */
function loadJSON(filePath) {
  return async function* () {
    const raw = await readFile(filePath, "utf-8");
    const parsed = JSON.parse(raw);
    if (Array.isArray(parsed)) {
      yield* parsed;
      return;
    }
    throw new Error(`JSON file "${filePath}" does not contain an array`);
  };
}
183
/**
 * Build a source thunk that streams a JSONL file, one parsed object per
 * non-blank line.
 *
 * @param {string} filePath - path to a newline-delimited JSON file.
 * @returns {() => AsyncGenerator} source suitable for a Dataset.
 */
function loadJSONL(filePath) {
  return async function* () {
    const reader = createInterface({
      input: createReadStream(filePath, "utf-8"),
      crlfDelay: Infinity
    });
    try {
      for await (const raw of reader) {
        const line = raw.trim();
        // Blank lines are skipped rather than treated as parse errors.
        if (line) {
          yield JSON.parse(line);
        }
      }
    } finally {
      reader.close();
    }
  };
}
201
/**
 * Build a source thunk that streams a CSV file as row objects.
 *
 * The first non-blank line supplies the column names; later rows are yielded
 * as { header: value } objects, with missing trailing fields set to "".
 *
 * @param {string} filePath - path to a CSV file.
 * @returns {() => AsyncGenerator<Record<string, string>>}
 */
function loadCSV(filePath) {
  return async function* () {
    const reader = createInterface({
      input: createReadStream(filePath, "utf-8"),
      crlfDelay: Infinity
    });
    try {
      let headers;
      for await (const raw of reader) {
        const line = raw.trim();
        if (!line) continue;
        const cells = parseCSVLine(line);
        if (headers === undefined) {
          // First data-bearing line names the columns.
          headers = cells;
          continue;
        }
        const record = {};
        for (const [i, header] of headers.entries()) {
          record[header] = cells[i] ?? "";
        }
        yield record;
      }
    } finally {
      reader.close();
    }
  };
}
228
/**
 * Create a Dataset from an inline array, an async iterable, or a file path.
 *
 * File loading is chosen by extension: .json, .jsonl, or .csv
 * (case-insensitive).
 *
 * @param {Array|string|AsyncIterable} source
 * @returns {Dataset}
 * @throws {Error} when a file path has an unsupported extension.
 */
function dataset(source) {
  if (Array.isArray(source)) {
    return new Dataset(async function* () {
      yield* source;
    });
  }
  if (typeof source === "object" && Symbol.asyncIterator in source) {
    // Re-consuming the Dataset re-consumes the underlying iterable.
    return new Dataset(() => source);
  }
  const ext = extname(source).toLowerCase();
  const loaders = {
    ".json": loadJSON,
    ".jsonl": loadJSONL,
    ".csv": loadCSV
  };
  const loader = loaders[ext];
  if (loader === undefined) {
    throw new Error(
      `Unsupported file extension "${ext}" for dataset file "${source}". Supported: .json, .jsonl, .csv`
    );
  }
  return new Dataset(loader(source));
}
251
+ export {
252
+ Dataset,
253
+ dataset,
254
+ hf
255
+ };
256
+ //# sourceMappingURL=index.js.map