@deepagents/evals 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +218 -0
- package/dist/comparison/index.d.ts +41 -0
- package/dist/comparison/index.d.ts.map +1 -0
- package/dist/comparison/index.js +106 -0
- package/dist/comparison/index.js.map +7 -0
- package/dist/dataset/hf.d.ts +16 -0
- package/dist/dataset/hf.d.ts.map +1 -0
- package/dist/dataset/index.d.ts +17 -0
- package/dist/dataset/index.d.ts.map +1 -0
- package/dist/dataset/index.js +256 -0
- package/dist/dataset/index.js.map +7 -0
- package/dist/engine/index.d.ts +67 -0
- package/dist/engine/index.d.ts.map +1 -0
- package/dist/engine/index.js +332 -0
- package/dist/engine/index.js.map +7 -0
- package/dist/evaluate/index.d.ts +47 -0
- package/dist/evaluate/index.d.ts.map +1 -0
- package/dist/evaluate/index.js +977 -0
- package/dist/evaluate/index.js.map +7 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1763 -0
- package/dist/index.js.map +7 -0
- package/dist/reporters/console.d.ts +6 -0
- package/dist/reporters/console.d.ts.map +1 -0
- package/dist/reporters/csv.d.ts +6 -0
- package/dist/reporters/csv.d.ts.map +1 -0
- package/dist/reporters/format.d.ts +12 -0
- package/dist/reporters/format.d.ts.map +1 -0
- package/dist/reporters/html.d.ts +6 -0
- package/dist/reporters/html.d.ts.map +1 -0
- package/dist/reporters/index.d.ts +12 -0
- package/dist/reporters/index.d.ts.map +1 -0
- package/dist/reporters/index.js +447 -0
- package/dist/reporters/index.js.map +7 -0
- package/dist/reporters/json.d.ts +7 -0
- package/dist/reporters/json.d.ts.map +1 -0
- package/dist/reporters/markdown.d.ts +6 -0
- package/dist/reporters/markdown.d.ts.map +1 -0
- package/dist/reporters/shared.d.ts +11 -0
- package/dist/reporters/shared.d.ts.map +1 -0
- package/dist/reporters/types.d.ts +35 -0
- package/dist/reporters/types.d.ts.map +1 -0
- package/dist/scorers/index.d.ts +30 -0
- package/dist/scorers/index.d.ts.map +1 -0
- package/dist/scorers/index.js +175 -0
- package/dist/scorers/index.js.map +7 -0
- package/dist/store/index.d.ts +103 -0
- package/dist/store/index.d.ts.map +1 -0
- package/dist/store/index.js +361 -0
- package/dist/store/index.js.map +7 -0
- package/package.json +99 -0
package/README.md
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# @deepagents/evals
|
|
2
|
+
|
|
3
|
+
A general-purpose LLM evaluation framework with dataset loading, scoring, run persistence, model comparison, and console reporting.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @deepagents/evals
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```typescript
|
|
14
|
+
import { dataset, evaluate, exactMatch } from '@deepagents/evals';
|
|
15
|
+
|
|
16
|
+
const summary = await evaluate({
|
|
17
|
+
name: 'my-eval',
|
|
18
|
+
model: 'gpt-4o',
|
|
19
|
+
dataset: dataset([
|
|
20
|
+
{ input: 'What is 2+2?', expected: '4' },
|
|
21
|
+
{ input: 'What is 3+3?', expected: '6' },
|
|
22
|
+
]),
|
|
23
|
+
task: async (item) => {
|
|
24
|
+
const response = await callMyLLM(item.input);
|
|
25
|
+
return { output: response };
|
|
26
|
+
},
|
|
27
|
+
scorers: { exact: exactMatch },
|
|
28
|
+
});
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Modules
|
|
32
|
+
|
|
33
|
+
The package is organized into subpath exports for granular imports:
|
|
34
|
+
|
|
35
|
+
| Import | Description |
|
|
36
|
+
| ------------------------------ | --------------------------------------- |
|
|
37
|
+
| `@deepagents/evals` | Top-level convenience API (`evaluate`) |
|
|
38
|
+
| `@deepagents/evals/dataset` | Dataset loading and transforms |
|
|
39
|
+
| `@deepagents/evals/scorers` | Scorer functions and combinators |
|
|
40
|
+
| `@deepagents/evals/store` | SQLite run persistence |
|
|
41
|
+
| `@deepagents/evals/engine` | Eval engine with concurrency and events |
|
|
42
|
+
| `@deepagents/evals/comparison` | Run diffing and regression detection |
|
|
43
|
+
| `@deepagents/evals/reporters` | Console reporter |
|
|
44
|
+
|
|
45
|
+
## Dataset
|
|
46
|
+
|
|
47
|
+
Load data from inline arrays or local files (JSON, JSONL, CSV):
|
|
48
|
+
|
|
49
|
+
```typescript
|
|
50
|
+
import { dataset } from '@deepagents/evals/dataset';
|
|
51
|
+
|
|
52
|
+
// Inline array
|
|
53
|
+
const ds = dataset([{ input: 'hello', expected: 'world' }]);
|
|
54
|
+
|
|
55
|
+
// From file
|
|
56
|
+
const ds = dataset('./data/questions.json');
|
|
57
|
+
const ds = dataset('./data/questions.jsonl');
|
|
58
|
+
const ds = dataset('./data/questions.csv');
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Transforms
|
|
62
|
+
|
|
63
|
+
Chainable, lazy transforms on datasets:
|
|
64
|
+
|
|
65
|
+
```typescript
|
|
66
|
+
dataset('./large-dataset.jsonl')
|
|
67
|
+
.filter((row) => row.difficulty === 'hard')
|
|
68
|
+
.map((row) => ({ input: row.question, expected: row.answer }))
|
|
69
|
+
.shuffle()
|
|
70
|
+
.limit(100);
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
| Transform | Behavior |
|
|
74
|
+
| ------------ | -------------------------------------------- |
|
|
75
|
+
| `map(fn)` | Lazy — transforms each element |
|
|
76
|
+
| `filter(fn)` | Lazy — excludes non-matching elements |
|
|
77
|
+
| `limit(n)` | Lazy — caps output at n elements |
|
|
78
|
+
| `shuffle()` | Eager — buffers all, randomizes order |
|
|
79
|
+
| `sample(n)` | Eager — buffers all, picks n random elements |
|
|
80
|
+
| `toArray()` | Consumes into a plain array |
|
|
81
|
+
|
|
82
|
+
## Scorers
|
|
83
|
+
|
|
84
|
+
All scorers return `{ score: number (0..1), reason?: string }`.
|
|
85
|
+
|
|
86
|
+
### Deterministic Scorers
|
|
87
|
+
|
|
88
|
+
```typescript
|
|
89
|
+
import {
|
|
90
|
+
exactMatch,
|
|
91
|
+
includes,
|
|
92
|
+
jsonMatch,
|
|
93
|
+
levenshtein,
|
|
94
|
+
regex,
|
|
95
|
+
} from '@deepagents/evals/scorers';
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
| Scorer | Description |
|
|
99
|
+
| ---------------- | ----------------------------------- |
|
|
100
|
+
| `exactMatch` | Strict string equality |
|
|
101
|
+
| `includes` | Substring check |
|
|
102
|
+
| `regex(pattern)` | RegExp test |
|
|
103
|
+
| `levenshtein` | Normalized edit distance similarity |
|
|
104
|
+
| `jsonMatch` | Deep JSON structural equality |
|
|
105
|
+
|
|
106
|
+
### LLM-Based Scorers
|
|
107
|
+
|
|
108
|
+
```typescript
|
|
109
|
+
import { factuality, llmJudge } from '@deepagents/evals/scorers';
|
|
110
|
+
|
|
111
|
+
const judge = llmJudge({ model: myModel, criteria: 'Is the answer helpful?' });
|
|
112
|
+
const fact = factuality({ model: myModel });
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Combinators
|
|
116
|
+
|
|
117
|
+
```typescript
|
|
118
|
+
import { all, any, weighted } from '@deepagents/evals/scorers';
|
|
119
|
+
|
|
120
|
+
// Weakest-link (minimum score)
|
|
121
|
+
const strict = all(exactMatch, includes);
|
|
122
|
+
|
|
123
|
+
// Best-of (maximum score)
|
|
124
|
+
const lenient = any(exactMatch, includes);
|
|
125
|
+
|
|
126
|
+
// Weighted average
|
|
127
|
+
const balanced = weighted({
|
|
128
|
+
accuracy: { scorer: exactMatch, weight: 2 },
|
|
129
|
+
style: { scorer: llmJudge({ model, criteria: '...' }), weight: 1 },
|
|
130
|
+
});
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Run Store
|
|
134
|
+
|
|
135
|
+
SQLite-backed persistence for runs, cases, and scores:
|
|
136
|
+
|
|
137
|
+
```typescript
|
|
138
|
+
import { RunStore } from '@deepagents/evals/store';
|
|
139
|
+
|
|
140
|
+
const store = new RunStore('.evals/store.db');
|
|
141
|
+
|
|
142
|
+
// Create a suite for grouping runs
|
|
143
|
+
const suite = store.createSuite('text2sql-accuracy');
|
|
144
|
+
|
|
145
|
+
// Query results
|
|
146
|
+
const runs = store.listRuns(suite.id);
|
|
147
|
+
const failing = store.getFailingCases(runId, 0.5);
|
|
148
|
+
const summary = store.getRunSummary(runId);
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Engine
|
|
152
|
+
|
|
153
|
+
The engine orchestrates dataset iteration, task execution, scoring, and persistence:
|
|
154
|
+
|
|
155
|
+
```typescript
|
|
156
|
+
import { EvalEmitter, runEval } from '@deepagents/evals/engine';
|
|
157
|
+
|
|
158
|
+
const emitter = new EvalEmitter();
|
|
159
|
+
emitter.on('case:scored', (data) => console.log(data.index, data.scores));
|
|
160
|
+
|
|
161
|
+
const summary = await runEval({
|
|
162
|
+
name: 'my-eval',
|
|
163
|
+
model: 'gpt-4o',
|
|
164
|
+
dataset: ds,
|
|
165
|
+
task: myTask,
|
|
166
|
+
scorers: { exact: exactMatch },
|
|
167
|
+
store,
|
|
168
|
+
emitter,
|
|
169
|
+
maxConcurrency: 10,
|
|
170
|
+
timeout: 30_000,
|
|
171
|
+
trials: 1,
|
|
172
|
+
threshold: 0.5,
|
|
173
|
+
});
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### Events
|
|
177
|
+
|
|
178
|
+
| Event | Payload | When |
|
|
179
|
+
| ------------- | ------------------------------------- | ----------------------------------------- |
|
|
180
|
+
| `run:start` | `{ runId, totalCases, name, model }` | Run begins |
|
|
181
|
+
| `case:start` | `{ runId, index, input }` | Case execution starts |
|
|
182
|
+
| `case:scored` | `{ runId, index, scores, latencyMs }` | Case scored (always fires, even on error) |
|
|
183
|
+
| `case:error` | `{ runId, index, error }` | Task threw an error |
|
|
184
|
+
| `run:end` | `{ runId, summary }` | All cases complete |
|
|
185
|
+
|
|
186
|
+
## Comparison
|
|
187
|
+
|
|
188
|
+
Compare two runs case-by-case to detect improvements and regressions:
|
|
189
|
+
|
|
190
|
+
```typescript
|
|
191
|
+
import { compareRuns } from '@deepagents/evals/comparison';
|
|
192
|
+
|
|
193
|
+
const result = compareRuns(store, baselineRunId, candidateRunId, {
|
|
194
|
+
tolerance: 0.01,
|
|
195
|
+
regressionThreshold: 0.05,
|
|
196
|
+
});
|
|
197
|
+
|
|
198
|
+
console.log(result.regression.regressed); // true if any scorer regressed
|
|
199
|
+
console.log(result.scorerSummaries); // per-scorer mean deltas and counts
|
|
200
|
+
console.log(result.costDelta); // latency and token differences
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## Console Reporter
|
|
204
|
+
|
|
205
|
+
Subscribe to engine events for terminal output:
|
|
206
|
+
|
|
207
|
+
```typescript
|
|
208
|
+
import { consoleReporter } from '@deepagents/evals/reporters';
|
|
209
|
+
|
|
210
|
+
consoleReporter(emitter, {
|
|
211
|
+
verbosity: 'normal', // 'quiet' | 'normal' | 'verbose'
|
|
212
|
+
threshold: 0.5,
|
|
213
|
+
});
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
## License
|
|
217
|
+
|
|
218
|
+
MIT
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
 * Type declarations for run comparison: case-level diffs, per-scorer
 * summaries, cost deltas, and regression detection between two runs.
 */
import type { RunStore } from '../store/index.ts';
/** Classification of a score delta: up, down, or within tolerance. */
export type ChangeType = 'improved' | 'regressed' | 'unchanged';
/** Score comparison for one case, keyed by scorer name. */
export interface CaseDiff {
    /** Case index shared by the baseline and candidate runs. */
    index: number;
    /** Baseline/candidate scores, their delta, and its classification, per scorer. */
    scorerDeltas: Record<string, {
        baseline: number;
        candidate: number;
        delta: number;
        change: ChangeType;
    }>;
}
/** Aggregate statistics for one scorer across all compared cases. */
export interface ScorerSummary {
    /** Mean of (candidate - baseline) deltas across compared cases. */
    meanDelta: number;
    improvedCount: number;
    regressedCount: number;
    unchangedCount: number;
}
/** Cost differences, computed as candidate minus baseline. */
export interface CostDelta {
    latencyDeltaMs: number;
    tokenInDelta: number;
    tokenOutDelta: number;
}
/** Full result of comparing a candidate run against a baseline run. */
export interface ComparisonResult {
    caseDiffs: CaseDiff[];
    scorerSummaries: Record<string, ScorerSummary>;
    costDelta: CostDelta;
    /** Number of case indices present in both runs. */
    totalCasesCompared: number;
    /** Regression verdict plus per-scorer mean deltas and threshold checks. */
    regression: {
        regressed: boolean;
        details: Record<string, {
            meanDelta: number;
            exceeds: boolean;
        }>;
    };
}
/** Tuning knobs for comparison. */
export interface CompareOptions {
    /** Max |delta| still counted as "unchanged". */
    tolerance?: number;
    /** Mean-delta drop beyond which a scorer is flagged as regressed. */
    regressionThreshold?: number;
}
/** Compare two persisted runs case-by-case and summarize the differences. */
export declare function compareRuns(store: RunStore, baselineRunId: string, candidateRunId: string, options?: CompareOptions): ComparisonResult;
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/comparison/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAkB,QAAQ,EAAc,MAAM,mBAAmB,CAAC;AAE9E,MAAM,MAAM,UAAU,GAAG,UAAU,GAAG,WAAW,GAAG,WAAW,CAAC;AAEhE,MAAM,WAAW,QAAQ;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,EAAE,MAAM,CAClB,MAAM,EACN;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,UAAU,CAAA;KAAE,CAC3E,CAAC;CACH;AAED,MAAM,WAAW,aAAa;IAC5B,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;IACtB,cAAc,EAAE,MAAM,CAAC;IACvB,cAAc,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,WAAW,SAAS;IACxB,cAAc,EAAE,MAAM,CAAC;IACvB,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,gBAAgB;IAC/B,SAAS,EAAE,QAAQ,EAAE,CAAC;IACtB,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,aAAa,CAAC,CAAC;IAC/C,SAAS,EAAE,SAAS,CAAC;IACrB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,UAAU,EAAE;QACV,SAAS,EAAE,OAAO,CAAC;QACnB,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE;YAAE,SAAS,EAAE,MAAM,CAAC;YAAC,OAAO,EAAE,OAAO,CAAA;SAAE,CAAC,CAAC;KAClE,CAAC;CACH;AAED,MAAM,WAAW,cAAc;IAC7B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B;AA+BD,wBAAgB,WAAW,CACzB,KAAK,EAAE,QAAQ,EACf,aAAa,EAAE,MAAM,EACrB,cAAc,EAAE,MAAM,EACtB,OAAO,CAAC,EAAE,cAAc,GACvB,gBAAgB,CA0GlB"}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
// packages/evals/src/comparison/index.ts
|
|
2
|
+
// Classify a score delta relative to a tolerance band: deltas whose
// magnitude stays inside the band are "unchanged"; otherwise the sign
// decides between "improved" (up) and "regressed" (down).
function categorize(delta, tolerance) {
  const withinBand = Math.abs(delta) <= tolerance;
  if (withinBand) {
    return "unchanged";
  }
  if (delta > 0) {
    return "improved";
  }
  return "regressed";
}
|
|
6
|
+
// Index scored cases by their case index (`idx`), mapping each to a
// { scorerName: score } record assembled from its `scores` rows.
function buildScoreMap(cases) {
  const byIndex = new Map();
  for (const caseRow of cases) {
    const record = Object.fromEntries(
      caseRow.scores.map((s) => [s.scorer_name, s.score])
    );
    byIndex.set(caseRow.idx, record);
  }
  return byIndex;
}
|
|
17
|
+
// Fetch every case for a run, attaching score rows where available.
// `getFailingCases(runId, Infinity)` returns all cases that carry scores;
// cases absent from that result are padded with an empty `scores` array.
function getAllCasesWithScores(store, runId) {
  const allCases = store.getCases(runId);
  const scoredById = new Map();
  for (const scoredCase of store.getFailingCases(runId, Infinity)) {
    scoredById.set(scoredCase.id, scoredCase);
  }
  return allCases.map((c) => scoredById.get(c.id) ?? { ...c, scores: [] });
}
|
|
23
|
+
// Compare two persisted runs case-by-case and summarize per-scorer deltas.
// Cases are matched by index; only indices present in both runs are compared.
// A scorer missing from one side of a case is treated as a score of 0.
function compareRuns(store, baselineRunId, candidateRunId, options) {
  const tolerance = options?.tolerance ?? 0.01;
  const regressionThreshold = options?.regressionThreshold ?? 0.05;

  const baselineCases = getAllCasesWithScores(store, baselineRunId);
  const candidateCases = getAllCasesWithScores(store, candidateRunId);
  if (baselineCases.length !== candidateCases.length) {
    console.warn(
      `Run case count mismatch: baseline=${baselineCases.length}, candidate=${candidateCases.length}. Comparing intersection only.`
    );
  }

  const baseByIdx = buildScoreMap(baselineCases);
  const candByIdx = buildScoreMap(candidateCases);

  // Union of scorer names observed in either run.
  const scorerNames = new Set();
  for (const record of baseByIdx.values()) {
    for (const name of Object.keys(record)) scorerNames.add(name);
  }
  for (const record of candByIdx.values()) {
    for (const name of Object.keys(record)) scorerNames.add(name);
  }

  // Case indices present in both runs, ascending.
  const sharedIndices = [...baseByIdx.keys()]
    .filter((idx) => candByIdx.has(idx))
    .sort((a, b) => a - b);

  const caseDiffs = [];
  const deltasByScorer = {};
  const countsByScorer = {};
  for (const name of scorerNames) {
    deltasByScorer[name] = [];
    countsByScorer[name] = { improved: 0, regressed: 0, unchanged: 0 };
  }

  for (const idx of sharedIndices) {
    const baseRecord = baseByIdx.get(idx);
    const candRecord = candByIdx.get(idx);
    const diff = { index: idx, scorerDeltas: {} };
    for (const name of scorerNames) {
      const baseline = baseRecord[name] ?? 0;
      const candidate = candRecord[name] ?? 0;
      const delta = candidate - baseline;
      const change = categorize(delta, tolerance);
      diff.scorerDeltas[name] = { baseline, candidate, delta, change };
      deltasByScorer[name].push(delta);
      // `change` is exactly one of the three counter keys.
      countsByScorer[name][change]++;
    }
    caseDiffs.push(diff);
  }

  const scorerSummaries = {};
  for (const name of scorerNames) {
    const deltas = deltasByScorer[name];
    const meanDelta =
      deltas.length === 0
        ? 0
        : deltas.reduce((sum, d) => sum + d, 0) / deltas.length;
    const counts = countsByScorer[name];
    scorerSummaries[name] = {
      meanDelta,
      improvedCount: counts.improved,
      regressedCount: counts.regressed,
      unchangedCount: counts.unchanged
    };
  }

  const baseSummary = store.getRunSummary(baselineRunId);
  const candSummary = store.getRunSummary(candidateRunId);
  const costDelta = {
    latencyDeltaMs: candSummary.totalLatencyMs - baseSummary.totalLatencyMs,
    tokenInDelta: candSummary.totalTokensIn - baseSummary.totalTokensIn,
    tokenOutDelta: candSummary.totalTokensOut - baseSummary.totalTokensOut
  };

  // A scorer regresses when its mean delta drops by more than the threshold.
  const details = {};
  let anyRegressed = false;
  for (const [name, summary] of Object.entries(scorerSummaries)) {
    const exceeds = summary.meanDelta < -regressionThreshold;
    details[name] = { meanDelta: summary.meanDelta, exceeds };
    anyRegressed = anyRegressed || exceeds;
  }

  return {
    caseDiffs,
    scorerSummaries,
    costDelta,
    totalCasesCompared: sharedIndices.length,
    regression: { regressed: anyRegressed, details }
  };
}
|
|
103
|
+
export {
|
|
104
|
+
compareRuns
|
|
105
|
+
};
|
|
106
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 3,
|
|
3
|
+
"sources": ["../../src/comparison/index.ts"],
|
|
4
|
+
"sourcesContent": ["import type { CaseWithScores, RunStore, RunSummary } from '../store/index.ts';\n\nexport type ChangeType = 'improved' | 'regressed' | 'unchanged';\n\nexport interface CaseDiff {\n index: number;\n scorerDeltas: Record<\n string,\n { baseline: number; candidate: number; delta: number; change: ChangeType }\n >;\n}\n\nexport interface ScorerSummary {\n meanDelta: number;\n improvedCount: number;\n regressedCount: number;\n unchangedCount: number;\n}\n\nexport interface CostDelta {\n latencyDeltaMs: number;\n tokenInDelta: number;\n tokenOutDelta: number;\n}\n\nexport interface ComparisonResult {\n caseDiffs: CaseDiff[];\n scorerSummaries: Record<string, ScorerSummary>;\n costDelta: CostDelta;\n totalCasesCompared: number;\n regression: {\n regressed: boolean;\n details: Record<string, { meanDelta: number; exceeds: boolean }>;\n };\n}\n\nexport interface CompareOptions {\n tolerance?: number;\n regressionThreshold?: number;\n}\n\nfunction categorize(delta: number, tolerance: number): ChangeType {\n if (Math.abs(delta) <= tolerance) return 'unchanged';\n return delta > 0 ? 'improved' : 'regressed';\n}\n\nfunction buildScoreMap(\n cases: CaseWithScores[],\n): Map<number, Record<string, number>> {\n const map = new Map<number, Record<string, number>>();\n for (const c of cases) {\n const scores: Record<string, number> = {};\n for (const s of c.scores) {\n scores[s.scorer_name] = s.score;\n }\n map.set(c.idx, scores);\n }\n return map;\n}\n\nfunction getAllCasesWithScores(\n store: RunStore,\n runId: string,\n): CaseWithScores[] {\n const cases = store.getCases(runId);\n const withScores = store.getFailingCases(runId, Infinity);\n const scoredMap = new Map(withScores.map((c) => [c.id, c]));\n return cases.map((c) => scoredMap.get(c.id) ?? 
{ ...c, scores: [] });\n}\n\nexport function compareRuns(\n store: RunStore,\n baselineRunId: string,\n candidateRunId: string,\n options?: CompareOptions,\n): ComparisonResult {\n const tolerance = options?.tolerance ?? 0.01;\n const regressionThreshold = options?.regressionThreshold ?? 0.05;\n\n const baselineCases = getAllCasesWithScores(store, baselineRunId);\n const candidateCases = getAllCasesWithScores(store, candidateRunId);\n\n if (baselineCases.length !== candidateCases.length) {\n console.warn(\n `Run case count mismatch: baseline=${baselineCases.length}, candidate=${candidateCases.length}. Comparing intersection only.`,\n );\n }\n\n const baselineMap = buildScoreMap(baselineCases);\n const candidateMap = buildScoreMap(candidateCases);\n\n const allScorerNames = new Set<string>();\n for (const scores of baselineMap.values()) {\n for (const name of Object.keys(scores)) allScorerNames.add(name);\n }\n for (const scores of candidateMap.values()) {\n for (const name of Object.keys(scores)) allScorerNames.add(name);\n }\n\n const commonIndices = [...baselineMap.keys()].filter((idx) =>\n candidateMap.has(idx),\n );\n commonIndices.sort((a, b) => a - b);\n\n const caseDiffs: CaseDiff[] = [];\n const scorerDeltas: Record<string, number[]> = {};\n const scorerCounts: Record<\n string,\n { improved: number; regressed: number; unchanged: number }\n > = {};\n\n for (const name of allScorerNames) {\n scorerDeltas[name] = [];\n scorerCounts[name] = { improved: 0, regressed: 0, unchanged: 0 };\n }\n\n for (const idx of commonIndices) {\n const baseScores = baselineMap.get(idx)!;\n const candScores = candidateMap.get(idx)!;\n const diff: CaseDiff = { index: idx, scorerDeltas: {} };\n\n for (const name of allScorerNames) {\n const baseline = baseScores[name] ?? 0;\n const candidate = candScores[name] ?? 
0;\n const delta = candidate - baseline;\n const change = categorize(delta, tolerance);\n\n diff.scorerDeltas[name] = { baseline, candidate, delta, change };\n scorerDeltas[name]!.push(delta);\n\n if (change === 'improved') scorerCounts[name]!.improved++;\n else if (change === 'regressed') scorerCounts[name]!.regressed++;\n else scorerCounts[name]!.unchanged++;\n }\n\n caseDiffs.push(diff);\n }\n\n const scorerSummaries: Record<string, ScorerSummary> = {};\n for (const name of allScorerNames) {\n const deltas = scorerDeltas[name]!;\n const meanDelta =\n deltas.length > 0 ? deltas.reduce((a, b) => a + b, 0) / deltas.length : 0;\n scorerSummaries[name] = {\n meanDelta,\n improvedCount: scorerCounts[name]!.improved,\n regressedCount: scorerCounts[name]!.regressed,\n unchangedCount: scorerCounts[name]!.unchanged,\n };\n }\n\n const baselineSummary = store.getRunSummary(baselineRunId);\n const candidateSummary = store.getRunSummary(candidateRunId);\n\n const costDelta: CostDelta = {\n latencyDeltaMs:\n candidateSummary.totalLatencyMs - baselineSummary.totalLatencyMs,\n tokenInDelta:\n candidateSummary.totalTokensIn - baselineSummary.totalTokensIn,\n tokenOutDelta:\n candidateSummary.totalTokensOut - baselineSummary.totalTokensOut,\n };\n\n const regressionDetails: Record<\n string,\n { meanDelta: number; exceeds: boolean }\n > = {};\n let anyRegressed = false;\n for (const [name, summary] of Object.entries(scorerSummaries)) {\n const exceeds = summary.meanDelta < -regressionThreshold;\n regressionDetails[name] = { meanDelta: summary.meanDelta, exceeds };\n if (exceeds) anyRegressed = true;\n }\n\n return {\n caseDiffs,\n scorerSummaries,\n costDelta,\n totalCasesCompared: commonIndices.length,\n regression: { regressed: anyRegressed, details: regressionDetails },\n };\n}\n"],
|
|
5
|
+
"mappings": ";AAyCA,SAAS,WAAW,OAAe,WAA+B;AAChE,MAAI,KAAK,IAAI,KAAK,KAAK,UAAW,QAAO;AACzC,SAAO,QAAQ,IAAI,aAAa;AAClC;AAEA,SAAS,cACP,OACqC;AACrC,QAAM,MAAM,oBAAI,IAAoC;AACpD,aAAW,KAAK,OAAO;AACrB,UAAM,SAAiC,CAAC;AACxC,eAAW,KAAK,EAAE,QAAQ;AACxB,aAAO,EAAE,WAAW,IAAI,EAAE;AAAA,IAC5B;AACA,QAAI,IAAI,EAAE,KAAK,MAAM;AAAA,EACvB;AACA,SAAO;AACT;AAEA,SAAS,sBACP,OACA,OACkB;AAClB,QAAM,QAAQ,MAAM,SAAS,KAAK;AAClC,QAAM,aAAa,MAAM,gBAAgB,OAAO,QAAQ;AACxD,QAAM,YAAY,IAAI,IAAI,WAAW,IAAI,CAAC,MAAM,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;AAC1D,SAAO,MAAM,IAAI,CAAC,MAAM,UAAU,IAAI,EAAE,EAAE,KAAK,EAAE,GAAG,GAAG,QAAQ,CAAC,EAAE,CAAC;AACrE;AAEO,SAAS,YACd,OACA,eACA,gBACA,SACkB;AAClB,QAAM,YAAY,SAAS,aAAa;AACxC,QAAM,sBAAsB,SAAS,uBAAuB;AAE5D,QAAM,gBAAgB,sBAAsB,OAAO,aAAa;AAChE,QAAM,iBAAiB,sBAAsB,OAAO,cAAc;AAElE,MAAI,cAAc,WAAW,eAAe,QAAQ;AAClD,YAAQ;AAAA,MACN,qCAAqC,cAAc,MAAM,eAAe,eAAe,MAAM;AAAA,IAC/F;AAAA,EACF;AAEA,QAAM,cAAc,cAAc,aAAa;AAC/C,QAAM,eAAe,cAAc,cAAc;AAEjD,QAAM,iBAAiB,oBAAI,IAAY;AACvC,aAAW,UAAU,YAAY,OAAO,GAAG;AACzC,eAAW,QAAQ,OAAO,KAAK,MAAM,EAAG,gBAAe,IAAI,IAAI;AAAA,EACjE;AACA,aAAW,UAAU,aAAa,OAAO,GAAG;AAC1C,eAAW,QAAQ,OAAO,KAAK,MAAM,EAAG,gBAAe,IAAI,IAAI;AAAA,EACjE;AAEA,QAAM,gBAAgB,CAAC,GAAG,YAAY,KAAK,CAAC,EAAE;AAAA,IAAO,CAAC,QACpD,aAAa,IAAI,GAAG;AAAA,EACtB;AACA,gBAAc,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAElC,QAAM,YAAwB,CAAC;AAC/B,QAAM,eAAyC,CAAC;AAChD,QAAM,eAGF,CAAC;AAEL,aAAW,QAAQ,gBAAgB;AACjC,iBAAa,IAAI,IAAI,CAAC;AACtB,iBAAa,IAAI,IAAI,EAAE,UAAU,GAAG,WAAW,GAAG,WAAW,EAAE;AAAA,EACjE;AAEA,aAAW,OAAO,eAAe;AAC/B,UAAM,aAAa,YAAY,IAAI,GAAG;AACtC,UAAM,aAAa,aAAa,IAAI,GAAG;AACvC,UAAM,OAAiB,EAAE,OAAO,KAAK,cAAc,CAAC,EAAE;AAEtD,eAAW,QAAQ,gBAAgB;AACjC,YAAM,WAAW,WAAW,IAAI,KAAK;AACrC,YAAM,YAAY,WAAW,IAAI,KAAK;AACtC,YAAM,QAAQ,YAAY;AAC1B,YAAM,SAAS,WAAW,OAAO,SAAS;AAE1C,WAAK,aAAa,IAAI,IAAI,EAAE,UAAU,WAAW,OAAO,OAAO;AAC/D,mBAAa,IAAI,EAAG,KAAK,KAAK;AAE9B,UAAI,WAAW,WAAY,cAAa,IAAI,EAAG;AAAA,eACtC,WAAW,YAAa,cAAa,IAAI,EAAG;AAAA,UAChD,cAAa,IAAI,EAAG;AAAA,IAC3B;AAEA,cAAU,KAAK,IAAI;AAAA,EACrB;AAEA,QAAM,kBAAiD,CAAC;AACxD,aAAW,QAAQ,gBAAgB;AACjC,UAAM,SAAS
,aAAa,IAAI;AAChC,UAAM,YACJ,OAAO,SAAS,IAAI,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO,SAAS;AAC1E,oBAAgB,IAAI,IAAI;AAAA,MACtB;AAAA,MACA,eAAe,aAAa,IAAI,EAAG;AAAA,MACnC,gBAAgB,aAAa,IAAI,EAAG;AAAA,MACpC,gBAAgB,aAAa,IAAI,EAAG;AAAA,IACtC;AAAA,EACF;AAEA,QAAM,kBAAkB,MAAM,cAAc,aAAa;AACzD,QAAM,mBAAmB,MAAM,cAAc,cAAc;AAE3D,QAAM,YAAuB;AAAA,IAC3B,gBACE,iBAAiB,iBAAiB,gBAAgB;AAAA,IACpD,cACE,iBAAiB,gBAAgB,gBAAgB;AAAA,IACnD,eACE,iBAAiB,iBAAiB,gBAAgB;AAAA,EACtD;AAEA,QAAM,oBAGF,CAAC;AACL,MAAI,eAAe;AACnB,aAAW,CAAC,MAAM,OAAO,KAAK,OAAO,QAAQ,eAAe,GAAG;AAC7D,UAAM,UAAU,QAAQ,YAAY,CAAC;AACrC,sBAAkB,IAAI,IAAI,EAAE,WAAW,QAAQ,WAAW,QAAQ;AAClE,QAAI,QAAS,gBAAe;AAAA,EAC9B;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA,oBAAoB,cAAc;AAAA,IAClC,YAAY,EAAE,WAAW,cAAc,SAAS,kBAAkB;AAAA,EACpE;AACF;",
|
|
6
|
+
"names": []
|
|
7
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/** Options for streaming rows from the HuggingFace datasets server. */
export interface HfOptions {
    /** Dataset repository id on the Hub. */
    dataset: string;
    /** Dataset configuration name. */
    config: string;
    /** Split name (e.g. train/validation/test). */
    split: string;
    /** Optional cap on the number of rows yielded. */
    rows?: number;
}
/** Lazily stream dataset rows as an AsyncIterable, paging through the rows API. */
export declare function hf<T = Record<string, unknown>>(options: HfOptions): AsyncIterable<T>;
/** Fetch a single page of rows starting at `offset`; `total` is the row count reported by the server. */
export declare function fetchHfRows(options: {
    dataset: string;
    config: string;
    split: string;
}, offset: number, length: number): Promise<{
    rows: Record<string, unknown>[];
    total: number;
}>;
//# sourceMappingURL=hf.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hf.d.ts","sourceRoot":"","sources":["../../src/dataset/hf.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,SAAS;IACxB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAUD,wBAAgB,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAC5C,OAAO,EAAE,SAAS,GACjB,aAAa,CAAC,CAAC,CAAC,CAMlB;AA2CD,wBAAsB,WAAW,CAC/B,OAAO,EAAE;IAAE,OAAO,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,EAC3D,MAAM,EAAE,MAAM,EACd,MAAM,EAAE,MAAM,GACb,OAAO,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAAE,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,CAa7D"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
export { hf } from './hf.ts';
export type { HfOptions } from './hf.ts';
/** Maps one dataset item to another (used by Dataset.map). */
export type TransformFn<T, U> = (item: T) => U;
/** Decides whether a dataset item is kept (used by Dataset.filter). */
export type PredicateFn<T> = (item: T) => boolean;
/**
 * Chainable, async-iterable dataset wrapper.
 * map/filter/limit are lazy; shuffle/sample buffer the whole stream first.
 */
export declare class Dataset<T> implements AsyncIterable<T> {
    #private;
    constructor(source: () => AsyncIterable<T>);
    /** Lazily transform each element. */
    map<U>(fn: TransformFn<T, U>): Dataset<U>;
    /** Lazily drop elements for which `fn` returns false. */
    filter(fn: PredicateFn<T>): Dataset<T>;
    /** Lazily cap output at `n` elements. */
    limit(n: number): Dataset<T>;
    /** Eagerly buffer all elements and yield them in random order. */
    shuffle(): Dataset<T>;
    /** Eagerly buffer all elements and yield `n` chosen at random. */
    sample(n: number): Dataset<T>;
    /** Consume the dataset into a plain array. */
    toArray(): Promise<T[]>;
    [Symbol.asyncIterator](): AsyncIterator<T>;
}
/** Build a Dataset from an inline array, a file path (.json/.jsonl/.csv), or any AsyncIterable. */
export declare function dataset<T>(source: T[] | string | AsyncIterable<T>): Dataset<T>;
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/dataset/index.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,EAAE,EAAE,MAAM,SAAS,CAAC;AAC7B,YAAY,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAEzC,MAAM,MAAM,WAAW,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC;AAC/C,MAAM,MAAM,WAAW,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,OAAO,CAAC;AAElD,qBAAa,OAAO,CAAC,CAAC,CAAE,YAAW,aAAa,CAAC,CAAC,CAAC;;gBAGrC,MAAM,EAAE,MAAM,aAAa,CAAC,CAAC,CAAC;IAI1C,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;IASzC,MAAM,CAAC,EAAE,EAAE,WAAW,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC;IAStC,KAAK,CAAC,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC;IAY5B,OAAO,IAAI,OAAO,CAAC,CAAC,CAAC;IAiBrB,MAAM,CAAC,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC;IAoBvB,OAAO,IAAI,OAAO,CAAC,CAAC,EAAE,CAAC;IAQ7B,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,aAAa,CAAC,CAAC,CAAC;CAG3C;AA+FD,wBAAgB,OAAO,CAAC,CAAC,EACvB,MAAM,EAAE,CAAC,EAAE,GAAG,MAAM,GAAG,aAAa,CAAC,CAAC,CAAC,GACtC,OAAO,CAAC,CAAC,CAAC,CAwBZ"}
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
// packages/evals/src/dataset/index.ts
|
|
2
|
+
import { createReadStream } from "node:fs";
|
|
3
|
+
import { readFile } from "node:fs/promises";
|
|
4
|
+
import { extname } from "node:path";
|
|
5
|
+
import { createInterface } from "node:readline";
|
|
6
|
+
|
|
7
|
+
// packages/evals/src/dataset/hf.ts
|
|
8
|
+
var HF_BASE_URL = "https://datasets-server.huggingface.co/rows";
|
|
9
|
+
var PAGE_SIZE = 100;
|
|
10
|
+
// Expose HuggingFace rows-API pagination as an AsyncIterable so callers
// can `for await` over dataset rows lazily; each iteration starts a
// fresh paginate() pass over the same options.
function hf(options) {
  const iterable = {
    [Symbol.asyncIterator]: () => paginate(options)
  };
  return iterable;
}
|
|
17
|
+
// Stream rows from the HF datasets server, requesting up to PAGE_SIZE
// rows per page. Stops when the optional `rows` cap is reached, a page
// comes back empty or short, or the server-reported total is exhausted.
async function* paginate(options) {
  const { dataset: datasetName, config, split, rows } = options;
  const maxRows = rows ?? Infinity;
  let offset = 0;
  let emitted = 0;
  while (emitted < maxRows) {
    const want =
      maxRows === Infinity ? PAGE_SIZE : Math.min(PAGE_SIZE, maxRows - emitted);
    const page = await fetchPage(buildUrl(datasetName, config, split, offset, want));
    if (page.rows.length === 0) return;
    for (const entry of page.rows) {
      yield entry.row;
      emitted += 1;
      if (emitted >= maxRows) return;
    }
    offset += page.rows.length;
    // A short page or reaching the reported total means the split is done.
    if (page.rows.length < want || offset >= page.num_rows_total) return;
  }
}
|
|
36
|
+
// Construct a datasets-server /rows request URL for the given paging
// window; parameters are appended in a fixed order via URLSearchParams.
function buildUrl(dataset2, config, split, offset, length) {
  const url = new URL(HF_BASE_URL);
  const params = [
    ["dataset", dataset2],
    ["config", config],
    ["split", split],
    ["offset", String(offset)],
    ["length", String(length)],
  ];
  for (const [key, value] of params) {
    url.searchParams.set(key, value);
  }
  return url.toString();
}
|
|
45
|
+
// GET one page from the HF datasets server and parse its JSON body.
// HTTP errors throw with the status plus the response text (falling back
// to statusText); a non-JSON body throws with a 200-char snippet.
async function fetchPage(url) {
  const response = await fetch(url);
  if (!response.ok) {
    const body = await response.text().catch(() => "");
    throw new Error(
      `HuggingFace API error ${response.status}: ${body || response.statusText}`
    );
  }
  const text = await response.text();
  try {
    return JSON.parse(text);
  } catch {
    throw new Error(
      `HuggingFace API returned non-JSON response from ${url}: ${text.slice(0, 200)}`
    );
  }
}
|
|
62
|
+
|
|
63
|
+
// packages/evals/src/dataset/index.ts
|
|
64
|
+
// Lazy, composable wrapper around an async-generator factory. Each chaining
// method (map/filter/limit/shuffle/sample) returns a new Dataset; nothing is
// read from the underlying source until iteration begins. Because the source
// is a factory, a Dataset can be iterated multiple times.
var Dataset = class _Dataset {
  // Factory producing a fresh async iterable on each call.
  #source;
  constructor(source) {
    this.#source = source;
  }
  // Transform every item with `fn` (lazy).
  map(fn) {
    const source = this.#source;
    return new _Dataset(async function* () {
      for await (const value of source()) yield fn(value);
    });
  }
  // Keep only items for which `fn` returns a truthy value (lazy).
  filter(fn) {
    const source = this.#source;
    return new _Dataset(async function* () {
      for await (const value of source()) {
        if (fn(value)) yield value;
      }
    });
  }
  // Yield at most `n` items, then stop consuming the source.
  limit(n) {
    const source = this.#source;
    return new _Dataset(async function* () {
      let emitted = 0;
      for await (const value of source()) {
        if (emitted >= n) return;
        yield value;
        emitted += 1;
      }
    });
  }
  // Yield all items in random order. Note: buffers the entire source in
  // memory, then applies a Fisher-Yates shuffle.
  shuffle() {
    const source = this.#source;
    return new _Dataset(async function* () {
      const buffer = [];
      for await (const value of source()) buffer.push(value);
      for (let i = buffer.length - 1; i > 0; i--) {
        const j = Math.floor(Math.random() * (i + 1));
        [buffer[i], buffer[j]] = [buffer[j], buffer[i]];
      }
      yield* buffer;
    });
  }
  // Yield `n` items chosen at random without replacement (clamped to the
  // source size). Buffers the entire source; runs a partial Fisher-Yates
  // from the tail and yields the shuffled tail segment.
  sample(n) {
    const source = this.#source;
    return new _Dataset(async function* () {
      const buffer = [];
      for await (const value of source()) buffer.push(value);
      const count = Math.min(Math.max(0, n), buffer.length);
      for (let i = buffer.length - 1; i > buffer.length - count - 1; i--) {
        const j = Math.floor(Math.random() * (i + 1));
        [buffer[i], buffer[j]] = [buffer[j], buffer[i]];
      }
      yield* buffer.slice(buffer.length - count);
    });
  }
  // Drain the dataset into a plain array.
  async toArray() {
    const collected = [];
    for await (const value of this.#source()) collected.push(value);
    return collected;
  }
  [Symbol.asyncIterator]() {
    return this.#source()[Symbol.asyncIterator]();
  }
};
|
|
142
|
+
// Splits a single CSV line into fields, honoring double-quoted fields and
// "" escapes inside quotes. A quote only opens a quoted field at the start
// of a field; elsewhere it is treated as a literal character. Newlines
// embedded in fields are not handled (each physical line parses alone).
function parseCSVLine(line) {
  const fields = [];
  let current = "";
  let inQuotes = false;
  let i = 0;
  while (i < line.length) {
    const char = line[i];
    if (inQuotes) {
      if (char !== '"') {
        current += char;
      } else if (line[i + 1] === '"') {
        current += '"'; // escaped quote inside a quoted field
        i += 1;
      } else {
        inQuotes = false; // closing quote
      }
    } else if (char === '"' && current === "") {
      inQuotes = true; // opening quote at start of field
    } else if (char === ",") {
      fields.push(current);
      current = "";
    } else {
      current += char;
    }
    i += 1;
  }
  fields.push(current);
  return fields;
}
|
|
173
|
+
// Returns an async-generator factory that reads `filePath` as a JSON array
// and yields its elements one at a time. Throws if the parsed document is
// not an array.
function loadJSON(filePath) {
  return async function* () {
    const parsed = JSON.parse(await readFile(filePath, "utf-8"));
    if (!Array.isArray(parsed)) {
      throw new Error(`JSON file "${filePath}" does not contain an array`);
    }
    yield* parsed;
  };
}
|
|
183
|
+
// Returns an async-generator factory that streams `filePath` line by line,
// parsing each non-empty line as a standalone JSON value (JSON Lines).
// Blank lines are skipped. Throws on the first malformed line.
function loadJSONL(filePath) {
  return async function* () {
    const stream = createReadStream(filePath, "utf-8");
    const rl = createInterface({
      input: stream,
      crlfDelay: Infinity
    });
    try {
      for await (const line of rl) {
        const trimmed = line.trim();
        if (trimmed) {
          yield JSON.parse(trimmed);
        }
      }
    } finally {
      rl.close();
      // rl.close() does not release the underlying file descriptor when the
      // consumer stops iterating early (e.g. Dataset.limit); destroy the
      // stream explicitly to avoid leaking it.
      stream.destroy();
    }
  };
}
|
|
201
|
+
// Returns an async-generator factory that streams a CSV file, using the first
// non-empty line as the header row and yielding one { header: field } object
// per data row. Missing trailing fields default to ""; extra fields beyond
// the header count are dropped. Multi-line quoted fields are not supported
// (each physical line is parsed independently by parseCSVLine).
function loadCSV(filePath) {
  return async function* () {
    const stream = createReadStream(filePath, "utf-8");
    const rl = createInterface({
      input: stream,
      crlfDelay: Infinity
    });
    try {
      let headers;
      for await (const line of rl) {
        const trimmed = line.trim();
        if (!trimmed) continue;
        const fields = parseCSVLine(trimmed);
        if (!headers) {
          headers = fields;
          continue;
        }
        const row = {};
        for (let i = 0; i < headers.length; i++) {
          row[headers[i]] = fields[i] ?? "";
        }
        yield row;
      }
    } finally {
      rl.close();
      // rl.close() does not release the underlying file descriptor when the
      // consumer stops iterating early (e.g. Dataset.limit); destroy the
      // stream explicitly to avoid leaking it.
      stream.destroy();
    }
  };
}
|
|
228
|
+
// Builds a Dataset from an in-memory array, an existing async iterable, or a
// path to a .json/.jsonl/.csv file. Throws for any other file extension.
function dataset(source) {
  // In-memory array: wrap in a replayable async generator.
  if (Array.isArray(source)) {
    return new Dataset(async function* () {
      yield* source;
    });
  }
  // Pre-built async iterable: reuse it directly. Note it may only be
  // consumable once, depending on the iterable the caller supplied.
  if (typeof source === "object" && Symbol.asyncIterator in source) {
    return new Dataset(() => source);
  }
  // Otherwise treat `source` as a file path and dispatch on its extension.
  const ext = extname(source).toLowerCase();
  if (ext === ".json") return new Dataset(loadJSON(source));
  if (ext === ".jsonl") return new Dataset(loadJSONL(source));
  if (ext === ".csv") return new Dataset(loadCSV(source));
  throw new Error(
    `Unsupported file extension "${ext}" for dataset file "${source}". Supported: .json, .jsonl, .csv`
  );
}
|
|
251
|
+
// Public API of the dataset module.
export {
  Dataset,
  dataset,
  hf
};
|
|
256
|
+
//# sourceMappingURL=index.js.map
|