@plaited/agent-eval-harness 0.8.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -0
- package/package.json +1 -1
- package/src/commands/capture.ts +101 -26
- package/src/commands/tests/capture-cli.spec.ts +57 -0
- package/src/commands/tests/trials-cli.spec.ts +40 -0
- package/src/commands/trials.ts +111 -28
- package/src/core/core.ts +18 -0
- package/src/core/loading.ts +15 -19
- package/src/core/streaming.ts +172 -0
- package/src/core/tests/streaming.spec.ts +399 -0
- package/src/core/tests/worker-pool.spec.ts +377 -0
- package/src/core/worker-pool.ts +220 -0
- package/src/core.ts +14 -0
- package/src/pipeline/compare-trials.ts +2 -1
- package/src/pipeline/compare.ts +1 -0
- package/src/pipeline/tests/compare-statistical.spec.ts +4 -0
- package/src/pipeline/tests/compare-trials.spec.ts +3 -0
- package/src/schemas/schemas.ts +6 -0
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for native streaming utilities.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Tests for memory-efficient streaming functions in streaming.ts:
|
|
6
|
+
* - streamJsonl: Generic JSONL streaming with optional schema validation
|
|
7
|
+
* - streamPrompts: PromptCase streaming
|
|
8
|
+
* - streamResultsNative: CaptureResult streaming
|
|
9
|
+
* - streamTrialResults: TrialResult streaming
|
|
10
|
+
* - countLinesStreaming: Line counting without full file load
|
|
11
|
+
*
|
|
12
|
+
* @packageDocumentation
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { afterEach, describe, expect, test } from 'bun:test'
|
|
16
|
+
import { unlink } from 'node:fs/promises'
|
|
17
|
+
import { z } from 'zod'
|
|
18
|
+
import {
|
|
19
|
+
countLinesStreaming,
|
|
20
|
+
streamJsonl,
|
|
21
|
+
streamPrompts,
|
|
22
|
+
streamResultsNative,
|
|
23
|
+
streamTrialResults,
|
|
24
|
+
} from '../streaming.ts'
|
|
25
|
+
|
|
26
|
+
// ============================================================================
|
|
27
|
+
// streamJsonl Tests
|
|
28
|
+
// ============================================================================
|
|
29
|
+
|
|
30
|
+
// Exercises streamJsonl against small fixtures written to a scratch file:
// item-by-item iteration, optional schema validation, errors whose message
// names the offending line, and inputs containing blank lines.
describe('streamJsonl', () => {
  const testFile = '/tmp/streaming-test-jsonl.jsonl'

  // Delete the fixture after every test so cases cannot see each other's data.
  afterEach(async () => {
    try {
      await unlink(testFile)
    } catch {
      // Best-effort cleanup: ignore failures (typically the file doesn't exist)
    }
  })

  test('streams items one at a time', async () => {
    await Bun.write(testFile, '{"a":1}\n{"a":2}\n{"a":3}')

    const items: Array<{ a: number }> = []
    for await (const item of streamJsonl<{ a: number }>(testFile)) {
      items.push(item)
    }

    // Items must arrive in file order
    expect(items.length).toBe(3)
    expect(items[0]?.a).toBe(1)
    expect(items[1]?.a).toBe(2)
    expect(items[2]?.a).toBe(3)
  })

  test('handles files without trailing newline', async () => {
    await Bun.write(testFile, '{"a":1}\n{"a":2}')

    const items: Array<{ a: number }> = []
    for await (const item of streamJsonl<{ a: number }>(testFile)) {
      items.push(item)
    }

    // The final, unterminated line must still be yielded
    expect(items.length).toBe(2)
    expect(items[1]?.a).toBe(2)
  })

  test('validates with schema when provided', async () => {
    const schema = z.object({ id: z.string(), value: z.number() })
    await Bun.write(testFile, '{"id":"a","value":1}\n{"id":"b","value":2}')

    const items: Array<{ id: string; value: number }> = []
    for await (const item of streamJsonl(testFile, schema)) {
      items.push(item)
    }

    expect(items.length).toBe(2)
    expect(items[0]?.id).toBe('a')
    expect(items[0]?.value).toBe(1)
  })

  test('throws with line number on invalid JSON', async () => {
    // Line 2 is not valid JSON; lines 1 and 3 are.
    await Bun.write(testFile, '{"a":1}\ninvalid json\n{"a":3}')

    let error: Error | undefined

    try {
      for await (const _ of streamJsonl(testFile)) {
        // Consume: only the failure is asserted on
      }
    } catch (e) {
      // Narrow rather than assert: a non-Error throw leaves `error` unset,
      // which fails the toBeDefined() check below.
      if (e instanceof Error) error = e
    }

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 2')
  })

  test('throws with line number on schema validation failure', async () => {
    const schema = z.object({ id: z.string(), required: z.number() })
    // Line 2 omits the `required` field demanded by the schema.
    await Bun.write(testFile, '{"id":"a","required":1}\n{"id":"b"}')

    let error: Error | undefined

    try {
      for await (const _ of streamJsonl(testFile, schema)) {
        // Consume: only the failure is asserted on
      }
    } catch (e) {
      // Narrow rather than assert: a non-Error throw leaves `error` unset,
      // which fails the toBeDefined() check below.
      if (e instanceof Error) error = e
    }

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 2')
  })

  test('handles empty files', async () => {
    await Bun.write(testFile, '')

    const items: unknown[] = []
    for await (const item of streamJsonl(testFile)) {
      items.push(item)
    }

    expect(items.length).toBe(0)
  })

  test('handles single-line files', async () => {
    await Bun.write(testFile, '{"single":true}')

    const items: Array<{ single: boolean }> = []
    for await (const item of streamJsonl<{ single: boolean }>(testFile)) {
      items.push(item)
    }

    expect(items.length).toBe(1)
    expect(items[0]?.single).toBe(true)
  })

  test('skips empty lines', async () => {
    // Two blank lines between the records plus a trailing newline
    await Bun.write(testFile, '{"a":1}\n\n\n{"a":2}\n')

    const items: Array<{ a: number }> = []
    for await (const item of streamJsonl<{ a: number }>(testFile)) {
      items.push(item)
    }

    expect(items.length).toBe(2)
  })

  test('handles whitespace-only lines', async () => {
    // The middle line contains only spaces
    await Bun.write(testFile, '{"a":1}\n \n{"a":2}')

    const items: Array<{ a: number }> = []
    for await (const item of streamJsonl<{ a: number }>(testFile)) {
      items.push(item)
    }

    expect(items.length).toBe(2)
  })
})
|
|
163
|
+
|
|
164
|
+
// ============================================================================
|
|
165
|
+
// streamPrompts Tests
|
|
166
|
+
// ============================================================================
|
|
167
|
+
|
|
168
|
+
// Exercises streamPrompts: single-turn and multi-turn prompt records are
// yielded, and a record missing `id` fails with an error naming its line.
describe('streamPrompts', () => {
  const testFile = '/tmp/streaming-test-prompts.jsonl'

  // Delete the fixture after every test so cases cannot see each other's data.
  afterEach(async () => {
    try {
      await unlink(testFile)
    } catch {
      // Best-effort cleanup: ignore failures (typically the file doesn't exist)
    }
  })

  test('yields validated PromptCase objects', async () => {
    await Bun.write(testFile, '{"id":"p1","input":"hello"}\n{"id":"p2","input":"world"}')

    const prompts = []
    for await (const prompt of streamPrompts(testFile)) {
      prompts.push(prompt)
    }

    expect(prompts.length).toBe(2)
    expect(prompts[0]?.id).toBe('p1')
    expect(prompts[0]?.input).toBe('hello')
  })

  test('handles multi-turn prompts', async () => {
    // `input` is an array of turns rather than a single string
    await Bun.write(testFile, '{"id":"m1","input":["turn1","turn2"]}')

    const prompts = []
    for await (const prompt of streamPrompts(testFile)) {
      prompts.push(prompt)
    }

    expect(prompts.length).toBe(1)
    expect(Array.isArray(prompts[0]?.input)).toBe(true)
  })

  test('throws on schema validation failure', async () => {
    // Missing required 'id' field
    await Bun.write(testFile, '{"input":"hello"}')

    let error: Error | undefined
    try {
      for await (const _ of streamPrompts(testFile)) {
        // Consume
      }
    } catch (e) {
      // Narrow rather than assert: a non-Error throw leaves `error` unset,
      // which fails the toBeDefined() check below.
      if (e instanceof Error) error = e
    }

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 1')
  })
})
|
|
221
|
+
|
|
222
|
+
// ============================================================================
|
|
223
|
+
// streamResultsNative Tests
|
|
224
|
+
// ============================================================================
|
|
225
|
+
|
|
226
|
+
// Exercises streamResultsNative with one and with several serialized
// capture-result records.
describe('streamResultsNative', () => {
  const testFile = '/tmp/streaming-test-results.jsonl'

  // Builds the capture-result fixture used by both tests; only `id` varies.
  const buildResult = (id: string) => ({
    id,
    input: 'test',
    output: 'result',
    trajectory: [],
    metadata: {},
    toolErrors: false,
    timing: {
      start: 0,
      end: 100,
      total: 100,
      sessionCreation: 10,
    },
  })

  // Fixture removal is best-effort; any unlink failure is discarded.
  afterEach(async () => {
    await unlink(testFile).catch(() => {
      // Ignore
    })
  })

  test('yields validated CaptureResult objects', async () => {
    await Bun.write(testFile, JSON.stringify(buildResult('r1')))

    const collected = []
    for await (const entry of streamResultsNative(testFile)) {
      collected.push(entry)
    }

    const [first] = collected
    expect(collected.length).toBe(1)
    expect(first?.id).toBe('r1')
    expect(first?.output).toBe('result')
  })

  test('streams multiple results', async () => {
    // One record per line, newline-separated, no trailing newline
    const lines = ['r1', 'r2', 'r3'].map((id) => JSON.stringify(buildResult(id)))
    await Bun.write(testFile, lines.join('\n'))

    const collected = []
    for await (const entry of streamResultsNative(testFile)) {
      collected.push(entry)
    }

    expect(collected.length).toBe(3)
    expect(collected.map((entry) => entry.id)).toEqual(['r1', 'r2', 'r3'])
  })
})
|
|
289
|
+
|
|
290
|
+
// ============================================================================
|
|
291
|
+
// streamTrialResults Tests
|
|
292
|
+
// ============================================================================
|
|
293
|
+
|
|
294
|
+
// Exercises streamTrialResults: a complete trial record is yielded, and a
// record missing `k` fails with an error naming its line.
describe('streamTrialResults', () => {
  const testFile = '/tmp/streaming-test-trials.jsonl'

  // Delete the fixture after every test so cases cannot see each other's data.
  afterEach(async () => {
    try {
      await unlink(testFile)
    } catch {
      // Best-effort cleanup: ignore failures (typically the file doesn't exist)
    }
  })

  test('yields validated TrialResult objects', async () => {
    const trialResult = {
      id: 't1',
      input: 'test prompt',
      k: 3,
      passRate: 0.67,
      passAtK: 1,
      passExpK: 0.7,
      trials: [
        { trialNum: 1, output: 'output1', trajectory: [], duration: 100, pass: true },
        { trialNum: 2, output: 'output2', trajectory: [], duration: 150, pass: true },
        { trialNum: 3, output: 'output3', trajectory: [], duration: 120, pass: false },
      ],
    }
    await Bun.write(testFile, JSON.stringify(trialResult))

    const results = []
    for await (const r of streamTrialResults(testFile)) {
      results.push(r)
    }

    expect(results.length).toBe(1)
    expect(results[0]?.id).toBe('t1')
    expect(results[0]?.k).toBe(3)
    expect(results[0]?.passRate).toBe(0.67)
  })

  test('throws on invalid trial result', async () => {
    // Missing required 'k' field
    await Bun.write(testFile, '{"id":"t1","input":"test","trials":[]}')

    let error: Error | undefined
    try {
      for await (const _ of streamTrialResults(testFile)) {
        // Consume
      }
    } catch (e) {
      // Narrow rather than assert: a non-Error throw leaves `error` unset,
      // which fails the toBeDefined() check below.
      if (e instanceof Error) error = e
    }

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 1')
  })
})
|
|
349
|
+
|
|
350
|
+
// ============================================================================
|
|
351
|
+
// countLinesStreaming Tests
|
|
352
|
+
// ============================================================================
|
|
353
|
+
|
|
354
|
+
// Exercises countLinesStreaming on multi-line, empty, unterminated,
// blank-line-containing, and single-line inputs.
describe('countLinesStreaming', () => {
  const testFile = '/tmp/streaming-test-count.jsonl'

  // Writes `contents` to the fixture path, then returns the count reported for it.
  const countFor = async (contents: string) => {
    await Bun.write(testFile, contents)
    return await countLinesStreaming(testFile)
  }

  // Fixture removal is best-effort; any unlink failure is discarded.
  afterEach(async () => {
    await unlink(testFile).catch(() => {
      // Ignore
    })
  })

  test('counts lines without loading full file', async () => {
    expect(await countFor('{"a":1}\n{"a":2}\n{"a":3}')).toBe(3)
  })

  test('handles empty file', async () => {
    expect(await countFor('')).toBe(0)
  })

  test('handles file without trailing newline', async () => {
    expect(await countFor('{"a":1}\n{"a":2}')).toBe(2)
  })

  test('skips empty lines', async () => {
    // Blank lines between and after the two records are not counted
    expect(await countFor('{"a":1}\n\n{"a":2}\n\n')).toBe(2)
  })

  test('handles single-line file', async () => {
    expect(await countFor('{"single":true}')).toBe(1)
  })
})
|