@plaited/agent-eval-harness 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,172 @@
1
+ /**
2
+ * Native streaming utilities for JSONL files.
3
+ *
4
+ * @remarks
5
+ * Provides true memory-efficient streaming using Bun.file().stream().
6
+ * Unlike the batch-then-yield approach in loading.ts, these functions
7
+ * process data chunk-by-chunk, maintaining O(1) memory usage regardless
8
+ * of file size.
9
+ *
10
+ * @packageDocumentation
11
+ */
12
+
13
+ import type { ZodSchema } from 'zod'
14
+ import type { CaptureResult, PromptCase, TrialResult } from '../schemas.ts'
15
+ import { CaptureResultSchema, PromptCaseSchema, TrialResultSchema } from '../schemas.ts'
16
+
17
/**
 * Stream JSONL file entries with optional schema validation.
 *
 * @remarks
 * Reads the file through `Bun.file(path).stream()` and decodes it chunk by
 * chunk, so only the current chunk plus any not-yet-terminated line is
 * buffered. Each line is trimmed before parsing (which also drops a trailing
 * `\r`); blank and whitespace-only lines are skipped but still counted for
 * line numbering.
 *
 * Parsing and validation are performed before the `yield`, and the `yield`
 * itself is not guarded by a `try`. An error injected by the consumer at the
 * suspension point (for example via `generator.throw()`) therefore propagates
 * unchanged instead of being reported as an invalid line.
 *
 * @typeParam T - The expected type of each JSON line
 * @param path - Path to the JSONL file
 * @param schema - Optional Zod schema for validation
 * @yields Parsed (and optionally validated) JSON objects
 * @throws Error with the 1-based line number if JSON parsing or validation fails
 *
 * @public
 */
export async function* streamJsonl<T>(path: string, schema?: ZodSchema<T>): AsyncGenerator<T, void, unknown> {
  const decoder = new TextDecoder()

  // Text decoded so far that has not yet been terminated by a newline.
  let buffer = ''
  let lineNum = 0

  /**
   * Parse (and optionally validate) one non-empty line, tagging any failure
   * with the current line number.
   */
  const parseLine = (line: string): T => {
    try {
      const parsed: unknown = JSON.parse(line)
      return schema ? schema.parse(parsed) : (parsed as T)
    } catch (error) {
      throw new Error(`Invalid JSON at line ${lineNum}: ${error instanceof Error ? error.message : error}`)
    }
  }

  for await (const chunk of Bun.file(path).stream()) {
    // The carried-over text never contains a newline, so the search can start
    // at the first newly appended character instead of rescanning it.
    const appendedAt = buffer.length
    buffer += decoder.decode(chunk, { stream: true })

    let lineStart = 0
    let newlineIndex = buffer.indexOf('\n', appendedAt)
    while (newlineIndex !== -1) {
      const line = buffer.slice(lineStart, newlineIndex).trim()
      lineStart = newlineIndex + 1
      lineNum++

      if (line) {
        const entry = parseLine(line)
        yield entry
      }

      newlineIndex = buffer.indexOf('\n', lineStart)
    }

    // Keep only the unterminated tail for the next chunk.
    buffer = buffer.slice(lineStart)
  }

  // Flush remaining buffer content (handles files without trailing newline)
  buffer += decoder.decode()

  const finalLine = buffer.trim()
  if (finalLine) {
    lineNum++
    const entry = parseLine(finalLine)
    yield entry
  }
}
83
+
84
/**
 * Stream prompt cases from a JSONL file.
 *
 * @remarks
 * Thin wrapper that delegates to {@link streamJsonl} with `PromptCaseSchema`,
 * so every yielded entry has been validated against that schema.
 * Suited to sequential consumption of large prompt files.
 *
 * @param path - Path to the prompts.jsonl file
 * @yields Validated PromptCase objects
 * @throws Error with line number if parsing or validation fails
 *
 * @public
 */
export async function* streamPrompts(path: string): AsyncGenerator<PromptCase, void, unknown> {
  const prompts = streamJsonl<PromptCase>(path, PromptCaseSchema)
  yield* prompts
}
100
+
101
/**
 * Stream capture results from a JSONL file using native streaming.
 *
 * @remarks
 * Thin wrapper that delegates to {@link streamJsonl} with
 * `CaptureResultSchema`, so every yielded entry has been validated against
 * that schema.
 *
 * @param path - Path to the results.jsonl file
 * @yields Validated CaptureResult objects
 * @throws Error with line number if parsing or validation fails
 *
 * @public
 */
export async function* streamResultsNative(path: string): AsyncGenerator<CaptureResult, void, unknown> {
  const results = streamJsonl<CaptureResult>(path, CaptureResultSchema)
  yield* results
}
117
+
118
/**
 * Stream trial results from a JSONL file.
 *
 * @remarks
 * Thin wrapper that delegates to {@link streamJsonl} with `TrialResultSchema`,
 * so every yielded entry has been validated against that schema.
 *
 * @param path - Path to the trial results JSONL file
 * @yields Validated TrialResult objects
 * @throws Error with line number if parsing or validation fails
 *
 * @public
 */
export async function* streamTrialResults(path: string): AsyncGenerator<TrialResult, void, unknown> {
  const trialResults = streamJsonl<TrialResult>(path, TrialResultSchema)
  yield* trialResults
}
134
+
135
/**
 * Count lines in a JSONL file using streaming.
 *
 * @remarks
 * Counts non-empty lines without loading the entire file into memory.
 * Each chunk is decoded to text and scanned for `\n`; a line counts when it
 * contains at least one non-whitespace character (the same lines for which
 * `line.trim()` is non-empty). Only a per-line flag is carried between
 * chunks, so line content is never accumulated, even for a very long line
 * or a file with no newline at all.
 *
 * @param path - Path to the JSONL file
 * @returns Number of non-empty lines
 *
 * @public
 */
export const countLinesStreaming = async (path: string): Promise<number> => {
  const decoder = new TextDecoder()
  const nonWhitespace = /\S/

  let count = 0
  // True once the line currently being scanned has shown a non-whitespace character.
  let lineHasContent = false

  /** Scan one piece of decoded text, updating the running count and flag. */
  const scan = (text: string): void => {
    let lineStart = 0
    let newlineIndex = text.indexOf('\n')
    while (newlineIndex !== -1) {
      if (lineHasContent || nonWhitespace.test(text.slice(lineStart, newlineIndex))) count++
      lineHasContent = false
      lineStart = newlineIndex + 1
      newlineIndex = text.indexOf('\n', lineStart)
    }
    // Remember whether the unterminated tail already has content.
    lineHasContent = lineHasContent || nonWhitespace.test(text.slice(lineStart))
  }

  for await (const chunk of Bun.file(path).stream()) {
    scan(decoder.decode(chunk, { stream: true }))
  }

  // Flush the decoder, then account for a final line without a trailing newline.
  scan(decoder.decode())
  if (lineHasContent) count++

  return count
}
@@ -0,0 +1,399 @@
1
+ /**
2
+ * Unit tests for native streaming utilities.
3
+ *
4
+ * @remarks
5
+ * Tests for memory-efficient streaming functions in streaming.ts:
6
+ * - streamJsonl: Generic JSONL streaming with optional schema validation
7
+ * - streamPrompts: PromptCase streaming
8
+ * - streamResultsNative: CaptureResult streaming
9
+ * - streamTrialResults: TrialResult streaming
10
+ * - countLinesStreaming: Line counting without full file load
11
+ *
12
+ * @packageDocumentation
13
+ */
14
+
15
import { afterEach, describe, expect, test } from 'bun:test'
import { unlink } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { z } from 'zod'
import {
  countLinesStreaming,
  streamJsonl,
  streamPrompts,
  streamResultsNative,
  streamTrialResults,
} from '../streaming.ts'
25
+
26
+ // ============================================================================
27
+ // streamJsonl Tests
28
+ // ============================================================================
29
+
30
describe('streamJsonl', () => {
  // Use the OS temp directory (portable across platforms) and tag the file
  // with the pid so concurrent test runs do not share a fixture.
  const testFile = join(tmpdir(), `streaming-test-jsonl-${process.pid}.jsonl`)

  /** Collect every item a stream yields; a stream error propagates to the test. */
  const collect = async <T>(source: AsyncIterable<T>): Promise<T[]> => {
    const items: T[] = []
    for await (const item of source) {
      items.push(item)
    }
    return items
  }

  /** Consume a stream, returning the items seen plus the error (if any) that ended it. */
  const drain = async <T>(source: AsyncIterable<T>): Promise<{ items: T[]; error: Error | undefined }> => {
    const items: T[] = []
    try {
      for await (const item of source) {
        items.push(item)
      }
    } catch (e) {
      return { items, error: e as Error }
    }
    return { items, error: undefined }
  }

  afterEach(async () => {
    try {
      await unlink(testFile)
    } catch {
      // Ignore if file doesn't exist
    }
  })

  test('streams items one at a time', async () => {
    await Bun.write(testFile, '{"a":1}\n{"a":2}\n{"a":3}')

    const items = await collect(streamJsonl<{ a: number }>(testFile))

    expect(items.length).toBe(3)
    expect(items[0]?.a).toBe(1)
    expect(items[1]?.a).toBe(2)
    expect(items[2]?.a).toBe(3)
  })

  test('handles files without trailing newline', async () => {
    await Bun.write(testFile, '{"a":1}\n{"a":2}')

    const items = await collect(streamJsonl<{ a: number }>(testFile))

    expect(items.length).toBe(2)
    expect(items[1]?.a).toBe(2)
  })

  test('validates with schema when provided', async () => {
    const schema = z.object({ id: z.string(), value: z.number() })
    await Bun.write(testFile, '{"id":"a","value":1}\n{"id":"b","value":2}')

    const items: Array<{ id: string; value: number }> = await collect(streamJsonl(testFile, schema))

    expect(items.length).toBe(2)
    expect(items[0]?.id).toBe('a')
    expect(items[0]?.value).toBe(1)
  })

  test('throws with line number on invalid JSON', async () => {
    await Bun.write(testFile, '{"a":1}\ninvalid json\n{"a":3}')

    const { items, error } = await drain(streamJsonl(testFile))

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 2')
    // The valid first line is delivered before the failure surfaces.
    expect(items.length).toBe(1)
  })

  test('throws with line number on schema validation failure', async () => {
    const schema = z.object({ id: z.string(), required: z.number() })
    await Bun.write(testFile, '{"id":"a","required":1}\n{"id":"b"}')

    const { items, error } = await drain(streamJsonl(testFile, schema))

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 2')
    // The valid first line is delivered before the failure surfaces.
    expect(items.length).toBe(1)
  })

  test('handles empty files', async () => {
    await Bun.write(testFile, '')

    const items = await collect(streamJsonl(testFile))

    expect(items.length).toBe(0)
  })

  test('handles single-line files', async () => {
    await Bun.write(testFile, '{"single":true}')

    const items = await collect(streamJsonl<{ single: boolean }>(testFile))

    expect(items.length).toBe(1)
    expect(items[0]?.single).toBe(true)
  })

  test('skips empty lines', async () => {
    await Bun.write(testFile, '{"a":1}\n\n\n{"a":2}\n')

    const items = await collect(streamJsonl<{ a: number }>(testFile))

    expect(items.length).toBe(2)
  })

  test('handles whitespace-only lines', async () => {
    await Bun.write(testFile, '{"a":1}\n   \n{"a":2}')

    const items = await collect(streamJsonl<{ a: number }>(testFile))

    expect(items.length).toBe(2)
  })
})
163
+
164
+ // ============================================================================
165
+ // streamPrompts Tests
166
+ // ============================================================================
167
+
168
describe('streamPrompts', () => {
  // Use the OS temp directory (portable across platforms) and tag the file
  // with the pid so concurrent test runs do not share a fixture.
  const testFile = join(tmpdir(), `streaming-test-prompts-${process.pid}.jsonl`)

  afterEach(async () => {
    try {
      await unlink(testFile)
    } catch {
      // Ignore if file doesn't exist
    }
  })

  test('yields validated PromptCase objects', async () => {
    await Bun.write(testFile, '{"id":"p1","input":"hello"}\n{"id":"p2","input":"world"}')

    const prompts = []
    for await (const prompt of streamPrompts(testFile)) {
      prompts.push(prompt)
    }

    expect(prompts.length).toBe(2)
    expect(prompts[0]?.id).toBe('p1')
    expect(prompts[0]?.input).toBe('hello')
  })

  test('handles multi-turn prompts', async () => {
    await Bun.write(testFile, '{"id":"m1","input":["turn1","turn2"]}')

    const prompts = []
    for await (const prompt of streamPrompts(testFile)) {
      prompts.push(prompt)
    }

    expect(prompts.length).toBe(1)
    expect(Array.isArray(prompts[0]?.input)).toBe(true)
  })

  test('throws on schema validation failure', async () => {
    // Missing required 'id' field
    await Bun.write(testFile, '{"input":"hello"}')

    let error: Error | undefined
    try {
      for await (const _ of streamPrompts(testFile)) {
        // Consume
      }
    } catch (e) {
      error = e as Error
    }

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 1')
  })
})
221
+
222
+ // ============================================================================
223
+ // streamResultsNative Tests
224
+ // ============================================================================
225
+
226
describe('streamResultsNative', () => {
  // Use the OS temp directory (portable across platforms) and tag the file
  // with the pid so concurrent test runs do not share a fixture.
  const testFile = join(tmpdir(), `streaming-test-results-${process.pid}.jsonl`)

  /** Build a capture-result fixture with the given id. */
  const makeResult = (id: string) => ({
    id,
    input: 'test',
    output: 'result',
    trajectory: [],
    metadata: {},
    toolErrors: false,
    timing: { start: 0, end: 100, total: 100, sessionCreation: 10 },
  })

  afterEach(async () => {
    try {
      await unlink(testFile)
    } catch {
      // Ignore if file doesn't exist
    }
  })

  test('yields validated CaptureResult objects', async () => {
    await Bun.write(testFile, JSON.stringify(makeResult('r1')))

    const results = []
    for await (const r of streamResultsNative(testFile)) {
      results.push(r)
    }

    expect(results.length).toBe(1)
    expect(results[0]?.id).toBe('r1')
    expect(results[0]?.output).toBe('result')
  })

  test('streams multiple results', async () => {
    const lines = ['r1', 'r2', 'r3'].map((id) => JSON.stringify(makeResult(id)))
    await Bun.write(testFile, lines.join('\n'))

    const results = []
    for await (const r of streamResultsNative(testFile)) {
      results.push(r)
    }

    expect(results.length).toBe(3)
    expect(results.map((r) => r.id)).toEqual(['r1', 'r2', 'r3'])
  })
})
289
+
290
+ // ============================================================================
291
+ // streamTrialResults Tests
292
+ // ============================================================================
293
+
294
describe('streamTrialResults', () => {
  // Use the OS temp directory (portable across platforms) and tag the file
  // with the pid so concurrent test runs do not share a fixture.
  const testFile = join(tmpdir(), `streaming-test-trials-${process.pid}.jsonl`)

  afterEach(async () => {
    try {
      await unlink(testFile)
    } catch {
      // Ignore if file doesn't exist
    }
  })

  test('yields validated TrialResult objects', async () => {
    const trialResult = {
      id: 't1',
      input: 'test prompt',
      k: 3,
      passRate: 0.67,
      passAtK: 1,
      passExpK: 0.7,
      trials: [
        { trialNum: 1, output: 'output1', trajectory: [], duration: 100, pass: true },
        { trialNum: 2, output: 'output2', trajectory: [], duration: 150, pass: true },
        { trialNum: 3, output: 'output3', trajectory: [], duration: 120, pass: false },
      ],
    }
    await Bun.write(testFile, JSON.stringify(trialResult))

    const results = []
    for await (const r of streamTrialResults(testFile)) {
      results.push(r)
    }

    expect(results.length).toBe(1)
    expect(results[0]?.id).toBe('t1')
    expect(results[0]?.k).toBe(3)
    expect(results[0]?.passRate).toBe(0.67)
  })

  test('throws on invalid trial result', async () => {
    // Missing required 'k' field
    await Bun.write(testFile, '{"id":"t1","input":"test","trials":[]}')

    let error: Error | undefined
    try {
      for await (const _ of streamTrialResults(testFile)) {
        // Consume
      }
    } catch (e) {
      error = e as Error
    }

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 1')
  })
})
349
+
350
+ // ============================================================================
351
+ // countLinesStreaming Tests
352
+ // ============================================================================
353
+
354
describe('countLinesStreaming', () => {
  // Use the OS temp directory (portable across platforms) and tag the file
  // with the pid so concurrent test runs do not share a fixture.
  const testFile = join(tmpdir(), `streaming-test-count-${process.pid}.jsonl`)

  afterEach(async () => {
    try {
      await unlink(testFile)
    } catch {
      // Ignore if file doesn't exist
    }
  })

  test('counts lines without loading full file', async () => {
    await Bun.write(testFile, '{"a":1}\n{"a":2}\n{"a":3}')

    const count = await countLinesStreaming(testFile)
    expect(count).toBe(3)
  })

  test('handles empty file', async () => {
    await Bun.write(testFile, '')

    const count = await countLinesStreaming(testFile)
    expect(count).toBe(0)
  })

  test('handles file without trailing newline', async () => {
    await Bun.write(testFile, '{"a":1}\n{"a":2}')

    const count = await countLinesStreaming(testFile)
    expect(count).toBe(2)
  })

  test('skips empty lines', async () => {
    await Bun.write(testFile, '{"a":1}\n\n{"a":2}\n\n')

    const count = await countLinesStreaming(testFile)
    expect(count).toBe(2)
  })

  test('handles single-line file', async () => {
    await Bun.write(testFile, '{"single":true}')

    const count = await countLinesStreaming(testFile)
    expect(count).toBe(1)
  })
})