@plaited/agent-eval-harness 0.8.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,399 @@
1
+ /**
2
+ * Unit tests for native streaming utilities.
3
+ *
4
+ * @remarks
5
+ * Tests for memory-efficient streaming functions in streaming.ts:
6
+ * - streamJsonl: Generic JSONL streaming with optional schema validation
7
+ * - streamPrompts: PromptCase streaming
8
+ * - streamResultsNative: CaptureResult streaming
9
+ * - streamTrialResults: TrialResult streaming
10
+ * - countLinesStreaming: Line counting without full file load
11
+ *
12
+ * @packageDocumentation
13
+ */
14
+
15
+ import { afterEach, describe, expect, test } from 'bun:test'
16
+ import { unlink } from 'node:fs/promises'
17
+ import { z } from 'zod'
18
+ import {
19
+ countLinesStreaming,
20
+ streamJsonl,
21
+ streamPrompts,
22
+ streamResultsNative,
23
+ streamTrialResults,
24
+ } from '../streaming.ts'
25
+
26
+ // ============================================================================
27
+ // streamJsonl Tests
28
+ // ============================================================================
29
+
30
describe('streamJsonl', () => {
  const testFile = '/tmp/streaming-test-jsonl.jsonl'

  /**
   * Drains an async iterable into an array, keeping the yield order.
   *
   * @param source - Stream to consume to completion
   * @returns Every yielded value, in order
   */
  const collect = async <T>(source: AsyncIterable<T>): Promise<T[]> => {
    const collected: T[] = []
    for await (const entry of source) {
      collected.push(entry)
    }
    return collected
  }

  /**
   * Opens a stream and consumes it fully, reporting what it throws.
   *
   * @remarks
   * Takes a factory rather than a stream so that the stream is created
   * inside the `try`, exactly where iteration happens.
   *
   * @param open - Factory producing the stream under test
   * @returns The thrown value, or `undefined` when the stream completes
   */
  const captureFailure = async (open: () => AsyncIterable<unknown>): Promise<Error | undefined> => {
    try {
      for await (const _entry of open()) {
        // Drain: only the failure matters here
      }
    } catch (thrown) {
      return thrown as Error
    }
    return undefined
  }

  // Best-effort cleanup: the fixture may not exist if a test failed before writing it
  afterEach(async () => {
    await unlink(testFile).catch(() => undefined)
  })

  test('streams items one at a time', async () => {
    await Bun.write(testFile, '{"a":1}\n{"a":2}\n{"a":3}')

    const items = await collect(streamJsonl<{ a: number }>(testFile))

    expect(items).toHaveLength(3)
    const [first, second, third] = items
    expect(first?.a).toBe(1)
    expect(second?.a).toBe(2)
    expect(third?.a).toBe(3)
  })

  test('handles files without trailing newline', async () => {
    await Bun.write(testFile, '{"a":1}\n{"a":2}')

    const items = await collect(streamJsonl<{ a: number }>(testFile))

    expect(items).toHaveLength(2)
    // The final record has no terminating newline and must still be yielded
    expect(items[1]?.a).toBe(2)
  })

  test('validates with schema when provided', async () => {
    const schema = z.object({ id: z.string(), value: z.number() })
    await Bun.write(testFile, '{"id":"a","value":1}\n{"id":"b","value":2}')

    const items = await collect(streamJsonl(testFile, schema))

    expect(items).toHaveLength(2)
    const [first] = items
    expect(first?.id).toBe('a')
    expect(first?.value).toBe(1)
  })

  test('throws with line number on invalid JSON', async () => {
    // Second line is not parseable JSON
    await Bun.write(testFile, '{"a":1}\ninvalid json\n{"a":3}')

    const error = await captureFailure(() => streamJsonl(testFile))

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 2')
  })

  test('throws with line number on schema validation failure', async () => {
    const schema = z.object({ id: z.string(), required: z.number() })
    // Second record omits the `required` field
    await Bun.write(testFile, '{"id":"a","required":1}\n{"id":"b"}')

    const error = await captureFailure(() => streamJsonl(testFile, schema))

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 2')
  })

  test('handles empty files', async () => {
    await Bun.write(testFile, '')

    const items = await collect(streamJsonl(testFile))

    expect(items).toHaveLength(0)
  })

  test('handles single-line files', async () => {
    await Bun.write(testFile, '{"single":true}')

    const items = await collect(streamJsonl<{ single: boolean }>(testFile))

    expect(items).toHaveLength(1)
    expect(items[0]?.single).toBe(true)
  })

  test('skips empty lines', async () => {
    await Bun.write(testFile, '{"a":1}\n\n\n{"a":2}\n')

    const items = await collect(streamJsonl<{ a: number }>(testFile))

    expect(items).toHaveLength(2)
  })

  test('handles whitespace-only lines', async () => {
    await Bun.write(testFile, '{"a":1}\n \n{"a":2}')

    const items = await collect(streamJsonl<{ a: number }>(testFile))

    expect(items).toHaveLength(2)
  })
})
163
+
164
+ // ============================================================================
165
+ // streamPrompts Tests
166
+ // ============================================================================
167
+
168
describe('streamPrompts', () => {
  const testFile = '/tmp/streaming-test-prompts.jsonl'

  /**
   * Reads the fixture through `streamPrompts` and gathers every yielded prompt.
   *
   * @returns All prompts yielded for the current fixture, in order
   */
  const readAllPrompts = async () => {
    const gathered = []
    for await (const entry of streamPrompts(testFile)) {
      gathered.push(entry)
    }
    return gathered
  }

  // Best-effort cleanup: a missing fixture is not an error
  afterEach(async () => {
    await unlink(testFile).catch(() => undefined)
  })

  test('yields validated PromptCase objects', async () => {
    await Bun.write(testFile, '{"id":"p1","input":"hello"}\n{"id":"p2","input":"world"}')

    const prompts = await readAllPrompts()

    expect(prompts).toHaveLength(2)
    const [first] = prompts
    expect(first?.id).toBe('p1')
    expect(first?.input).toBe('hello')
  })

  test('handles multi-turn prompts', async () => {
    // `input` given as an array of turns instead of a single string
    await Bun.write(testFile, '{"id":"m1","input":["turn1","turn2"]}')

    const prompts = await readAllPrompts()

    expect(prompts).toHaveLength(1)
    expect(Array.isArray(prompts[0]?.input)).toBe(true)
  })

  test('throws on schema validation failure', async () => {
    // Missing required 'id' field
    await Bun.write(testFile, '{"input":"hello"}')

    // Consume the stream and keep whatever it throws
    const error = await (async (): Promise<Error | undefined> => {
      try {
        for await (const _entry of streamPrompts(testFile)) {
          // Consume
        }
      } catch (thrown) {
        return thrown as Error
      }
      return undefined
    })()

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 1')
  })
})
221
+
222
+ // ============================================================================
223
+ // streamResultsNative Tests
224
+ // ============================================================================
225
+
226
describe('streamResultsNative', () => {
  const testFile = '/tmp/streaming-test-results.jsonl'

  /**
   * Builds a result fixture that differs only by its `id`.
   *
   * @param id - Identifier to stamp on the fixture
   * @returns A plain object used as one JSONL record
   */
  const makeResult = (id: string) => ({
    id,
    input: 'test',
    output: 'result',
    trajectory: [],
    metadata: {},
    toolErrors: false,
    timing: { start: 0, end: 100, total: 100, sessionCreation: 10 },
  })

  /**
   * Reads the fixture through `streamResultsNative` and gathers every yielded result.
   *
   * @returns All results yielded for the current fixture, in order
   */
  const readAllResults = async () => {
    const gathered = []
    for await (const entry of streamResultsNative(testFile)) {
      gathered.push(entry)
    }
    return gathered
  }

  // Best-effort cleanup: a missing fixture is not an error
  afterEach(async () => {
    await unlink(testFile).catch(() => undefined)
  })

  test('yields validated CaptureResult objects', async () => {
    await Bun.write(testFile, JSON.stringify(makeResult('r1')))

    const results = await readAllResults()

    expect(results).toHaveLength(1)
    const [only] = results
    expect(only?.id).toBe('r1')
    expect(only?.output).toBe('result')
  })

  test('streams multiple results', async () => {
    const ids = ['r1', 'r2', 'r3']
    // One serialized record per line, no trailing newline
    const payload = ids.map((id) => JSON.stringify(makeResult(id))).join('\n')
    await Bun.write(testFile, payload)

    const results = await readAllResults()

    expect(results).toHaveLength(3)
    expect(results.map((entry) => entry.id)).toEqual(['r1', 'r2', 'r3'])
  })
})
289
+
290
+ // ============================================================================
291
+ // streamTrialResults Tests
292
+ // ============================================================================
293
+
294
describe('streamTrialResults', () => {
  const testFile = '/tmp/streaming-test-trials.jsonl'

  // Best-effort cleanup: a missing fixture is not an error
  afterEach(async () => {
    await unlink(testFile).catch(() => undefined)
  })

  test('yields validated TrialResult objects', async () => {
    // Three trials: two passing, one failing
    const trials = [
      { trialNum: 1, output: 'output1', trajectory: [], duration: 100, pass: true },
      { trialNum: 2, output: 'output2', trajectory: [], duration: 150, pass: true },
      { trialNum: 3, output: 'output3', trajectory: [], duration: 120, pass: false },
    ]
    const trialResult = {
      id: 't1',
      input: 'test prompt',
      k: 3,
      passRate: 0.67,
      passAtK: 1,
      passExpK: 0.7,
      trials,
    }
    await Bun.write(testFile, JSON.stringify(trialResult))

    const results = []
    for await (const entry of streamTrialResults(testFile)) {
      results.push(entry)
    }

    expect(results).toHaveLength(1)
    const [only] = results
    expect(only?.id).toBe('t1')
    expect(only?.k).toBe(3)
    expect(only?.passRate).toBe(0.67)
  })

  test('throws on invalid trial result', async () => {
    // Missing required 'k' field
    await Bun.write(testFile, '{"id":"t1","input":"test","trials":[]}')

    // Consume the stream and keep whatever it throws
    const error = await (async (): Promise<Error | undefined> => {
      try {
        for await (const _entry of streamTrialResults(testFile)) {
          // Consume
        }
      } catch (thrown) {
        return thrown as Error
      }
      return undefined
    })()

    expect(error).toBeDefined()
    expect(error?.message).toContain('line 1')
  })
})
349
+
350
+ // ============================================================================
351
+ // countLinesStreaming Tests
352
+ // ============================================================================
353
+
354
describe('countLinesStreaming', () => {
  const testFile = '/tmp/streaming-test-count.jsonl'

  // Best-effort cleanup: a missing fixture is not an error
  afterEach(async () => {
    await unlink(testFile).catch(() => undefined)
  })

  /**
   * Fixture table: each row is one test, registered in the order listed.
   * `content` is written verbatim to the fixture file and `expected` is the
   * count the function must report for it.
   */
  const scenarios: ReadonlyArray<{ name: string; content: string; expected: number }> = [
    { name: 'counts lines without loading full file', content: '{"a":1}\n{"a":2}\n{"a":3}', expected: 3 },
    { name: 'handles empty file', content: '', expected: 0 },
    { name: 'handles file without trailing newline', content: '{"a":1}\n{"a":2}', expected: 2 },
    { name: 'skips empty lines', content: '{"a":1}\n\n{"a":2}\n\n', expected: 2 },
    { name: 'handles single-line file', content: '{"single":true}', expected: 1 },
  ]

  for (const { name, content, expected } of scenarios) {
    test(name, async () => {
      await Bun.write(testFile, content)

      const count = await countLinesStreaming(testFile)
      expect(count).toBe(expected)
    })
  }
})