@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,524 @@
1
+ import { describe, expect, test } from 'bun:test'
2
+ import type { ParsedUpdate } from '../../headless/headless-output-parser.ts'
3
+ import type { TrajectoryStep } from '../../schemas.ts'
4
+ import {
5
+ detectTrajectoryRichness,
6
+ extractContent,
7
+ extractFilePath,
8
+ extractOutput,
9
+ extractTrajectory,
10
+ hasToolErrors,
11
+ headTailPreview,
12
+ loadPrompts,
13
+ } from '../capture.ts'
14
+
15
+ // ============================================================================
16
+ // loadPrompts - each test writes a JSONL fixture with Bun.write, then loads it through loadPrompts(path)
17
+ // ============================================================================
18
+
19
+ describe('loadPrompts', () => {
20
+ test('parses valid JSONL file with string input', async () => {
21
+ // Create a temporary test file holding two records; only the second one carries a "hint"
22
+ const testPath = '/tmp/test-prompts-valid.jsonl' // NOTE(review): fixture paths in this suite are hard-coded under /tmp and never removed
23
+ await Bun.write(
24
+ testPath,
25
+ `{"id": "test-1", "input": "What is 2+2?"}
26
+ {"id": "test-2", "input": "Hello world", "hint": "greeting"}`,
27
+ )
28
+
29
+ const prompts = await loadPrompts(testPath)
30
+
31
+ expect(prompts).toHaveLength(2) // one parsed prompt per JSONL line
32
+ expect(prompts[0]?.id).toBe('test-1')
33
+ expect(prompts[0]?.input).toBe('What is 2+2?')
34
+ expect(prompts[1]?.id).toBe('test-2')
35
+ expect(prompts[1]?.hint).toBe('greeting')
36
+ })
37
+
38
+ test('parses multi-turn input (string array)', async () => { // "input" is given as an array of strings instead of a single string
39
+ const testPath = '/tmp/test-prompts-multiturn.jsonl'
40
+ await Bun.write(testPath, `{"id": "test-1", "input": ["Hello", "How are you?", "Goodbye"], "hint": "farewell"}`)
41
+
42
+ const prompts = await loadPrompts(testPath)
43
+
44
+ expect(prompts).toHaveLength(1)
45
+ expect(prompts[0]?.id).toBe('test-1')
46
+ expect(Array.isArray(prompts[0]?.input)).toBe(true)
47
+ expect(prompts[0]?.input).toEqual(['Hello', 'How are you?', 'Goodbye']) // array order is preserved
48
+ expect(prompts[0]?.hint).toBe('farewell')
49
+ })
50
+
51
+ test('parses prompts with metadata', async () => { // the nested "metadata" object is expected to be readable on the loaded prompt
52
+ const testPath = '/tmp/test-prompts-metadata.jsonl'
53
+ await Bun.write(
54
+ testPath,
55
+ `{"id": "test-1", "input": "Test", "metadata": {"category": "math", "difficulty": "easy"}}`,
56
+ )
57
+
58
+ const prompts = await loadPrompts(testPath)
59
+
60
+ expect(prompts).toHaveLength(1)
61
+ expect(prompts[0]?.metadata?.category).toBe('math')
62
+ expect(prompts[0]?.metadata?.difficulty).toBe('easy')
63
+ })
64
+
65
+ test('throws on invalid JSON at specific line', async () => { // the malformed record is the second line of the fixture
66
+ const testPath = '/tmp/test-prompts-invalid.jsonl'
67
+ await Bun.write(
68
+ testPath,
69
+ `{"id": "test-1", "input": "Valid"}
70
+ {invalid json here}
71
+ {"id": "test-3", "input": "Also valid"}`,
72
+ )
73
+
74
+ await expect(loadPrompts(testPath)).rejects.toThrow('Invalid prompt at line 2') // message names the 1-based line of the bad record
75
+ })
76
+
77
+ test('throws on missing required fields', async () => { // the only record has an "id" but no "input"
78
+ const testPath = '/tmp/test-prompts-missing.jsonl'
79
+ await Bun.write(testPath, `{"id": "test-1"}`)
80
+
81
+ await expect(loadPrompts(testPath)).rejects.toThrow('Invalid prompt at line 1') // same message format as for unparseable JSON
82
+ })
83
+
84
+ test('handles empty lines gracefully', async () => { // fixture has a blank line between records and a trailing newline
85
+ const testPath = '/tmp/test-prompts-empty-lines.jsonl'
86
+ await Bun.write(
87
+ testPath,
88
+ `{"id": "test-1", "input": "First"}
89
+
90
+ {"id": "test-2", "input": "Second"}
91
+ `,
92
+ )
93
+
94
+ const prompts = await loadPrompts(testPath)
95
+ expect(prompts).toHaveLength(2) // blank lines produce no entries and no error
96
+ })
97
+ })
98
+
99
+ // ============================================================================
100
+ // extractTrajectory - called as extractTrajectory(updates, startTime); the tests check the step type, fields and timestamp produced for each ParsedUpdate
101
+ // ============================================================================
102
+
103
+ describe('extractTrajectory', () => {
104
+ const baseTime = 0 // start-time argument shared by the tests that do not inspect timestamps
105
+
106
+ test('extracts thoughts from thought type updates', () => {
107
+ const updates: ParsedUpdate[] = [
108
+ {
109
+ type: 'thought',
110
+ content: 'Let me think about this...',
111
+ raw: { type: 'thought', text: 'Let me think about this...' },
112
+ },
113
+ ]
114
+
115
+ const trajectory = extractTrajectory(updates, baseTime)
116
+
117
+ expect(trajectory).toHaveLength(1)
118
+ expect(trajectory[0]?.type).toBe('thought')
119
+ const step = trajectory[0]!
120
+ expect(step.type === 'thought' && step.content).toBe('Let me think about this...') // the type comparison narrows the step so content can be read; a mismatch evaluates to false and fails the assertion
121
+ })
122
+
123
+ test('extracts messages from message type updates', () => {
124
+ const updates: ParsedUpdate[] = [
125
+ {
126
+ type: 'message',
127
+ content: 'Here is my answer.',
128
+ raw: { type: 'message', text: 'Here is my answer.' },
129
+ },
130
+ ]
131
+
132
+ const trajectory = extractTrajectory(updates, baseTime)
133
+
134
+ expect(trajectory).toHaveLength(1)
135
+ expect(trajectory[0]?.type).toBe('message')
136
+ const step = trajectory[0]!
137
+ expect(step.type === 'message' && step.content).toBe('Here is my answer.')
138
+ })
139
+
140
+ test('extracts tool calls with title and status', () => {
141
+ const updates: ParsedUpdate[] = [
142
+ {
143
+ type: 'tool_call',
144
+ title: 'Read',
145
+ status: 'pending',
146
+ raw: { tool: 'Read', input: { file_path: '/test.ts' } },
147
+ },
148
+ ]
149
+
150
+ const trajectory = extractTrajectory(updates, baseTime)
151
+
152
+ expect(trajectory).toHaveLength(1)
153
+ expect(trajectory[0]?.type).toBe('tool_call')
154
+ const step = trajectory[0]!
155
+ expect(step.type === 'tool_call' && step.name).toBe('Read') // the update's title is expected to surface as the step's name
156
+ expect(step.type === 'tool_call' && step.status).toBe('pending')
157
+ })
158
+
159
+ test('extracts plan type updates', () => {
160
+ const updates: ParsedUpdate[] = [
161
+ {
162
+ type: 'plan',
163
+ raw: {
164
+ entries: [
165
+ { content: 'Step 1', status: 'completed' },
166
+ { content: 'Step 2', status: 'in_progress' },
167
+ ],
168
+ },
169
+ },
170
+ ]
171
+
172
+ const trajectory = extractTrajectory(updates, baseTime)
173
+
174
+ expect(trajectory).toHaveLength(1)
175
+ expect(trajectory[0]?.type).toBe('plan')
176
+ // Note: extractTrajectory creates plan entries from the update type
177
+ // but doesn't extract entries from raw (they are captured via output parser mappings)
178
+ const step = trajectory[0]!
179
+ expect(step.type === 'plan').toBe(true) // only the step type is checked; the plan entries are deliberately not asserted
180
+ })
181
+
182
+ test('handles empty updates', () => {
183
+ const trajectory = extractTrajectory([], baseTime)
184
+ expect(trajectory).toEqual([])
185
+ })
186
+
187
+ test('assigns timestamps relative to start time', () => {
188
+ const originalNow = Date.now // kept so the stub can be undone in the finally block
189
+ try {
190
+ let currentTime = 1000
191
+
192
+ Date.now = () => currentTime // stub the clock so the elapsed time is deterministic
193
+
194
+ const updates: ParsedUpdate[] = [
195
+ {
196
+ type: 'message',
197
+ content: 'First',
198
+ raw: { type: 'message', text: 'First' },
199
+ },
200
+ ]
201
+
202
+ const startTime = 1000
203
+ currentTime = 1500 // 500ms later
204
+
205
+ const trajectory = extractTrajectory(updates, startTime)
206
+
207
+ expect(trajectory[0]?.timestamp).toBe(500) // stubbed Date.now() of 1500 minus startTime of 1000
208
+ } finally {
209
+ Date.now = originalNow // restore the real clock even when an assertion throws
210
+ }
211
+ })
212
+
213
+ test('handles updates without content for message/thought types', () => {
214
+ const updates: ParsedUpdate[] = [
215
+ {
216
+ type: 'message',
217
+ content: undefined, // No content - will have empty string
218
+ raw: { type: 'message' },
219
+ },
220
+ {
221
+ type: 'message',
222
+ content: 'Has content',
223
+ raw: { type: 'message', text: 'Has content' },
224
+ },
225
+ ]
226
+
227
+ const trajectory = extractTrajectory(updates, baseTime)
228
+
229
+ // Both messages are included - ones without content get empty string (NOTE(review): only length and type are asserted below, not the empty-string content)
230
+ expect(trajectory).toHaveLength(2)
231
+ expect(trajectory[0]?.type).toBe('message')
232
+ expect(trajectory[1]?.type).toBe('message')
233
+ })
234
+ })
235
+
236
+ // ============================================================================
237
+ // extractOutput - the tests expect the content of 'message' steps joined with '\n', with every other step type ignored
238
+ // ============================================================================
239
+
240
+ describe('extractOutput', () => {
241
+ test('joins message contents with newlines', () => {
242
+ const trajectory: TrajectoryStep[] = [
243
+ { type: 'message', content: 'First line', timestamp: 0 },
244
+ { type: 'message', content: 'Second line', timestamp: 100 },
245
+ ]
246
+
247
+ expect(extractOutput(trajectory)).toBe('First line\nSecond line') // single '\n' separator, no trailing newline
248
+ })
249
+
250
+ test('filters out non-message steps', () => {
251
+ const trajectory: TrajectoryStep[] = [
252
+ { type: 'thought', content: 'Thinking...', timestamp: 0 },
253
+ { type: 'message', content: 'Answer', timestamp: 100 },
254
+ { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 200 },
255
+ { type: 'message', content: 'Done', timestamp: 300 },
256
+ ]
257
+
258
+ expect(extractOutput(trajectory)).toBe('Answer\nDone') // thought and tool_call steps contribute nothing, even when interleaved
259
+ })
260
+
261
+ test('returns empty string for empty trajectory', () => {
262
+ expect(extractOutput([])).toBe('')
263
+ })
264
+
265
+ test('returns empty string when no messages', () => {
266
+ const trajectory: TrajectoryStep[] = [
267
+ { type: 'thought', content: 'Just thinking', timestamp: 0 },
268
+ { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 100 },
269
+ ]
270
+
271
+ expect(extractOutput(trajectory)).toBe('') // a thought's content is not treated as output
272
+ })
273
+
274
+ test('handles single message', () => {
275
+ const trajectory: TrajectoryStep[] = [{ type: 'message', content: 'Only message', timestamp: 0 }]
276
+
277
+ expect(extractOutput(trajectory)).toBe('Only message') // no separator is added around a lone message
278
+ })
279
+ })
280
+
281
+ // ============================================================================
282
+ // hasToolErrors - the tests expect true when at least one tool_call step has status 'failed', and false otherwise
283
+ // ============================================================================
284
+
285
+ describe('hasToolErrors', () => {
286
+ test('returns false when no tool calls', () => {
287
+ const trajectory: TrajectoryStep[] = [
288
+ { type: 'thought', content: 'Thinking', timestamp: 0 },
289
+ { type: 'message', content: 'Done', timestamp: 100 },
290
+ ]
291
+
292
+ expect(hasToolErrors(trajectory)).toBe(false)
293
+ })
294
+
295
+ test('returns false when all tool calls succeeded', () => {
296
+ const trajectory: TrajectoryStep[] = [
297
+ { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 },
298
+ { type: 'tool_call', name: 'Write', status: 'completed', timestamp: 100 },
299
+ ]
300
+
301
+ expect(hasToolErrors(trajectory)).toBe(false)
302
+ })
303
+
304
+ test('returns true when any tool call failed', () => {
305
+ const trajectory: TrajectoryStep[] = [
306
+ { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 },
307
+ { type: 'tool_call', name: 'Write', status: 'failed', timestamp: 100 },
308
+ { type: 'tool_call', name: 'Bash', status: 'completed', timestamp: 200 },
309
+ ]
310
+
311
+ expect(hasToolErrors(trajectory)).toBe(true) // one failure between two successes is enough
312
+ })
313
+
314
+ test('returns false for empty trajectory', () => {
315
+ expect(hasToolErrors([])).toBe(false)
316
+ })
317
+
318
+ test('returns true when only tool call failed', () => {
319
+ const trajectory: TrajectoryStep[] = [{ type: 'tool_call', name: 'Bash', status: 'failed', timestamp: 0 }]
320
+
321
+ expect(hasToolErrors(trajectory)).toBe(true)
322
+ })
323
+ })
324
+
325
+ // ============================================================================
326
+ // headTailPreview - called as headTailPreview(content, head, tail); the tests expect the first and last lines to be kept and an "N lines omitted" marker in between
327
+ // ============================================================================
328
+
329
+ describe('headTailPreview', () => {
330
+ test('returns full content when under limit', () => {
331
+ const content = 'line1\nline2\nline3'
332
+ expect(headTailPreview(content, 5, 5)).toBe(content) // 3 lines fit within head + tail, so the input comes back unchanged
333
+ })
334
+
335
+ test('truncates with omitted count for long content', () => {
336
+ const lines = Array.from({ length: 20 }, (_, i) => `line${i + 1}`) // 'line1' .. 'line20'
337
+ const content = lines.join('\n')
338
+
339
+ const result = headTailPreview(content, 3, 3)
340
+
341
+ expect(result).toContain('line1') // NOTE(review): toContain is a substring check, so 'line1' and 'line2' also match 'line18', 'line19' and 'line20'
342
+ expect(result).toContain('line2')
343
+ expect(result).toContain('line3')
344
+ expect(result).toContain('line18')
345
+ expect(result).toContain('line19')
346
+ expect(result).toContain('line20')
347
+ expect(result).toContain('14 lines omitted') // 20 total - 3 head - 3 tail
348
+ })
349
+
350
+ test('respects custom head line count', () => {
351
+ const lines = Array.from({ length: 10 }, (_, i) => `line${i + 1}`)
352
+ const content = lines.join('\n')
353
+
354
+ const result = headTailPreview(content, 2, 2)
355
+
356
+ expect(result).toContain('line1')
357
+ expect(result).toContain('line2')
358
+ expect(result).not.toContain('line3') // the third line lies past the 2-line head, so it must be dropped
359
+ expect(result).toContain('6 lines omitted') // 10 total - 2 head - 2 tail
360
+ })
361
+
362
+ test('respects custom tail line count', () => {
363
+ const lines = Array.from({ length: 10 }, (_, i) => `line${i + 1}`)
364
+ const content = lines.join('\n')
365
+
366
+ const result = headTailPreview(content, 1, 4)
367
+
368
+ expect(result).toContain('line1') // NOTE(review): also satisfied by 'line10' in the tail, so this does not prove the head line was kept
369
+ expect(result).toContain('line7') // first line of the 4-line tail (line7..line10)
370
+ expect(result).toContain('line10')
371
+ expect(result).toContain('5 lines omitted') // 10 total - 1 head - 4 tail
372
+ })
373
+
374
+ test('handles content exactly at boundary', () => {
375
+ const content = 'line1\nline2\nline3\nline4\nline5\nline6'
376
+ // 6 lines, head=3, tail=3 means no truncation needed
377
+ expect(headTailPreview(content, 3, 3)).toBe(content)
378
+ })
379
+
380
+ test('handles single line content', () => {
381
+ const content = 'single line'
382
+ expect(headTailPreview(content, 3, 3)).toBe(content)
383
+ })
384
+
385
+ test('handles empty content', () => {
386
+ expect(headTailPreview('', 3, 3)).toBe('')
387
+ })
388
+ })
389
+
390
+ // ============================================================================
391
+ // extractFilePath - the tests expect the 'file_path' field first, then 'path', and undefined for anything else
392
+ // ============================================================================
393
+
394
+ describe('extractFilePath', () => {
395
+ test('extracts file_path field', () => {
396
+ const input = { file_path: '/path/to/file.ts' }
397
+ expect(extractFilePath(input)).toBe('/path/to/file.ts')
398
+ })
399
+
400
+ test('extracts path field as fallback', () => {
401
+ const input = { path: '/another/path.js' }
402
+ expect(extractFilePath(input)).toBe('/another/path.js')
403
+ })
404
+
405
+ test('prefers file_path over path', () => {
406
+ const input = { file_path: '/preferred.ts', path: '/fallback.ts' }
407
+ expect(extractFilePath(input)).toBe('/preferred.ts') // when both keys are present, file_path wins
408
+ })
409
+
410
+ test('returns undefined for invalid input', () => { // non-object inputs must yield undefined rather than throw
411
+ expect(extractFilePath(null)).toBeUndefined()
412
+ expect(extractFilePath(undefined)).toBeUndefined()
413
+ expect(extractFilePath('string')).toBeUndefined()
414
+ expect(extractFilePath(123)).toBeUndefined()
415
+ })
416
+
417
+ test('returns undefined when no path fields present', () => {
418
+ const input = { content: 'some content' }
419
+ expect(extractFilePath(input)).toBeUndefined()
420
+ })
421
+
422
+ test('handles empty object', () => {
423
+ expect(extractFilePath({})).toBeUndefined()
424
+ })
425
+ })
426
+
427
+ // ============================================================================
428
+ // extractContent - the tests expect the 'content' field first, then 'new_string', and undefined for anything else
429
+ // ============================================================================
430
+
431
+ describe('extractContent', () => {
432
+ test('extracts content field', () => {
433
+ const input = { content: 'const x = 1;' }
434
+ expect(extractContent(input)).toBe('const x = 1;')
435
+ })
436
+
437
+ test('extracts new_string field as fallback', () => {
438
+ const input = { new_string: 'const y = 2;' }
439
+ expect(extractContent(input)).toBe('const y = 2;')
440
+ })
441
+
442
+ test('prefers content over new_string', () => {
443
+ const input = { content: 'preferred', new_string: 'fallback' }
444
+ expect(extractContent(input)).toBe('preferred') // when both keys are present, content wins
445
+ })
446
+
447
+ test('returns undefined for invalid input', () => { // non-object inputs must yield undefined rather than throw
448
+ expect(extractContent(null)).toBeUndefined()
449
+ expect(extractContent(undefined)).toBeUndefined()
450
+ expect(extractContent('string')).toBeUndefined()
451
+ expect(extractContent(123)).toBeUndefined()
452
+ })
453
+
454
+ test('returns undefined when no content fields present', () => {
455
+ const input = { file_path: '/some/path.ts' }
456
+ expect(extractContent(input)).toBeUndefined()
457
+ })
458
+
459
+ test('handles empty object', () => {
460
+ expect(extractContent({})).toBeUndefined()
461
+ })
462
+
463
+ test('handles multiline content', () => {
464
+ const input = { content: 'line1\nline2\nline3' }
465
+ expect(extractContent(input)).toBe('line1\nline2\nline3') // embedded newlines are returned untouched
466
+ })
467
+ })
468
+
469
+ // ============================================================================
470
+ // detectTrajectoryRichness - the tests cover three results: 'full', 'messages-only' and 'minimal'
471
+ // ============================================================================
472
+
473
+ describe('detectTrajectoryRichness', () => {
474
+ test('returns "full" when trajectory has thoughts', () => {
475
+ const trajectory: TrajectoryStep[] = [
476
+ { type: 'thought', content: 'Let me think...', timestamp: 0 },
477
+ { type: 'message', content: 'Answer', timestamp: 100 },
478
+ ]
479
+
480
+ expect(detectTrajectoryRichness(trajectory)).toBe('full')
481
+ })
482
+
483
+ test('returns "full" when trajectory has tool calls', () => {
484
+ const trajectory: TrajectoryStep[] = [
485
+ { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 },
486
+ { type: 'message', content: 'Answer', timestamp: 100 },
487
+ ]
488
+
489
+ expect(detectTrajectoryRichness(trajectory)).toBe('full')
490
+ })
491
+
492
+ test('returns "full" when trajectory has plans', () => {
493
+ const trajectory: TrajectoryStep[] = [
494
+ { type: 'plan', entries: [{ content: 'Step 1', status: 'completed' }], timestamp: 0 },
495
+ { type: 'message', content: 'Answer', timestamp: 100 },
496
+ ]
497
+
498
+ expect(detectTrajectoryRichness(trajectory)).toBe('full')
499
+ })
500
+
501
+ test('returns "messages-only" when trajectory only has messages', () => {
502
+ const trajectory: TrajectoryStep[] = [
503
+ { type: 'message', content: 'First', timestamp: 0 },
504
+ { type: 'message', content: 'Second', timestamp: 100 },
505
+ ]
506
+
507
+ expect(detectTrajectoryRichness(trajectory)).toBe('messages-only')
508
+ })
509
+
510
+ test('returns "minimal" when trajectory is empty', () => {
511
+ expect(detectTrajectoryRichness([])).toBe('minimal')
512
+ })
513
+
514
+ test('returns "full" when trajectory has mixed rich content', () => {
515
+ const trajectory: TrajectoryStep[] = [
516
+ { type: 'thought', content: 'Thinking...', timestamp: 0 },
517
+ { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 50 },
518
+ { type: 'plan', entries: [], timestamp: 100 }, // a plan step with no entries is still part of the mixed trajectory
519
+ { type: 'message', content: 'Done', timestamp: 150 },
520
+ ]
521
+
522
+ expect(detectTrajectoryRichness(trajectory)).toBe('full')
523
+ })
524
+ })