@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,529 @@
1
+ import { describe, expect, test } from 'bun:test'
2
+ import { join } from 'node:path'
3
+ import { z } from 'zod'
4
+
5
+ /**
6
+ * Tests for the agent-eval-harness CLI.
7
+ *
8
+ * @remarks
9
+ * Tests CLI argument parsing, help output, and output format schemas.
10
+ * Integration tests requiring an actual CLI agent are in src/integration_tests/*.spec.ts files.
11
+ */
12
+
13
+ const CLI_PATH = join(import.meta.dir, '..', 'cli.ts')
14
+
15
+ // ============================================================================
16
+ // CLI Invocation Tests
17
+ // ============================================================================
18
+
19
+ describe('CLI invocation', () => {
20
+ test('shows help with --help flag', async () => {
21
+ const proc = Bun.spawn(['bun', CLI_PATH, '--help'], {
22
+ stdout: 'pipe',
23
+ stderr: 'pipe',
24
+ })
25
+ const stdout = await new Response(proc.stdout).text()
26
+ const exitCode = await proc.exited
27
+
28
+ expect(exitCode).toBe(0)
29
+ expect(stdout).toContain('agent-eval-harness')
30
+ expect(stdout).toContain('Commands:')
31
+ expect(stdout).toContain('capture')
32
+ expect(stdout).toContain('trials')
33
+ expect(stdout).toContain('summarize')
34
+ })
35
+
36
+ test('shows help with -h flag', async () => {
37
+ const proc = Bun.spawn(['bun', CLI_PATH, '-h'], {
38
+ stdout: 'pipe',
39
+ stderr: 'pipe',
40
+ })
41
+ const stdout = await new Response(proc.stdout).text()
42
+ const exitCode = await proc.exited
43
+
44
+ expect(exitCode).toBe(0)
45
+ expect(stdout).toContain('agent-eval-harness')
46
+ })
47
+
48
+ test('shows help when no arguments provided', async () => {
49
+ const proc = Bun.spawn(['bun', CLI_PATH], {
50
+ stdout: 'pipe',
51
+ stderr: 'pipe',
52
+ })
53
+ const stdout = await new Response(proc.stdout).text()
54
+ const exitCode = await proc.exited
55
+
56
+ expect(exitCode).toBe(0) // Exits cleanly when showing help
57
+ expect(stdout).toContain('agent-eval-harness')
58
+ })
59
+
60
+ test('help shows example commands', async () => {
61
+ const proc = Bun.spawn(['bun', CLI_PATH, '--help'], {
62
+ stdout: 'pipe',
63
+ stderr: 'pipe',
64
+ })
65
+ const stdout = await new Response(proc.stdout).text()
66
+
67
+ expect(stdout).toContain('--schema')
68
+ expect(stdout).toContain('prompts.jsonl')
69
+ expect(stdout).toContain('results.jsonl')
70
+ })
71
+
72
+ test('help shows available commands', async () => {
73
+ const proc = Bun.spawn(['bun', CLI_PATH, '--help'], {
74
+ stdout: 'pipe',
75
+ stderr: 'pipe',
76
+ })
77
+ const stdout = await new Response(proc.stdout).text()
78
+
79
+ expect(stdout).toContain('capture')
80
+ expect(stdout).toContain('trials')
81
+ expect(stdout).toContain('summarize')
82
+ expect(stdout).toContain('calibrate')
83
+ expect(stdout).toContain('balance')
84
+ expect(stdout).toContain('schemas')
85
+ })
86
+
87
+ test('fails with non-existent schema file', async () => {
88
+ const proc = Bun.spawn(['bun', CLI_PATH, 'capture', 'prompts.jsonl', '--schema', 'nonexistent.json'], {
89
+ stdout: 'pipe',
90
+ stderr: 'pipe',
91
+ })
92
+ const stderr = await new Response(proc.stderr).text()
93
+ const exitCode = await proc.exited
94
+
95
+ expect(exitCode).not.toBe(0)
96
+ expect(stderr).toContain('Schema file not found')
97
+ })
98
+
99
+ test('fails when no schema provided', async () => {
100
+ const tmpFile = `/tmp/test-prompts-${Date.now()}.jsonl`
101
+ await Bun.write(tmpFile, '{"id":"test-001","input":"test"}\n')
102
+
103
+ const proc = Bun.spawn(['bun', CLI_PATH, 'capture', tmpFile], {
104
+ stdout: 'pipe',
105
+ stderr: 'pipe',
106
+ })
107
+ const stderr = await new Response(proc.stderr).text()
108
+ const exitCode = await proc.exited
109
+
110
+ expect(exitCode).toBe(1)
111
+ expect(stderr).toContain('--schema is required')
112
+ })
113
+
114
+ test('fails with unknown command', async () => {
115
+ const proc = Bun.spawn(['bun', CLI_PATH, 'unknown-command'], {
116
+ stdout: 'pipe',
117
+ stderr: 'pipe',
118
+ })
119
+ const stderr = await new Response(proc.stderr).text()
120
+ const exitCode = await proc.exited
121
+
122
+ expect(exitCode).toBe(1)
123
+ expect(stderr).toContain('Unknown command')
124
+ })
125
+
126
+ test('capture command shows help with --help', async () => {
127
+ const proc = Bun.spawn(['bun', CLI_PATH, 'capture', '--help'], {
128
+ stdout: 'pipe',
129
+ stderr: 'pipe',
130
+ })
131
+ const stdout = await new Response(proc.stdout).text()
132
+ const exitCode = await proc.exited
133
+
134
+ expect(exitCode).toBe(0)
135
+ expect(stdout).toContain('capture')
136
+ expect(stdout).toContain('prompts.jsonl')
137
+ expect(stdout).toContain('--output')
138
+ })
139
+
140
+ test('trials command shows help with --help', async () => {
141
+ const proc = Bun.spawn(['bun', CLI_PATH, 'trials', '--help'], {
142
+ stdout: 'pipe',
143
+ stderr: 'pipe',
144
+ })
145
+ const stdout = await new Response(proc.stdout).text()
146
+ const exitCode = await proc.exited
147
+
148
+ expect(exitCode).toBe(0)
149
+ expect(stdout).toContain('trials')
150
+ expect(stdout).toContain('-k')
151
+ expect(stdout).toContain('pass@k')
152
+ })
153
+ })
154
+
155
+ // ============================================================================
156
+ // Output Format Schemas (for downstream validation)
157
+ // ============================================================================
158
+
159
+ const SummaryResultSchema = z.object({
160
+ id: z.string(),
161
+ input: z.string(),
162
+ output: z.string(),
163
+ toolCalls: z.array(z.string()),
164
+ duration: z.number(),
165
+ })
166
+
167
+ const TrajectoryStepSchema = z.discriminatedUnion('type', [
168
+ z.object({
169
+ type: z.literal('thought'),
170
+ content: z.string(),
171
+ timestamp: z.number(),
172
+ stepId: z.string().optional(),
173
+ }),
174
+ z.object({
175
+ type: z.literal('message'),
176
+ content: z.string(),
177
+ timestamp: z.number(),
178
+ stepId: z.string().optional(),
179
+ }),
180
+ z.object({
181
+ type: z.literal('tool_call'),
182
+ name: z.string(),
183
+ status: z.string(),
184
+ input: z.unknown().optional(),
185
+ output: z.unknown().optional(),
186
+ duration: z.number().optional(),
187
+ timestamp: z.number(),
188
+ stepId: z.string().optional(),
189
+ }),
190
+ z.object({
191
+ type: z.literal('plan'),
192
+ entries: z.array(z.unknown()),
193
+ timestamp: z.number(),
194
+ stepId: z.string().optional(),
195
+ }),
196
+ ])
197
+
198
+ const CaptureResultSchema = z.object({
199
+ id: z.string(),
200
+ input: z.string(),
201
+ output: z.string(),
202
+ expected: z.string().optional(),
203
+ trajectory: z.array(TrajectoryStepSchema),
204
+ metadata: z.record(z.string(), z.unknown()),
205
+ timing: z.object({
206
+ start: z.number(),
207
+ end: z.number(),
208
+ firstResponse: z.number().optional(),
209
+ }),
210
+ toolErrors: z.boolean(),
211
+ errors: z.array(z.string()).optional(),
212
+ })
213
+
214
+ // ============================================================================
215
+ // Sample Output Data (matches harness output format)
216
+ // ============================================================================
217
+
218
+ const SAMPLE_SUMMARY_JSONL = `{"id":"test-001","input":"Create a button","output":"I created the button","toolCalls":["Write"],"duration":1234}
219
+ {"id":"test-002","input":"Fix the bug","output":"I fixed the bug","toolCalls":["Read","Edit"],"duration":2567}
220
+ {"id":"test-003","input":"Broken test","output":"","toolCalls":[],"duration":500}`
221
+
222
+ const SAMPLE_CAPTURE_JSONL = `{"id":"test-001","input":"Create a button","output":"I created the button","trajectory":[{"type":"thought","content":"I'll create a button template","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"export const Button = () => <button>Click</button>"},"output":"File written","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui","agent":"claude-code-acp"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"toolErrors":false}
223
+ {"id":"test-002","input":"Fix the bug","output":"I fixed the bug","trajectory":[{"type":"tool_call","name":"Read","status":"completed","input":{"file_path":"src/app.ts"},"output":"file contents...","duration":100,"timestamp":50,"stepId":"test-002-step-1"},{"type":"tool_call","name":"Edit","status":"completed","input":{"file_path":"src/app.ts","old_string":"bug","new_string":"fix"},"duration":150,"timestamp":200,"stepId":"test-002-step-2"},{"type":"message","content":"I fixed the bug","timestamp":400,"stepId":"test-002-step-3"}],"metadata":{"category":"bugfix","agent":"claude-code-acp"},"timing":{"start":1704067300000,"end":1704067302567},"toolErrors":false}`
224
+
225
+ // ============================================================================
226
+ // Downstream Pattern Tests
227
+ // ============================================================================
228
+
229
+ describe('downstream patterns: summary JSONL', () => {
230
+ const parseResults = (jsonl: string) =>
231
+ jsonl
232
+ .trim()
233
+ .split('\n')
234
+ .map((line) => JSON.parse(line))
235
+
236
+ test('parses summary JSONL correctly', () => {
237
+ const results = parseResults(SAMPLE_SUMMARY_JSONL)
238
+
239
+ expect(results).toHaveLength(3)
240
+ for (const result of results) {
241
+ expect(() => SummaryResultSchema.parse(result)).not.toThrow()
242
+ }
243
+ })
244
+
245
+ test('filters by output presence (jq pattern)', () => {
246
+ const results = parseResults(SAMPLE_SUMMARY_JSONL)
247
+ const withOutput = results.filter((r) => r.output.length > 0)
248
+
249
+ expect(withOutput).toHaveLength(2)
250
+ })
251
+
252
+ test('calculates average duration (jq pattern)', () => {
253
+ const results = parseResults(SAMPLE_SUMMARY_JSONL)
254
+ const avg = results.reduce((sum, r) => sum + r.duration, 0) / results.length
255
+
256
+ expect(avg).toBeCloseTo(1433.67, 0)
257
+ })
258
+
259
+ test('counts tool usage (jq pattern)', () => {
260
+ const results = parseResults(SAMPLE_SUMMARY_JSONL)
261
+ const allTools = results.flatMap((r) => r.toolCalls)
262
+ const toolCounts = allTools.reduce<Record<string, number>>((acc, tool) => {
263
+ acc[tool] = (acc[tool] ?? 0) + 1
264
+ return acc
265
+ }, {})
266
+
267
+ expect(toolCounts).toEqual({ Write: 1, Read: 1, Edit: 1 })
268
+ })
269
+
270
+ test('calculates success rate by output presence', () => {
271
+ const results = parseResults(SAMPLE_SUMMARY_JSONL)
272
+ const withOutput = results.filter((r) => r.output.length > 0).length
273
+ const total = results.length
274
+
275
+ expect(withOutput).toBe(2)
276
+ expect(total).toBe(3)
277
+ expect(withOutput / total).toBeCloseTo(0.667, 2)
278
+ })
279
+ })
280
+
281
+ describe('downstream patterns: capture JSONL', () => {
282
+ const parseResults = (jsonl: string) =>
283
+ jsonl
284
+ .trim()
285
+ .split('\n')
286
+ .map((line) => JSON.parse(line))
287
+
288
+ test('parses capture JSONL with trajectories', () => {
289
+ const results = parseResults(SAMPLE_CAPTURE_JSONL)
290
+
291
+ expect(results).toHaveLength(2)
292
+ for (const result of results) {
293
+ expect(() => CaptureResultSchema.parse(result)).not.toThrow()
294
+ }
295
+ })
296
+
297
+ test('step IDs follow expected format', () => {
298
+ const results = parseResults(SAMPLE_CAPTURE_JSONL)
299
+
300
+ for (const result of results) {
301
+ for (const step of result.trajectory) {
302
+ expect(step.stepId).toMatch(new RegExp(`^${result.id}-step-\\d+$`))
303
+ }
304
+ }
305
+ })
306
+
307
+ test('step-level retrieval pattern works', () => {
308
+ const results = parseResults(SAMPLE_CAPTURE_JSONL)
309
+
310
+ // Build step index (pattern from downstream.md)
311
+ const stepIndex = new Map<string, unknown>()
312
+ for (const result of results) {
313
+ for (const step of result.trajectory) {
314
+ stepIndex.set(step.stepId, step)
315
+ }
316
+ }
317
+
318
+ // Retrieve specific step by ID
319
+ const step = stepIndex.get('test-001-step-2') as { name: string; input: { file_path: string } }
320
+ expect(step).toBeDefined()
321
+ expect(step.name).toBe('Write')
322
+ expect(step.input.file_path).toBe('src/button.tsx')
323
+ })
324
+
325
+ test('extracts tool calls from trajectory', () => {
326
+ const results = parseResults(SAMPLE_CAPTURE_JSONL)
327
+ const result = results[1] // test-002
328
+
329
+ const toolCalls = result.trajectory.filter((s: { type: string }) => s.type === 'tool_call')
330
+ expect(toolCalls).toHaveLength(2)
331
+ expect(toolCalls.map((t: { name: string }) => t.name)).toEqual(['Read', 'Edit'])
332
+ })
333
+
334
+ test('filters by metadata category', () => {
335
+ const results = parseResults(SAMPLE_CAPTURE_JSONL)
336
+ const uiResults = results.filter((r) => r.metadata.category === 'ui')
337
+
338
+ expect(uiResults).toHaveLength(1)
339
+ expect(uiResults[0]?.id).toBe('test-001')
340
+ })
341
+
342
+ test('identifies results with tool errors', () => {
343
+ const results = parseResults(SAMPLE_CAPTURE_JSONL)
344
+ const withErrors = results.filter((r) => r.toolErrors)
345
+
346
+ expect(withErrors).toHaveLength(0) // Sample data has no errors
347
+ })
348
+ })
349
+
350
+ describe('downstream patterns: advanced filtering', () => {
351
+ const parseResults = (jsonl: string) =>
352
+ jsonl
353
+ .trim()
354
+ .split('\n')
355
+ .map((line) => JSON.parse(line))
356
+
357
+ test('filters by tool usage (jq contains pattern)', () => {
358
+ const results = parseResults(SAMPLE_SUMMARY_JSONL)
359
+ const withWrite = results.filter((r) => r.toolCalls.includes('Write'))
360
+
361
+ expect(withWrite).toHaveLength(1)
362
+ expect(withWrite[0]?.id).toBe('test-001')
363
+ })
364
+
365
+ test('filters by duration threshold (slow evaluations)', () => {
366
+ const results = parseResults(SAMPLE_SUMMARY_JSONL)
367
+ const slow = results.filter((r) => r.duration > 2000)
368
+
369
+ expect(slow).toHaveLength(1)
370
+ expect(slow[0]?.id).toBe('test-002')
371
+ })
372
+
373
+ test('finds slowest evaluations (sorted)', () => {
374
+ const results = parseResults(SAMPLE_SUMMARY_JSONL)
375
+ const sorted = [...results].sort((a, b) => b.duration - a.duration)
376
+ const top2 = sorted.slice(0, 2)
377
+
378
+ expect(top2[0]?.id).toBe('test-002')
379
+ expect(top2[1]?.id).toBe('test-001')
380
+ })
381
+
382
+ test('deduplicates by ID keeping latest (merge pattern)', () => {
383
+ const combinedJsonl = `${SAMPLE_SUMMARY_JSONL}
384
+ {"id":"test-001","input":"Create a button v2","output":"I created the button v2","toolCalls":["Write","Edit"],"duration":1500}`
385
+
386
+ const results = parseResults(combinedJsonl)
387
+
388
+ // Group by ID and keep last occurrence (simulates jq group_by + last)
389
+ const byId = new Map<string, unknown>()
390
+ for (const result of results) {
391
+ byId.set(result.id, result)
392
+ }
393
+ const deduped = Array.from(byId.values())
394
+
395
+ expect(deduped).toHaveLength(3) // test-001, test-002, test-003
396
+ const test001 = deduped.find((r) => (r as { id: string }).id === 'test-001') as { input: string }
397
+ expect(test001?.input).toBe('Create a button v2')
398
+ })
399
+
400
+ test('groups by category and counts', () => {
401
+ const results = parseResults(SAMPLE_CAPTURE_JSONL)
402
+
403
+ // Group by category (simulates jq group_by pattern)
404
+ const grouped = results.reduce<Record<string, number>>((acc, r) => {
405
+ const cat = r.metadata.category as string
406
+ acc[cat] = (acc[cat] ?? 0) + 1
407
+ return acc
408
+ }, {})
409
+
410
+ expect(grouped).toEqual({ ui: 1, bugfix: 1 })
411
+ })
412
+
413
+ test('extracts timing information', () => {
414
+ const results = parseResults(SAMPLE_CAPTURE_JSONL)
415
+ const result = results[0]
416
+
417
+ expect(result.timing.start).toBe(1704067200000)
418
+ expect(result.timing.end).toBe(1704067201234)
419
+ expect(result.timing.firstResponse).toBe(100)
420
+ expect(result.timing.end - result.timing.start).toBe(1234) // matches duration
421
+ })
422
+ })
423
+
424
+ // ============================================================================
425
+ // MCP Server Config Parsing Tests
426
+ // ============================================================================
427
+
428
+ describe('MCP server config parsing', () => {
429
+ test('parses stdio MCP server config', async () => {
430
+ const json = '{"type":"stdio","name":"fs","command":"mcp-filesystem","args":["/data"],"env":[]}'
431
+ const proc = Bun.spawn(
432
+ ['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', 'bunx', 'claude-code-acp', '--mcp-server', json, '--help'],
433
+ {
434
+ stdout: 'pipe',
435
+ stderr: 'pipe',
436
+ },
437
+ )
438
+
439
+ // Await the assertion so it is actually evaluated before the test ends; it passes for any exit code, so it only proves the CLI terminates
440
+ await expect(proc.exited).resolves.toBeDefined()
441
+ })
442
+
443
+ test('parses http MCP server config', async () => {
444
+ const json =
445
+ '{"type":"http","name":"api","url":"https://example.com/mcp","headers":[{"name":"Authorization","value":"Bearer token"}]}'
446
+ const proc = Bun.spawn(
447
+ ['bun', CLI_PATH, 'capture', '/tmp/test.jsonl', 'bunx', 'claude-code-acp', '--mcp-server', json, '--help'],
448
+ {
449
+ stdout: 'pipe',
450
+ stderr: 'pipe',
451
+ },
452
+ )
453
+
454
+ // Await the assertion so it is actually evaluated before the test ends; it passes for any exit code, so it only proves the CLI terminates
455
+ await expect(proc.exited).resolves.toBeDefined()
456
+ })
457
+
458
+ test('accepts multiple MCP servers', async () => {
459
+ const json1 = '{"type":"stdio","name":"fs","command":"mcp-filesystem","args":[],"env":[]}'
460
+ const json2 = '{"type":"http","name":"api","url":"https://example.com","headers":[]}'
461
+ const proc = Bun.spawn(
462
+ [
463
+ 'bun',
464
+ CLI_PATH,
465
+ 'capture',
466
+ '/tmp/test.jsonl',
467
+ 'bunx',
468
+ 'claude-code-acp',
469
+ '--mcp-server',
470
+ json1,
471
+ '--mcp-server',
472
+ json2,
473
+ '--help',
474
+ ],
475
+ {
476
+ stdout: 'pipe',
477
+ stderr: 'pipe',
478
+ },
479
+ )
480
+
481
+ // Await the assertion so it is actually evaluated before the test ends; it passes for any exit code, so it only proves the CLI terminates
482
+ await expect(proc.exited).resolves.toBeDefined()
483
+ })
484
+ })
485
+
486
+ // ============================================================================
487
+ // Error Handling Tests
488
+ // ============================================================================
489
+
490
+ describe('error handling', () => {
491
+ test('fails when schema file does not exist', async () => {
492
+ const tmpFile = `/tmp/invalid-${Date.now()}.jsonl`
493
+ await Bun.write(tmpFile, '{"id": "t1", "input": "test"}\n')
494
+
495
+ const proc = Bun.spawn(['bun', CLI_PATH, 'capture', tmpFile, '--schema', 'nonexistent-schema.json'], {
496
+ stdout: 'pipe',
497
+ stderr: 'pipe',
498
+ })
499
+ const stderr = await new Response(proc.stderr).text()
500
+ const exitCode = await proc.exited
501
+
502
+ expect(exitCode).not.toBe(0)
503
+ expect(stderr).toContain('Schema file not found')
504
+ })
505
+
506
+ test('capture command requires prompts path', async () => {
507
+ const proc = Bun.spawn(['bun', CLI_PATH, 'capture'], {
508
+ stdout: 'pipe',
509
+ stderr: 'pipe',
510
+ })
511
+ const stderr = await new Response(proc.stderr).text()
512
+ const exitCode = await proc.exited
513
+
514
+ expect(exitCode).toBe(1)
515
+ expect(stderr).toContain('prompts.jsonl path is required')
516
+ })
517
+
518
+ test('summarize command requires input path', async () => {
519
+ const proc = Bun.spawn(['bun', CLI_PATH, 'summarize'], {
520
+ stdout: 'pipe',
521
+ stderr: 'pipe',
522
+ })
523
+ const stderr = await new Response(proc.stderr).text()
524
+ const exitCode = await proc.exited
525
+
526
+ expect(exitCode).toBe(1)
527
+ expect(stderr).toContain('results.jsonl path is required')
528
+ })
529
+ })
package/package.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "name": "@plaited/agent-eval-harness",
3
+ "version": "0.5.0",
4
+ "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
+ "license": "ISC",
6
+ "engines": {
7
+ "bun": ">= v1.2.9"
8
+ },
9
+ "repository": {
10
+ "type": "git",
11
+ "url": "git+https://github.com/plaited/agent-eval-harness.git"
12
+ },
13
+ "bugs": {
14
+ "url": "https://github.com/plaited/agent-eval-harness/issues"
15
+ },
16
+ "homepage": "https://github.com/plaited/agent-eval-harness/tree/main#readme",
17
+ "bin": {
18
+ "agent-eval-harness": "./bin/cli.ts"
19
+ },
20
+ "type": "module",
21
+ "exports": {
22
+ ".": "./src/harness.ts",
23
+ "./schemas": "./src/schemas.ts",
24
+ "./headless": "./src/headless.ts",
25
+ "./pipeline": "./src/pipeline.ts"
26
+ },
27
+ "files": [
28
+ "./src/**",
29
+ "./bin/**",
30
+ "!./src/**/tests/*",
31
+ "!./src/**/*.spec.ts",
32
+ "!./bin/**/tests/*",
33
+ "!./bin/**/*.spec.ts"
34
+ ],
35
+ "publishConfig": {
36
+ "access": "public"
37
+ },
38
+ "scripts": {
39
+ "check": "bun run check:biome && bun run check:types && bun run check:package",
40
+ "check:biome": "biome check",
41
+ "check:package": "format-package --check",
42
+ "check:types": "tsc --noEmit",
43
+ "check:write": "biome check --write && format-package --write",
44
+ "prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
45
+ "test": "bun test ./**/tests/*.spec.ts",
46
+ "test:integration": "bun test ./**/integration_tests/*.spec.ts"
47
+ },
48
+ "lint-staged": {
49
+ "*.{js,cjs,jsx,tsx,ts}": [
50
+ "bunx biome check --write --files-ignore-unknown"
51
+ ],
52
+ "package.json": [
53
+ "format-package -w"
54
+ ]
55
+ },
56
+ "dependencies": {
57
+ "zod": "^4.3.5",
58
+ "@plaited/development-skills": "0.6.3"
59
+ },
60
+ "devDependencies": {
61
+ "@biomejs/biome": "2.3.11",
62
+ "@types/bun": "1.3.6",
63
+ "format-package": "7.0.0",
64
+ "lint-staged": "16.2.7",
65
+ "typescript": "5.9.3"
66
+ }
67
+ }