@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,678 @@
1
+ /**
2
+ * Unit tests for headless adapter factory.
3
+ *
4
+ * @remarks
5
+ * Tests cover:
6
+ * - Schema validation with Zod
7
+ * - JSONPath extraction
8
+ * - Output parsing with event mappings
9
+ * - History building for iterative mode
10
+ */
11
+
12
+ import { describe, expect, test } from 'bun:test'
13
+ import { HeadlessAdapterSchema, parseHeadlessConfig, safeParseHeadlessConfig } from '../headless.schemas.ts'
14
+ import { createHistoryBuilder } from '../headless-history-builder.ts'
15
+ import { createOutputParser, jsonPath, jsonPathString } from '../headless-output-parser.ts'
16
+
17
+ // ============================================================================
18
+ // Test Fixtures
19
+ // ============================================================================
20
+
21
/**
 * Adapter config fixture for a stream-mode agent named `claude-headless`.
 *
 * @remarks
 * Carries the `autoApprove` and `resume` fields that the Gemini fixture omits,
 * plus two output-event mappings (`assistant` -> `message`, `tool_use` -> `tool_call`).
 * The `status` extractor is a single-quoted literal (`'pending'`) rather than a `$.` path.
 * The invalid-schema tests spread this object and override one field at a time.
 */
const validClaudeSchema = {
  version: 1,
  name: 'claude-headless',
  command: ['claude'],
  sessionMode: 'stream',
  prompt: { flag: '-p' },
  output: { flag: '--output-format', value: 'stream-json' },
  autoApprove: ['--dangerously-skip-permissions'],
  resume: { flag: '--resume', sessionIdPath: '$.session_id' },
  outputEvents: [
    { match: { path: '$.type', value: 'assistant' }, emitAs: 'message', extract: { content: '$.message.text' } },
    { match: { path: '$.type', value: 'tool_use' }, emitAs: 'tool_call', extract: { title: '$.name', status: "'pending'" } },
  ],
  result: { matchPath: '$.type', matchValue: 'result', contentPath: '$.result' },
}
48
+
49
/**
 * Adapter config fixture for an iterative-mode agent named `gemini-headless`.
 *
 * @remarks
 * Has no `autoApprove` or `resume` entries; instead it supplies a
 * `historyTemplate` containing the `{{input}}` and `{{output}}` placeholders.
 */
const validGeminiSchema = {
  version: 1,
  name: 'gemini-headless',
  command: ['gemini'],
  sessionMode: 'iterative',
  prompt: { flag: '--prompt' },
  output: { flag: '--output-format', value: 'json' },
  outputEvents: [
    { match: { path: '$.type', value: 'message' }, emitAs: 'message', extract: { content: '$.content' } },
  ],
  result: { matchPath: '$.type', matchValue: 'result', contentPath: '$.response' },
  historyTemplate: 'User: {{input}}\nAssistant: {{output}}',
}
70
+
71
+ // ============================================================================
72
+ // Schema Validation Tests
73
+ // ============================================================================
74
+
75
/**
 * Validation suite for `HeadlessAdapterSchema` and its two wrappers,
 * `parseHeadlessConfig` (throwing) and `safeParseHeadlessConfig` (non-throwing).
 */
describe('HeadlessAdapterSchema', () => {
  /** Reports whether `candidate` passes `HeadlessAdapterSchema.safeParse`. */
  const accepts = (candidate: unknown): boolean => HeadlessAdapterSchema.safeParse(candidate).success

  describe('valid schemas', () => {
    test('validates Claude headless schema', () => {
      expect(accepts(validClaudeSchema)).toBe(true)
    })

    test('validates Gemini headless schema', () => {
      expect(accepts(validGeminiSchema)).toBe(true)
    })
  })

  describe('validates schema files from disk', () => {
    // NOTE(review): relative path — presumably resolved against the directory the
    // test runner is launched from, so these cases depend on that location; confirm.
    const schemasDir = '.claude/skills/headless-adapters/schemas'

    for (const fileName of ['claude-headless.json', 'gemini-headless.json']) {
      test(`validates ${fileName} from disk`, async () => {
        const content: unknown = await Bun.file(`${schemasDir}/${fileName}`).json()
        expect(accepts(content)).toBe(true)
      })
    }
  })

  describe('minimal valid schema', () => {
    test('validates minimal required fields', () => {
      // Empty `prompt` object and empty `outputEvents` list are both expected to be accepted.
      expect(
        accepts({
          version: 1,
          name: 'minimal',
          command: ['agent'],
          sessionMode: 'iterative',
          prompt: {},
          output: { flag: '--format', value: 'json' },
          outputEvents: [],
          result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' },
        }),
      ).toBe(true)
    })
  })

  describe('stdin mode configuration', () => {
    test('validates schema with stdin: true', () => {
      expect(
        accepts({
          version: 1,
          name: 'stdin-agent',
          command: ['agent', 'exec', '-'],
          sessionMode: 'stream',
          prompt: { stdin: true },
          output: { flag: '--format', value: 'json' },
          outputEvents: [],
          result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' },
        }),
      ).toBe(true)
    })

    test('validates schema with stdin: false', () => {
      // `stdin: false` alongside a flag is expected to pass, unlike `stdin: true` + flag below.
      expect(
        accepts({
          version: 1,
          name: 'stdin-agent',
          command: ['agent'],
          sessionMode: 'stream',
          prompt: { stdin: false, flag: '-p' },
          output: { flag: '--format', value: 'json' },
          outputEvents: [],
          result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' },
        }),
      ).toBe(true)
    })

    test('validates schema with positional prompt and - in command', () => {
      // Empty-string output flag/value are expected to be accepted.
      expect(
        accepts({
          version: 1,
          name: 'codex-like',
          command: ['codex', 'exec', '--json', '-'],
          sessionMode: 'iterative',
          prompt: { stdin: true },
          output: { flag: '', value: '' },
          outputEvents: [
            {
              match: { path: '$.item.type', value: 'agent_message' },
              emitAs: 'message',
              extract: { content: '$.item.text' },
            },
          ],
          result: { matchPath: '$.type', matchValue: 'turn.completed', contentPath: '$.usage.output_tokens' },
        }),
      ).toBe(true)
    })
  })

  describe('invalid schemas', () => {
    // Each entry overrides a single field of the valid Claude fixture.
    const singleFieldOverrides = [
      ['rejects missing version', { version: undefined }],
      ['rejects unsupported version', { version: 2 }],
      ['rejects invalid sessionMode', { sessionMode: 'batch' }],
      ['rejects missing command', { command: undefined }],
    ] as const

    for (const [title, override] of singleFieldOverrides) {
      test(title, () => {
        expect(accepts({ ...validClaudeSchema, ...override })).toBe(false)
      })
    }

    test('rejects both flag and stdin specified', () => {
      const result = HeadlessAdapterSchema.safeParse({
        ...validClaudeSchema,
        prompt: { flag: '-p', stdin: true },
      })
      expect(result.success).toBe(false)
      // Narrow the discriminated result rather than casting it; unreachable when the
      // expectation above has already failed.
      if (result.success) throw new Error('Expected validation to fail when both flag and stdin are set')
      expect(result.error.issues.length).toBeGreaterThan(0)
      expect(result.error.issues[0]?.message).toContain("Cannot specify both 'flag' and 'stdin' modes")
    })

    test('rejects invalid emitAs type', () => {
      expect(
        accepts({
          ...validClaudeSchema,
          outputEvents: [
            {
              match: { path: '$.type', value: 'x' },
              emitAs: 'invalid_type',
            },
          ],
        }),
      ).toBe(false)
    })
  })

  describe('parseHeadlessConfig', () => {
    test('returns parsed config for valid input', () => {
      const { name, command, sessionMode } = parseHeadlessConfig(validClaudeSchema)
      expect(name).toBe('claude-headless')
      expect(command).toEqual(['claude'])
      expect(sessionMode).toBe('stream')
    })

    test('throws for invalid input', () => {
      expect(() => parseHeadlessConfig({ version: 99 })).toThrow()
    })
  })

  describe('safeParseHeadlessConfig', () => {
    test('returns success for valid input', () => {
      const outcome = safeParseHeadlessConfig(validClaudeSchema)
      expect(outcome.success).toBe(true)
      if (outcome.success) {
        expect(outcome.data.name).toBe('claude-headless')
      }
    })

    test('returns failure for invalid input', () => {
      expect(safeParseHeadlessConfig({ version: 99 }).success).toBe(false)
    })
  })
})
258
+
259
+ // ============================================================================
260
+ // JSONPath Tests
261
+ // ============================================================================
262
+
263
/**
 * Table-driven cases for `jsonPath`: `$.`-prefixed lookups, single-quoted
 * literals, and inputs or paths that are expected to yield `undefined`.
 */
describe('jsonPath', () => {
  const testObj = {
    type: 'message',
    message: {
      text: 'Hello world',
      nested: { value: 42 },
    },
    array: [1, 2, 3],
  }

  describe('basic extraction', () => {
    const hits: Array<[title: string, path: string, expected: unknown]> = [
      ['extracts root field', '$.type', 'message'],
      ['extracts nested field', '$.message.text', 'Hello world'],
      ['extracts deeply nested field', '$.message.nested.value', 42],
    ]
    for (const [title, path, expected] of hits) {
      test(title, () => {
        expect(jsonPath(testObj, path)).toBe(expected)
      })
    }

    const misses: Array<[title: string, path: string]> = [
      ['returns undefined for non-existent path', '$.missing'],
      ['returns undefined for non-existent nested path', '$.message.missing.deep'],
    ]
    for (const [title, path] of misses) {
      test(title, () => {
        expect(jsonPath(testObj, path)).toBeUndefined()
      })
    }
  })

  describe('literal strings', () => {
    // A path wrapped in single quotes is expected back with the quotes removed.
    const literals: Array<[title: string, path: string, expected: string]> = [
      ['returns literal string value', "'pending'", 'pending'],
      ['returns empty literal string', "''", ''],
      ['returns literal with spaces', "'hello world'", 'hello world'],
    ]
    for (const [title, path, expected] of literals) {
      test(title, () => {
        expect(jsonPath(testObj, path)).toBe(expected)
      })
    }
  })

  describe('edge cases', () => {
    const degenerate: Array<[title: string, input: unknown, path: string]> = [
      ['handles null input', null, '$.type'],
      ['handles undefined input', undefined, '$.type'],
      ['handles non-object input', 'string', '$.type'],
      // Path without the leading `$.` is expected to resolve to nothing.
      ['handles invalid path format', testObj, 'type'],
    ]
    for (const [title, input, path] of degenerate) {
      test(title, () => {
        expect(jsonPath(input, path)).toBeUndefined()
      })
    }
  })
})
327
+
328
/**
 * Cases for `jsonPathString`: string and number values are expected back as
 * strings, while missing paths and `null` values are expected to give `undefined`.
 */
describe('jsonPathString', () => {
  const stringified: Array<[title: string, input: unknown, path: string, expected: string]> = [
    ['extracts string value', { text: 'hello' }, '$.text', 'hello'],
    ['converts number to string', { num: 42 }, '$.num', '42'],
  ]
  for (const [title, input, path, expected] of stringified) {
    test(title, () => {
      expect(jsonPathString(input, path)).toBe(expected)
    })
  }

  const absent: Array<[title: string, input: unknown, path: string]> = [
    ['returns undefined for missing path', { x: 1 }, '$.y'],
    ['returns undefined for null value', { x: null }, '$.x'],
  ]
  for (const [title, input, path] of absent) {
    test(title, () => {
      expect(jsonPathString(input, path)).toBeUndefined()
    })
  }
})
345
+
346
+ // ============================================================================
347
+ // Output Parser Tests
348
+ // ============================================================================
349
+
350
/**
 * Suite for the parser built by `createOutputParser`: per-line event mapping,
 * `[*]` wildcard matching, and result detection via `parseResult`.
 */
describe('createOutputParser', () => {
  const parser = createOutputParser(parseHeadlessConfig(validClaudeSchema))

  type ParseOutcome = ReturnType<typeof parser.parseLine>

  /** Collapses a `parseLine` outcome to one update: the first element when it is an array. */
  const firstUpdate = (outcome: ParseOutcome) => (Array.isArray(outcome) ? outcome[0] : outcome)

  /** Asserts that a `parseLine` outcome is an array and hands it back for further checks. */
  const expectUpdateList = (outcome: ParseOutcome) => {
    expect(Array.isArray(outcome)).toBe(true)
    return Array.isArray(outcome) ? outcome : []
  }

  describe('parseLine', () => {
    test('maps assistant type to message', () => {
      const outcome = parser.parseLine(JSON.stringify({ type: 'assistant', message: { text: 'Hello' } }))
      expect(outcome).not.toBeNull()
      const update = firstUpdate(outcome)
      expect(update?.type).toBe('message')
      expect(update?.content).toBe('Hello')
    })

    test('maps tool_use type to tool_call', () => {
      const outcome = parser.parseLine(JSON.stringify({ type: 'tool_use', name: 'Read' }))
      expect(outcome).not.toBeNull()
      const update = firstUpdate(outcome)
      expect(update?.type).toBe('tool_call')
      expect(update?.title).toBe('Read')
      expect(update?.status).toBe('pending')
    })

    test('returns null for unmapped event types', () => {
      expect(parser.parseLine(JSON.stringify({ type: 'unknown', data: 'test' }))).toBeNull()
    })

    test('returns null for invalid JSON', () => {
      expect(parser.parseLine('not valid json')).toBeNull()
    })

    test('returns null for empty line', () => {
      expect(parser.parseLine('')).toBeNull()
    })

    test('preserves raw event in result', () => {
      const event = { type: 'assistant', message: { text: 'Hi' } }
      const update = firstUpdate(parser.parseLine(JSON.stringify(event)))
      expect(update?.raw).toEqual(event)
    })
  })

  describe('parseLine with array wildcards', () => {
    // First mapping matches array items by their `type`; second uses value '*' on the items themselves.
    const wildcardParser = createOutputParser(
      parseHeadlessConfig({
        version: 1,
        name: 'wildcard-test',
        command: ['test'],
        sessionMode: 'stream',
        prompt: { flag: '-p' },
        output: { flag: '--output', value: 'json' },
        outputEvents: [
          {
            match: { path: '$.message.content[*].type', value: 'tool_use' },
            emitAs: 'tool_call',
            extract: { title: '$.name', status: "'pending'" },
          },
          {
            match: { path: '$.items[*]', value: '*' },
            emitAs: 'message',
            extract: { content: '$.text' },
          },
        ],
        result: {
          matchPath: '$.type',
          matchValue: 'result',
          contentPath: '$.output',
        },
      }),
    )

    /** Serializes `event` to one JSON line and runs it through the wildcard parser. */
    const parseEvent = (event: unknown) => wildcardParser.parseLine(JSON.stringify(event))

    test('returns array of updates for matching array items', () => {
      const updates = expectUpdateList(
        parseEvent({
          message: {
            content: [
              { type: 'tool_use', name: 'Read', input: {} },
              { type: 'text', value: 'Hello' },
              { type: 'tool_use', name: 'Write', input: {} },
            ],
          },
        }),
      )
      expect(updates).toHaveLength(2)
      const [first, second] = updates
      expect(first?.type).toBe('tool_call')
      expect(first?.title).toBe('Read')
      expect(first?.status).toBe('pending')
      expect(second?.type).toBe('tool_call')
      expect(second?.title).toBe('Write')
      expect(second?.status).toBe('pending')
    })

    test('handles empty array gracefully', () => {
      expect(parseEvent({ message: { content: [] } })).toBeNull()
    })

    test('handles non-matching array items', () => {
      const outcome = parseEvent({
        message: {
          content: [
            { type: 'text', value: 'No tool use here' },
            { type: 'image', data: 'base64...' },
          ],
        },
      })
      expect(outcome).toBeNull()
    })

    test('matches wildcard value for all non-null items', () => {
      const updates = expectUpdateList(
        parseEvent({
          items: [{ text: 'Item 1' }, { text: 'Item 2' }, { text: 'Item 3' }],
        }),
      )
      expect(updates).toHaveLength(3)
      expect(updates[0]?.content).toBe('Item 1')
      expect(updates[1]?.content).toBe('Item 2')
      expect(updates[2]?.content).toBe('Item 3')
    })

    test('handles mixed array content with type guards', () => {
      // Only the two objects whose `type` is 'tool_use' are expected to survive.
      const updates = expectUpdateList(
        parseEvent({
          message: {
            content: [
              { type: 'tool_use', name: 'Valid' },
              'string-item',
              { no_type_property: true },
              null,
              { type: 'tool_use', name: 'AlsoValid' },
            ],
          },
        }),
      )
      expect(updates).toHaveLength(2)
      expect(updates[0]?.title).toBe('Valid')
      expect(updates[1]?.title).toBe('AlsoValid')
    })
  })

  describe('jsonPath with array wildcard', () => {
    test('extracts array with [*] wildcard', () => {
      const extracted = jsonPath({ items: [{ id: 1 }, { id: 2 }] }, '$.items[*]')
      expect(Array.isArray(extracted)).toBe(true)
      if (Array.isArray(extracted)) {
        expect(extracted).toHaveLength(2)
      }
    })

    test('returns undefined for non-array at wildcard position', () => {
      expect(jsonPath({ items: 'not-an-array' }, '$.items[*]')).toBeUndefined()
    })

    test('handles empty array', () => {
      expect(jsonPath({ items: [] }, '$.items[*]')).toEqual([])
    })

    test('handles nested path to array', () => {
      expect(jsonPath({ message: { content: [1, 2, 3] } }, '$.message.content[*]')).toEqual([1, 2, 3])
    })

    test('returns undefined when path before wildcard is invalid', () => {
      expect(jsonPath({ items: [1, 2, 3] }, '$.missing[*]')).toBeUndefined()
    })
  })

  describe('parseResult', () => {
    test('detects result event', () => {
      const outcome = parser.parseResult(JSON.stringify({ type: 'result', result: 'Final answer' }))
      expect(outcome.isResult).toBe(true)
      if (outcome.isResult) {
        expect(outcome.content).toBe('Final answer')
      }
    })

    test('returns not-result for non-result events', () => {
      const outcome = parser.parseResult(JSON.stringify({ type: 'assistant', message: { text: 'Hi' } }))
      expect(outcome.isResult).toBe(false)
    })

    test('returns not-result for invalid JSON', () => {
      expect(parser.parseResult('invalid').isResult).toBe(false)
    })

    test('handles missing content path', () => {
      // A result event without the configured content field is expected to yield empty content.
      const outcome = parser.parseResult(JSON.stringify({ type: 'result' }))
      expect(outcome.isResult).toBe(true)
      if (outcome.isResult) {
        expect(outcome.content).toBe('')
      }
    })
  })
})
576
+
577
+ // ============================================================================
578
+ // History Builder Tests
579
+ // ============================================================================
580
+
581
/**
 * Suite for `createHistoryBuilder`: turn bookkeeping, template formatting,
 * prompt assembly, and isolation of the array returned by `getHistory`.
 */
describe('createHistoryBuilder', () => {
  /** Creates a default builder and records the given `[input, output]` turns in order. */
  const builderWith = (...turns: Array<[input: string, output: string]>) => {
    const seeded = createHistoryBuilder()
    for (const [input, output] of turns) {
      seeded.addTurn(input, output)
    }
    return seeded
  }

  describe('basic operations', () => {
    test('starts with empty history', () => {
      const fresh = builderWith()
      expect(fresh.getLength()).toBe(0)
      expect(fresh.getHistory()).toEqual([])
    })

    test('adds turns to history', () => {
      const single = builderWith(['Hello', 'Hi there'])
      expect(single.getLength()).toBe(1)
      expect(single.getHistory()).toEqual([{ input: 'Hello', output: 'Hi there' }])
    })

    test('accumulates multiple turns', () => {
      expect(builderWith(['Hello', 'Hi'], ['How are you?', 'Fine']).getLength()).toBe(2)
    })

    test('clears history', () => {
      const cleared = builderWith(['Hello', 'Hi'])
      cleared.clear()
      expect(cleared.getLength()).toBe(0)
    })
  })

  describe('formatHistory', () => {
    test('uses default template', () => {
      expect(builderWith(['Hello', 'Hi there']).formatHistory()).toBe('User: Hello\nAssistant: Hi there')
    })

    test('uses custom template', () => {
      const custom = createHistoryBuilder({
        template: 'Q: {{input}}\nA: {{output}}',
      })
      custom.addTurn('Question', 'Answer')
      expect(custom.formatHistory()).toBe('Q: Question\nA: Answer')
    })

    test('separates multiple turns with double newline', () => {
      const twoTurns = builderWith(['First', 'One'], ['Second', 'Two'])
      expect(twoTurns.formatHistory()).toBe('User: First\nAssistant: One\n\nUser: Second\nAssistant: Two')
    })

    test('returns empty string for no history', () => {
      expect(builderWith().formatHistory()).toBe('')
    })
  })

  describe('buildPrompt', () => {
    test('returns just input for first turn', () => {
      expect(builderWith().buildPrompt('Hello')).toBe('Hello')
    })

    test('includes history for subsequent turns', () => {
      const prompt = builderWith(['Hello', 'Hi']).buildPrompt('Next question')
      for (const fragment of ['User: Hello', 'Assistant: Hi', 'User: Next question']) {
        expect(prompt).toContain(fragment)
      }
    })

    test('builds complete context with multiple turns', () => {
      const prompt = builderWith(['One', 'Reply one'], ['Two', 'Reply two']).buildPrompt('Three')
      for (const fragment of ['User: One', 'User: Two', 'User: Three']) {
        expect(prompt).toContain(fragment)
      }
    })
  })

  describe('getHistory returns copy', () => {
    test('modifying returned array does not affect internal state', () => {
      const guarded = builderWith(['Hello', 'Hi'])
      // Push onto the returned array; the builder's own length must stay at one.
      guarded.getHistory().push({ input: 'Fake', output: 'Fake' })
      expect(guarded.getLength()).toBe(1)
    })
  })
})