@plaited/agent-eval-harness 0.12.1 → 0.13.0

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -25,7 +25,7 @@ export ANTHROPIC_API_KEY=sk-... # For Claude
25
25
  export GEMINI_API_KEY=... # For Gemini
26
26
  ```
27
27
 
28
- Pre-built schemas are available in `.agents/skills/headless-adapters/schemas/` for Claude and Gemini.
28
+ Create adapter schemas for any CLI agent that outputs JSON — see the [Schema Creation Guide](.agents/skills/headless-adapters/references/schema-creation-guide.md).
29
29
 
30
30
  ### Core Commands
31
31
 
@@ -416,7 +416,7 @@ ANTHROPIC_API_KEY=sk-... GEMINI_API_KEY=... \
416
416
  ## Requirements
417
417
 
418
418
  - **Runtime:** Bun >= 1.2.9
419
- - **Schema:** JSON schema describing CLI agent interaction (see `.agents/skills/headless-adapters/schemas/`)
419
+ - **Schema:** JSON schema describing CLI agent interaction (see [Schema Creation Guide](.agents/skills/headless-adapters/references/schema-creation-guide.md))
420
420
  - **API Key:** `ANTHROPIC_API_KEY` for Claude, `GEMINI_API_KEY` for Gemini
421
421
 
422
422
  ## License
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.12.1",
3
+ "version": "0.13.0",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -56,12 +56,12 @@
56
56
  ]
57
57
  },
58
58
  "dependencies": {
59
- "@plaited/development-skills": "0.7.0",
59
+ "@plaited/development-skills": "0.8.0",
60
60
  "zod": "^4.3.6"
61
61
  },
62
62
  "devDependencies": {
63
- "@biomejs/biome": "2.3.12",
64
- "@types/bun": "1.3.6",
63
+ "@biomejs/biome": "2.3.14",
64
+ "@types/bun": "1.3.9",
65
65
  "format-package": "7.0.0",
66
66
  "lint-staged": "16.2.7",
67
67
  "typescript": "5.9.3"
@@ -108,6 +108,7 @@ describe('extractTrajectory', () => {
108
108
  {
109
109
  type: 'thought',
110
110
  content: 'Let me think about this...',
111
+ timestamp: 100,
111
112
  raw: { type: 'thought', text: 'Let me think about this...' },
112
113
  },
113
114
  ]
@@ -125,6 +126,7 @@ describe('extractTrajectory', () => {
125
126
  {
126
127
  type: 'message',
127
128
  content: 'Here is my answer.',
129
+ timestamp: 200,
128
130
  raw: { type: 'message', text: 'Here is my answer.' },
129
131
  },
130
132
  ]
@@ -143,6 +145,7 @@ describe('extractTrajectory', () => {
143
145
  type: 'tool_call',
144
146
  title: 'Read',
145
147
  status: 'pending',
148
+ timestamp: 300,
146
149
  raw: { tool: 'Read', input: { file_path: '/test.ts' } },
147
150
  },
148
151
  ]
@@ -160,6 +163,7 @@ describe('extractTrajectory', () => {
160
163
  const updates: ParsedUpdate[] = [
161
164
  {
162
165
  type: 'plan',
166
+ timestamp: 400,
163
167
  raw: {
164
168
  entries: [
165
169
  { content: 'Step 1', status: 'completed' },
@@ -185,29 +189,26 @@ describe('extractTrajectory', () => {
185
189
  })
186
190
 
187
191
  test('assigns timestamps relative to start time', () => {
188
- const originalNow = Date.now
189
- try {
190
- let currentTime = 1000
191
-
192
- Date.now = () => currentTime
193
-
194
- const updates: ParsedUpdate[] = [
195
- {
196
- type: 'message',
197
- content: 'First',
198
- raw: { type: 'message', text: 'First' },
199
- },
200
- ]
201
-
202
- const startTime = 1000
203
- currentTime = 1500 // 500ms later
192
+ const startTime = 1000
193
+ const updates: ParsedUpdate[] = [
194
+ {
195
+ type: 'message',
196
+ content: 'First',
197
+ timestamp: 1500,
198
+ raw: { type: 'message', text: 'First' },
199
+ },
200
+ {
201
+ type: 'message',
202
+ content: 'Second',
203
+ timestamp: 2000,
204
+ raw: { type: 'message', text: 'Second' },
205
+ },
206
+ ]
204
207
 
205
- const trajectory = extractTrajectory(updates, startTime)
208
+ const trajectory = extractTrajectory(updates, startTime)
206
209
 
207
- expect(trajectory[0]?.timestamp).toBe(500)
208
- } finally {
209
- Date.now = originalNow
210
- }
210
+ expect(trajectory[0]?.timestamp).toBe(500)
211
+ expect(trajectory[1]?.timestamp).toBe(1000)
211
212
  })
212
213
 
213
214
  test('handles updates without content for message/thought types', () => {
@@ -215,11 +216,13 @@ describe('extractTrajectory', () => {
215
216
  {
216
217
  type: 'message',
217
218
  content: undefined, // No content - will have empty string
219
+ timestamp: 100,
218
220
  raw: { type: 'message' },
219
221
  },
220
222
  {
221
223
  type: 'message',
222
224
  content: 'Has content',
225
+ timestamp: 200,
223
226
  raw: { type: 'message', text: 'Has content' },
224
227
  },
225
228
  ]
@@ -231,6 +234,113 @@ describe('extractTrajectory', () => {
231
234
  expect(trajectory[0]?.type).toBe('message')
232
235
  expect(trajectory[1]?.type).toBe('message')
233
236
  })
237
+
238
+ test('attaches input to new tool call from update', () => {
239
+ const updates: ParsedUpdate[] = [
240
+ {
241
+ type: 'tool_call',
242
+ title: 'Read',
243
+ status: 'pending',
244
+ input: { file_path: '/src/main.ts' },
245
+ timestamp: 500,
246
+ raw: {},
247
+ },
248
+ ]
249
+
250
+ const trajectory = extractTrajectory(updates, baseTime)
251
+
252
+ expect(trajectory).toHaveLength(1)
253
+ const step = trajectory[0]!
254
+ expect(step.type === 'tool_call' && step.input).toEqual({ file_path: '/src/main.ts' })
255
+ })
256
+
257
+ test('attaches output to tool call on completion', () => {
258
+ const updates: ParsedUpdate[] = [
259
+ {
260
+ type: 'tool_call',
261
+ title: 'Read',
262
+ status: 'pending',
263
+ input: { file_path: '/src/main.ts' },
264
+ timestamp: 500,
265
+ raw: {},
266
+ },
267
+ {
268
+ type: 'tool_call',
269
+ title: 'Read',
270
+ status: 'completed',
271
+ output: 'file contents here',
272
+ timestamp: 800,
273
+ raw: {},
274
+ },
275
+ ]
276
+
277
+ const trajectory = extractTrajectory(updates, baseTime)
278
+
279
+ expect(trajectory).toHaveLength(1)
280
+ const step = trajectory[0]!
281
+ expect(step.type).toBe('tool_call')
282
+ if (step.type === 'tool_call') {
283
+ expect(step.input).toEqual({ file_path: '/src/main.ts' })
284
+ expect(step.output).toBe('file contents here')
285
+ expect(step.status).toBe('completed')
286
+ expect(step.duration).toBe(300)
287
+ }
288
+ })
289
+
290
+ test('handles sequential same-named tool calls independently', () => {
291
+ const updates: ParsedUpdate[] = [
292
+ // First Read: pending → completed
293
+ {
294
+ type: 'tool_call',
295
+ title: 'Read',
296
+ status: 'pending',
297
+ input: { file_path: '/src/a.ts' },
298
+ timestamp: 100,
299
+ raw: {},
300
+ },
301
+ {
302
+ type: 'tool_call',
303
+ title: 'Read',
304
+ status: 'completed',
305
+ output: 'contents of a.ts',
306
+ timestamp: 300,
307
+ raw: {},
308
+ },
309
+ // Second Read: pending → completed (same tool name, different args)
310
+ {
311
+ type: 'tool_call',
312
+ title: 'Read',
313
+ status: 'pending',
314
+ input: { file_path: '/src/b.ts' },
315
+ timestamp: 500,
316
+ raw: {},
317
+ },
318
+ {
319
+ type: 'tool_call',
320
+ title: 'Read',
321
+ status: 'completed',
322
+ output: 'contents of b.ts',
323
+ timestamp: 700,
324
+ raw: {},
325
+ },
326
+ ]
327
+
328
+ const trajectory = extractTrajectory(updates, baseTime)
329
+
330
+ // Both calls should appear as separate trajectory steps
331
+ const toolCalls = trajectory.filter((s) => s.type === 'tool_call')
332
+ expect(toolCalls).toHaveLength(2)
333
+
334
+ const first = toolCalls[0]!
335
+ expect(first.type === 'tool_call' && first.input).toEqual({ file_path: '/src/a.ts' })
336
+ expect(first.type === 'tool_call' && first.output).toBe('contents of a.ts')
337
+ expect(first.type === 'tool_call' && first.status).toBe('completed')
338
+
339
+ const second = toolCalls[1]!
340
+ expect(second.type === 'tool_call' && second.input).toEqual({ file_path: '/src/b.ts' })
341
+ expect(second.type === 'tool_call' && second.output).toBe('contents of b.ts')
342
+ expect(second.type === 'tool_call' && second.status).toBe('completed')
343
+ })
234
344
  })
235
345
 
236
346
  // ============================================================================
@@ -123,7 +123,7 @@ describe('extractTrajectory', () => {
123
123
  const startTime = 1000
124
124
 
125
125
  test('extracts message updates', () => {
126
- const updates: ParsedUpdate[] = [{ type: 'message', content: 'Hello', raw: {} }]
126
+ const updates: ParsedUpdate[] = [{ type: 'message', content: 'Hello', timestamp: 1100, raw: {} }]
127
127
  const trajectory = extractTrajectory(updates, startTime)
128
128
  expect(trajectory.length).toBe(1)
129
129
  expect(trajectory[0]?.type).toBe('message')
@@ -131,7 +131,7 @@ describe('extractTrajectory', () => {
131
131
  })
132
132
 
133
133
  test('extracts thought updates', () => {
134
- const updates: ParsedUpdate[] = [{ type: 'thought', content: 'Thinking...', raw: {} }]
134
+ const updates: ParsedUpdate[] = [{ type: 'thought', content: 'Thinking...', timestamp: 1200, raw: {} }]
135
135
  const trajectory = extractTrajectory(updates, startTime)
136
136
  expect(trajectory.length).toBe(1)
137
137
  expect(trajectory[0]?.type).toBe('thought')
@@ -143,6 +143,7 @@ describe('extractTrajectory', () => {
143
143
  type: 'tool_call',
144
144
  title: 'Read',
145
145
  status: 'completed',
146
+ timestamp: 1300,
146
147
  raw: {},
147
148
  },
148
149
  ]
@@ -30,7 +30,7 @@ export const extractTrajectory = (updates: ParsedUpdate[], startTime: number): T
30
30
  const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()
31
31
 
32
32
  for (const update of updates) {
33
- const timestamp = Date.now() - startTime
33
+ const timestamp = update.timestamp - startTime
34
34
 
35
35
  if (update.type === 'thought') {
36
36
  trajectory.push({
@@ -45,19 +45,25 @@ export const extractTrajectory = (updates: ParsedUpdate[], startTime: number): T
45
45
  timestamp,
46
46
  })
47
47
  } else if (update.type === 'tool_call') {
48
- const toolCallId = update.title ?? `tool_${Date.now()}`
48
+ const toolCallId = update.title ?? `tool_${timestamp}`
49
49
  const existing = toolCallMap.get(toolCallId)
50
50
 
51
51
  if (existing && update.status === 'completed') {
52
52
  // Update existing tool call with completion info
53
53
  existing.step.status = update.status
54
54
  existing.step.duration = timestamp - existing.start
55
+ if (update.output !== undefined) {
56
+ existing.step.output = update.output
57
+ }
58
+ // Remove from map so a subsequent call with the same name starts fresh
59
+ toolCallMap.delete(toolCallId)
55
60
  } else if (!existing) {
56
61
  // New tool call
57
62
  const step: TrajectoryStep & { type: 'tool_call' } = {
58
63
  type: 'tool_call',
59
64
  name: update.title ?? 'unknown',
60
65
  status: update.status ?? 'pending',
66
+ ...(update.input !== undefined && { input: update.input }),
61
67
  timestamp,
62
68
  }
63
69
  toolCallMap.set(toolCallId, { start: timestamp, step })
@@ -23,6 +23,9 @@ export type ParsedUpdate = {
23
23
  content?: string
24
24
  title?: string
25
25
  status?: string
26
+ input?: unknown
27
+ output?: unknown
28
+ timestamp: number
26
29
  raw: unknown
27
30
  }
28
31
 
@@ -200,6 +203,9 @@ const parsePassthrough = (line: string, typeMap: PassthroughTypeMap): ParsedUpda
200
203
  content: typeof event.content === 'string' ? event.content : undefined,
201
204
  title: typeof event.name === 'string' ? event.name : typeof event.title === 'string' ? event.title : undefined,
202
205
  status: typeof event.status === 'string' ? event.status : undefined,
206
+ input: event.input,
207
+ output: event.output,
208
+ timestamp: Date.now(),
203
209
  raw: event,
204
210
  }
205
211
  }
@@ -210,6 +216,9 @@ const parsePassthrough = (line: string, typeMap: PassthroughTypeMap): ParsedUpda
210
216
  content: typeof event.content === 'string' ? event.content : undefined,
211
217
  title: typeof event.name === 'string' ? event.name : typeof event.title === 'string' ? event.title : undefined,
212
218
  status: typeof event.status === 'string' ? event.status : undefined,
219
+ input: event.input,
220
+ output: event.output,
221
+ timestamp: Date.now(),
213
222
  raw: event,
214
223
  }
215
224
  }
@@ -307,6 +316,7 @@ export const createOutputParser = (config: HeadlessAdapterConfig) => {
307
316
  const createUpdate = (event: unknown, mapping: OutputEventMapping): ParsedUpdate => {
308
317
  const update: ParsedUpdate = {
309
318
  type: mapping.emitAs,
319
+ timestamp: Date.now(),
310
320
  raw: event,
311
321
  }
312
322
 
@@ -320,6 +330,18 @@ export const createOutputParser = (config: HeadlessAdapterConfig) => {
320
330
  if (mapping.extract.status) {
321
331
  update.status = jsonPathString(event, mapping.extract.status)
322
332
  }
333
+ if (mapping.extract.input) {
334
+ const value = jsonPath(event, mapping.extract.input)
335
+ if (value !== undefined) {
336
+ update.input = value
337
+ }
338
+ }
339
+ if (mapping.extract.output) {
340
+ const value = jsonPath(event, mapping.extract.output)
341
+ if (value !== undefined) {
342
+ update.output = value
343
+ }
344
+ }
323
345
  }
324
346
 
325
347
  return update
@@ -36,18 +36,29 @@ export type OutputEventMatch = z.infer<typeof OutputEventMatchSchema>
36
36
  * Schema for extracting content from matched events.
37
37
  *
38
38
  * @remarks
39
+ * Known fields (`content`, `title`, `status`, `input`, `output`) are used by the
40
+ * output parser to populate `ParsedUpdate` properties. Additional string-valued
41
+ * fields are preserved during validation for forward compatibility but are not
42
+ * consumed by the parser.
43
+ *
39
44
  * Paths can be:
40
45
  * - JSONPath expressions (e.g., "$.message.text")
41
46
  * - Literal strings in single quotes (e.g., "'pending'")
42
47
  */
43
- export const OutputEventExtractSchema = z.object({
44
- /** JSONPath to extract main content */
45
- content: z.string().optional(),
46
- /** JSONPath to extract title (for tool calls) */
47
- title: z.string().optional(),
48
- /** JSONPath to extract status (or literal like "'pending'") */
49
- status: z.string().optional(),
50
- })
48
+ export const OutputEventExtractSchema = z
49
+ .object({
50
+ /** JSONPath to extract main content */
51
+ content: z.string().optional(),
52
+ /** JSONPath to extract title (for tool calls) */
53
+ title: z.string().optional(),
54
+ /** JSONPath to extract status (or literal like "'pending'") */
55
+ status: z.string().optional(),
56
+ /** JSONPath to extract tool input arguments (e.g., "$.input") */
57
+ input: z.string().optional(),
58
+ /** JSONPath to extract tool output/result content (e.g., "$.content") */
59
+ output: z.string().optional(),
60
+ })
61
+ .catchall(z.string())
51
62
 
52
63
  /** Output event extract type */
53
64
  export type OutputEventExtract = z.infer<typeof OutputEventExtractSchema>
@@ -0,0 +1,40 @@
1
+ {
2
+ "version": 1,
3
+ "name": "claude-headless",
4
+ "command": ["claude"],
5
+ "sessionMode": "stream",
6
+ "prompt": {
7
+ "flag": "-p"
8
+ },
9
+ "output": {
10
+ "flag": "--output-format",
11
+ "value": "stream-json"
12
+ },
13
+ "autoApprove": ["--dangerously-skip-permissions", "--verbose"],
14
+ "resume": {
15
+ "flag": "--resume",
16
+ "sessionIdPath": "$.session_id"
17
+ },
18
+ "outputEvents": [
19
+ {
20
+ "match": { "path": "$.type", "value": "assistant" },
21
+ "emitAs": "message",
22
+ "extract": { "content": "$.message.content[0].text" }
23
+ },
24
+ {
25
+ "match": { "path": "$.type", "value": "tool_use" },
26
+ "emitAs": "tool_call",
27
+ "extract": { "title": "$.name", "status": "'pending'", "input": "$.input" }
28
+ },
29
+ {
30
+ "match": { "path": "$.type", "value": "tool_result" },
31
+ "emitAs": "tool_call",
32
+ "extract": { "title": "$.name", "status": "'completed'", "output": "$.content" }
33
+ }
34
+ ],
35
+ "result": {
36
+ "matchPath": "$.type",
37
+ "matchValue": "result",
38
+ "contentPath": "$.result"
39
+ }
40
+ }
@@ -0,0 +1,37 @@
1
+ {
2
+ "version": 1,
3
+ "name": "gemini-headless",
4
+ "command": ["gemini"],
5
+ "sessionMode": "iterative",
6
+ "prompt": {
7
+ "flag": ""
8
+ },
9
+ "output": {
10
+ "flag": "--output-format",
11
+ "value": "stream-json"
12
+ },
13
+ "autoApprove": ["--sandbox", "false"],
14
+ "outputEvents": [
15
+ {
16
+ "match": { "path": "$.type", "value": "message" },
17
+ "emitAs": "message",
18
+ "extract": { "content": "$.content" }
19
+ },
20
+ {
21
+ "match": { "path": "$.type", "value": "tool_use" },
22
+ "emitAs": "tool_call",
23
+ "extract": { "title": "$.tool_name", "status": "'pending'", "input": "$.args" }
24
+ },
25
+ {
26
+ "match": { "path": "$.type", "value": "tool_result" },
27
+ "emitAs": "tool_call",
28
+ "extract": { "title": "$.tool_name", "status": "'completed'", "output": "$.output" }
29
+ }
30
+ ],
31
+ "result": {
32
+ "matchPath": "$.type",
33
+ "matchValue": "result",
34
+ "contentPath": "$.content"
35
+ },
36
+ "historyTemplate": "User: {{input}}\nAssistant: {{output}}"
37
+ }
@@ -36,7 +36,12 @@ const validClaudeSchema = {
36
36
  {
37
37
  match: { path: '$.type', value: 'tool_use' },
38
38
  emitAs: 'tool_call',
39
- extract: { title: '$.name', status: "'pending'" },
39
+ extract: { title: '$.name', status: "'pending'", input: '$.input' },
40
+ },
41
+ {
42
+ match: { path: '$.type', value: 'tool_result' },
43
+ emitAs: 'tool_call',
44
+ extract: { title: '$.name', status: "'completed'", output: '$.content' },
40
45
  },
41
46
  ],
42
47
  result: {
@@ -86,21 +91,83 @@ describe('HeadlessAdapterSchema', () => {
86
91
  })
87
92
 
88
93
  describe('validates schema files from disk', () => {
89
- const schemasDir = '.claude/skills/headless-adapters/schemas'
94
+ const fixturesDir = 'src/headless/tests/fixtures'
90
95
 
91
96
  test('validates claude-headless.json from disk', async () => {
92
- const content = await Bun.file(`${schemasDir}/claude-headless.json`).json()
97
+ const content = await Bun.file(`${fixturesDir}/claude-headless.json`).json()
93
98
  const result = HeadlessAdapterSchema.safeParse(content)
94
99
  expect(result.success).toBe(true)
95
100
  })
96
101
 
97
102
  test('validates gemini-headless.json from disk', async () => {
98
- const content = await Bun.file(`${schemasDir}/gemini-headless.json`).json()
103
+ const content = await Bun.file(`${fixturesDir}/gemini-headless.json`).json()
99
104
  const result = HeadlessAdapterSchema.safeParse(content)
100
105
  expect(result.success).toBe(true)
101
106
  })
102
107
  })
103
108
 
109
+ describe('extract input/output fields', () => {
110
+ test('validates schema with input and output in extract config', () => {
111
+ const schemaWithIO = {
112
+ ...validClaudeSchema,
113
+ outputEvents: [
114
+ ...validClaudeSchema.outputEvents,
115
+ {
116
+ match: { path: '$.type', value: 'custom' },
117
+ emitAs: 'tool_call',
118
+ extract: { title: '$.name', input: '$.args', output: '$.result' },
119
+ },
120
+ ],
121
+ }
122
+ const result = HeadlessAdapterSchema.safeParse(schemaWithIO)
123
+ expect(result.success).toBe(true)
124
+ })
125
+
126
+ test('preserves extra extract fields via catchall', () => {
127
+ const schemaWithExtras = {
128
+ ...validClaudeSchema,
129
+ outputEvents: [
130
+ {
131
+ match: { path: '$.type', value: 'tool_use' },
132
+ emitAs: 'tool_call',
133
+ extract: {
134
+ title: '$.name',
135
+ status: "'pending'",
136
+ input: '$.input',
137
+ toolName: '$.name',
138
+ mcpServer: '$.server',
139
+ },
140
+ },
141
+ ],
142
+ }
143
+ const result = HeadlessAdapterSchema.safeParse(schemaWithExtras)
144
+ expect(result.success).toBe(true)
145
+ if (result.success) {
146
+ const extract = result.data.outputEvents![0]!.extract!
147
+ expect(extract.title).toBe('$.name')
148
+ expect(extract.input).toBe('$.input')
149
+ // Catchall fields aren't in the inferred type — cast needed to access them
150
+ expect((extract as Record<string, string>).toolName).toBe('$.name')
151
+ expect((extract as Record<string, string>).mcpServer).toBe('$.server')
152
+ }
153
+ })
154
+
155
+ test('rejects non-string extra extract fields', () => {
156
+ const schemaWithBadExtras = {
157
+ ...validClaudeSchema,
158
+ outputEvents: [
159
+ {
160
+ match: { path: '$.type', value: 'tool_use' },
161
+ emitAs: 'tool_call',
162
+ extract: { title: '$.name', badField: 123 },
163
+ },
164
+ ],
165
+ }
166
+ const result = HeadlessAdapterSchema.safeParse(schemaWithBadExtras)
167
+ expect(result.success).toBe(false)
168
+ })
169
+ })
170
+
104
171
  describe('minimal valid schema', () => {
105
172
  test('validates minimal required fields', () => {
106
173
  const minimal = {
@@ -397,6 +464,70 @@ describe('createOutputParser', () => {
397
464
  const singleResult = Array.isArray(result) ? result[0] : result
398
465
  expect(singleResult?.raw).toEqual(event)
399
466
  })
467
+
468
+ test('extracts input from tool_use event', () => {
469
+ const line = JSON.stringify({ type: 'tool_use', name: 'Read', input: { file_path: '/test.ts' } })
470
+ const result = parser.parseLine(line)
471
+ const singleResult = Array.isArray(result) ? result[0] : result
472
+ expect(singleResult?.input).toEqual({ file_path: '/test.ts' })
473
+ })
474
+
475
+ test('extracts output from tool_result event', () => {
476
+ const line = JSON.stringify({ type: 'tool_result', name: 'Read', content: 'file contents' })
477
+ const result = parser.parseLine(line)
478
+ const singleResult = Array.isArray(result) ? result[0] : result
479
+ expect(singleResult?.output).toBe('file contents')
480
+ })
481
+
482
+ test('sets timestamp on parsed updates', () => {
483
+ const before = Date.now()
484
+ const line = JSON.stringify({ type: 'assistant', message: { text: 'Hello' } })
485
+ const result = parser.parseLine(line)
486
+ const after = Date.now()
487
+ const singleResult = Array.isArray(result) ? result[0] : result
488
+ expect(singleResult?.timestamp).toBeGreaterThanOrEqual(before)
489
+ expect(singleResult?.timestamp).toBeLessThanOrEqual(after)
490
+ })
491
+ })
492
+
493
+ describe('parseLine with extra extract fields', () => {
494
+ test('extra extract fields do not break parser', () => {
495
+ const configWithExtras = parseHeadlessConfig({
496
+ version: 1,
497
+ name: 'extras-test',
498
+ command: ['test'],
499
+ sessionMode: 'stream',
500
+ prompt: { flag: '-p' },
501
+ output: { flag: '--output', value: 'json' },
502
+ outputEvents: [
503
+ {
504
+ match: { path: '$.type', value: 'tool_use' },
505
+ emitAs: 'tool_call',
506
+ extract: {
507
+ title: '$.name',
508
+ status: "'pending'",
509
+ input: '$.input',
510
+ toolName: '$.name',
511
+ mcpServer: '$.server',
512
+ },
513
+ },
514
+ ],
515
+ result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' },
516
+ })
517
+ const extrasParser = createOutputParser(configWithExtras)
518
+ const line = JSON.stringify({
519
+ type: 'tool_use',
520
+ name: 'WebSearch',
521
+ input: { query: 'test' },
522
+ server: 'mcp-search',
523
+ })
524
+ const result = extrasParser.parseLine(line)
525
+ const singleResult = Array.isArray(result) ? result[0] : result
526
+ expect(singleResult).not.toBeNull()
527
+ expect(singleResult?.type).toBe('tool_call')
528
+ expect(singleResult?.title).toBe('WebSearch')
529
+ expect(singleResult?.input).toEqual({ query: 'test' })
530
+ })
400
531
  })
401
532
 
402
533
  describe('parseLine with array wildcards', () => {
@@ -574,6 +705,70 @@ describe('createOutputParser', () => {
574
705
  })
575
706
  })
576
707
 
708
+ // ============================================================================
709
+ // Passthrough Mode Tests
710
+ // ============================================================================
711
+
712
+ describe('passthrough mode', () => {
713
+ const passthroughConfig = parseHeadlessConfig({
714
+ version: 1,
715
+ name: 'passthrough-test',
716
+ command: ['test-agent'],
717
+ sessionMode: 'stream',
718
+ prompt: { flag: '-p' },
719
+ output: { flag: '--output', value: 'json' },
720
+ outputMode: 'passthrough',
721
+ passthroughTypeMap: {
722
+ typeField: 'type',
723
+ typeValues: { tool_use: 'tool_call', tool_result: 'tool_call' },
724
+ },
725
+ result: { matchPath: '$.type', matchValue: 'result', contentPath: '$.content' },
726
+ })
727
+ const passthroughParser = createOutputParser(passthroughConfig)
728
+
729
+ test('extracts input from tool_call event', () => {
730
+ const line = JSON.stringify({ type: 'tool_use', name: 'Read', input: { file_path: '/test.ts' }, status: 'pending' })
731
+ const result = passthroughParser.parseLine(line)
732
+ const singleResult = Array.isArray(result) ? result[0] : result
733
+ expect(singleResult?.type).toBe('tool_call')
734
+ expect(singleResult?.input).toEqual({ file_path: '/test.ts' })
735
+ })
736
+
737
+ test('extracts output from tool_result event', () => {
738
+ const line = JSON.stringify({ type: 'tool_result', name: 'Read', output: 'file contents', status: 'completed' })
739
+ const result = passthroughParser.parseLine(line)
740
+ const singleResult = Array.isArray(result) ? result[0] : result
741
+ expect(singleResult?.type).toBe('tool_call')
742
+ expect(singleResult?.output).toBe('file contents')
743
+ })
744
+
745
+ test('preserves object input type', () => {
746
+ const line = JSON.stringify({ type: 'tool_use', name: 'Write', input: { path: '/a.ts', content: 'code' } })
747
+ const result = passthroughParser.parseLine(line)
748
+ const singleResult = Array.isArray(result) ? result[0] : result
749
+ expect(singleResult?.input).toEqual({ path: '/a.ts', content: 'code' })
750
+ })
751
+
752
+ test('sets timestamp on passthrough updates', () => {
753
+ const before = Date.now()
754
+ const line = JSON.stringify({ type: 'message', content: 'Hello' })
755
+ const result = passthroughParser.parseLine(line)
756
+ const after = Date.now()
757
+ const singleResult = Array.isArray(result) ? result[0] : result
758
+ expect(singleResult?.timestamp).toBeGreaterThanOrEqual(before)
759
+ expect(singleResult?.timestamp).toBeLessThanOrEqual(after)
760
+ })
761
+
762
+ test('handles absent input/output fields gracefully', () => {
763
+ const line = JSON.stringify({ type: 'tool_use', name: 'Bash', status: 'pending' })
764
+ const result = passthroughParser.parseLine(line)
765
+ const singleResult = Array.isArray(result) ? result[0] : result
766
+ expect(singleResult?.type).toBe('tool_call')
767
+ expect(singleResult?.input).toBeUndefined()
768
+ expect(singleResult?.output).toBeUndefined()
769
+ })
770
+ })
771
+
577
772
  // ============================================================================
578
773
  // History Builder Tests
579
774
  // ============================================================================
@@ -3,7 +3,7 @@
3
3
  *
4
4
  * @remarks
5
5
  * Tests verify the headless session manager works correctly with Claude Code CLI
6
- * using the schema-driven approach from `.claude/skills/headless-adapters/schemas/`.
6
+ * using the schema-driven headless adapter approach.
7
7
  *
8
8
  * Run locally with API key:
9
9
  * ```bash
@@ -29,7 +29,7 @@ setDefaultTimeout(120000)
29
29
  const PROJECT_ROOT = process.cwd()
30
30
 
31
31
  // Schema path for Claude headless adapter
32
- const SCHEMA_PATH = join(PROJECT_ROOT, '.claude/skills/headless-adapters/schemas/claude-headless.json')
32
+ const SCHEMA_PATH = join(PROJECT_ROOT, 'src/headless/tests/fixtures/claude-headless.json')
33
33
 
34
34
  // Get API key from environment
35
35
  const API_KEY = process.env.ANTHROPIC_API_KEY ?? ''
@@ -3,7 +3,7 @@
3
3
  *
4
4
  * @remarks
5
5
  * Tests verify the headless session manager works correctly with Gemini CLI
6
- * using the schema-driven approach from `.claude/skills/headless-adapters/schemas/`.
6
+ * using the schema-driven headless adapter approach.
7
7
  *
8
8
  * Run locally with API key:
9
9
  * ```bash
@@ -29,7 +29,7 @@ setDefaultTimeout(120000)
29
29
  const PROJECT_ROOT = process.cwd()
30
30
 
31
31
  // Schema path for Gemini headless adapter
32
- const SCHEMA_PATH = join(PROJECT_ROOT, '.claude/skills/headless-adapters/schemas/gemini-headless.json')
32
+ const SCHEMA_PATH = join(PROJECT_ROOT, 'src/headless/tests/fixtures/gemini-headless.json')
33
33
 
34
34
  // Get API key from environment
35
35
  const GEMINI_API_KEY = process.env.GEMINI_API_KEY ?? ''
@@ -262,6 +262,7 @@ const computeTrialsQualityMetrics = (results: TrialResult[]): QualityComputeResu
262
262
 
263
263
  return {
264
264
  metrics: {
265
+ type: 'trial',
265
266
  avgScore: sum / rawScores.length,
266
267
  medianScore: percentile(sorted, 0.5),
267
268
  p25Score: percentile(sorted, 0.25),
@@ -365,6 +365,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
365
365
  const fails = results.length - passes
366
366
 
367
367
  quality[label] = {
368
+ type: 'run',
368
369
  avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
369
370
  passRate: results.length > 0 ? passes / results.length : 0,
370
371
  passCount: passes,
@@ -105,6 +105,10 @@ describe('runCompare statistical strategy', () => {
105
105
  // Verify reliability metrics include type discriminator
106
106
  expect(report.reliability.high?.type).toBe('run')
107
107
  expect(report.reliability.low?.type).toBe('run')
108
+
109
+ // Verify quality metrics include type discriminator
110
+ expect(report.quality.high?.type).toBe('run')
111
+ expect(report.quality.low?.type).toBe('run')
108
112
  })
109
113
 
110
114
  test('computes confidence intervals for performance metrics', async () => {
@@ -477,6 +477,7 @@ describe('runTrialsCompare', () => {
477
477
  expect(report.quality?.run1).toBeDefined()
478
478
 
479
479
  const qual = report.quality?.run1
480
+ expect(qual?.type).toBe('trial')
480
481
  expect(qual?.avgScore).toBeGreaterThan(0)
481
482
  expect(qual?.medianScore).toBeGreaterThan(0)
482
483
  expect(qual?.p25Score).toBeDefined()
@@ -620,6 +620,8 @@ export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceInterva
620
620
  * Quality metrics for a single run in comparison.
621
621
  */
622
622
  export const QualityMetricsSchema = z.object({
623
+ /** Discriminator for run-level quality metrics */
624
+ type: z.literal('run'),
623
625
  /** Mean grader score (0-1) */
624
626
  avgScore: z.number(),
625
627
  /** Percentage of pass=true results */
@@ -942,6 +944,8 @@ export type TrialsQualityConfidenceIntervals = z.infer<typeof TrialsQualityConfi
942
944
  * Only present when a grader was used during trials capture.
943
945
  */
944
946
  export const TrialsQualityMetricsSchema = z.object({
947
+ /** Discriminator for trial-level quality metrics */
948
+ type: z.literal('trial'),
945
949
  /** Average score across all trials */
946
950
  avgScore: z.number(),
947
951
  /** Median score */