npm - @plaited/agent-eval-harness - Versions diffs - 0.12.1 → 0.13.0 - Mend

@plaited/agent-eval-harness 0.12.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/README.md +2 -2
package/package.json +4 -4
package/src/commands/tests/capture-helpers.spec.ts +131 -21
package/src/core/tests/core.spec.ts +3 -2
package/src/core/trajectory.ts +8 -2
package/src/headless/headless-output-parser.ts +22 -0
package/src/headless/headless.schemas.ts +19 -8
package/src/headless/tests/fixtures/claude-headless.json +40 -0
package/src/headless/tests/fixtures/gemini-headless.json +37 -0
package/src/headless/tests/headless.spec.ts +199 -4
package/src/integration_tests/claude.spec.ts +2 -2
package/src/integration_tests/gemini.spec.ts +2 -2
package/src/pipeline/compare-trials.ts +1 -0
package/src/pipeline/compare.ts +1 -0
package/src/pipeline/tests/compare-statistical.spec.ts +4 -0
package/src/pipeline/tests/compare-trials.spec.ts +1 -0
package/src/schemas/schemas.ts +4 -0

package/README.md CHANGED Viewed

@@ -25,7 +25,7 @@ export ANTHROPIC_API_KEY=sk-...   # For Claude
 export GEMINI_API_KEY=...         # For Gemini
 ```
-Pre-built schemas are available in `.agents/skills/headless-adapters/schemas/` for Claude and Gemini.
+Create adapter schemas for any CLI agent that outputs JSON — see the [Schema Creation Guide](.agents/skills/headless-adapters/references/schema-creation-guide.md).
 ### Core Commands
@@ -416,7 +416,7 @@ ANTHROPIC_API_KEY=sk-... GEMINI_API_KEY=... \
 ## Requirements
 - **Runtime:** Bun >= 1.2.9
-- **Schema:** JSON schema describing CLI agent interaction (see `.agents/skills/headless-adapters/schemas/`)
+- **Schema:** JSON schema describing CLI agent interaction (see [Schema Creation Guide](.agents/skills/headless-adapters/references/schema-creation-guide.md))
 - **API Key:** `ANTHROPIC_API_KEY` for Claude, `GEMINI_API_KEY` for Gemini
 ## License

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.12.1",
+  "version": "0.13.0",
   "description": "CLI tool for capturing agent trajectories from headless CLI agents",
   "license": "ISC",
   "engines": {
@@ -56,12 +56,12 @@
     ]
   },
   "dependencies": {
-    "@plaited/development-skills": "0.7.0",
+    "@plaited/development-skills": "0.8.0",
     "zod": "^4.3.6"
   },
   "devDependencies": {
-    "@biomejs/biome": "2.3.12",
-    "@types/bun": "1.3.6",
+    "@biomejs/biome": "2.3.14",
+    "@types/bun": "1.3.9",
     "format-package": "7.0.0",
     "lint-staged": "16.2.7",
     "typescript": "5.9.3"

package/src/commands/tests/capture-helpers.spec.ts CHANGED Viewed

@@ -108,6 +108,7 @@ describe('extractTrajectory', () => {
       {
         type: 'thought',
         content: 'Let me think about this...',
+        timestamp: 100,
         raw: { type: 'thought', text: 'Let me think about this...' },
       },
     ]
@@ -125,6 +126,7 @@ describe('extractTrajectory', () => {
       {
         type: 'message',
         content: 'Here is my answer.',
+        timestamp: 200,
         raw: { type: 'message', text: 'Here is my answer.' },
       },
     ]
@@ -143,6 +145,7 @@ describe('extractTrajectory', () => {
         type: 'tool_call',
         title: 'Read',
         status: 'pending',
+        timestamp: 300,
         raw: { tool: 'Read', input: { file_path: '/test.ts' } },
       },
     ]
@@ -160,6 +163,7 @@ describe('extractTrajectory', () => {
     const updates: ParsedUpdate[] = [
       {
         type: 'plan',
+        timestamp: 400,
         raw: {
           entries: [
             { content: 'Step 1', status: 'completed' },
@@ -185,29 +189,26 @@ describe('extractTrajectory', () => {
   })
   test('assigns timestamps relative to start time', () => {
-    const originalNow = Date.now
-    try {
-      let currentTime = 1000
-      Date.now = () => currentTime
-      const updates: ParsedUpdate[] = [
-        {
-          type: 'message',
-          content: 'First',
-          raw: { type: 'message', text: 'First' },
-        },
-      ]
-      const startTime = 1000
-      currentTime = 1500 // 500ms later
+    const startTime = 1000
+    const updates: ParsedUpdate[] = [
+      {
+        type: 'message',
+        content: 'First',
+        timestamp: 1500,
+        raw: { type: 'message', text: 'First' },
+      },
+      {
+        type: 'message',
+        content: 'Second',
+        timestamp: 2000,
+        raw: { type: 'message', text: 'Second' },
+      },
+    ]
-      const trajectory = extractTrajectory(updates, startTime)
+    const trajectory = extractTrajectory(updates, startTime)
-      expect(trajectory[0]?.timestamp).toBe(500)
-    } finally {
-      Date.now = originalNow
-    }
+    expect(trajectory[0]?.timestamp).toBe(500)
+    expect(trajectory[1]?.timestamp).toBe(1000)
   })
   test('handles updates without content for message/thought types', () => {
@@ -215,11 +216,13 @@ describe('extractTrajectory', () => {
       {
         type: 'message',
         content: undefined, // No content - will have empty string
+        timestamp: 100,
         raw: { type: 'message' },
       },
       {
         type: 'message',
         content: 'Has content',
+        timestamp: 200,
         raw: { type: 'message', text: 'Has content' },
       },
     ]
@@ -231,6 +234,113 @@ describe('extractTrajectory', () => {
     expect(trajectory[0]?.type).toBe('message')
     expect(trajectory[1]?.type).toBe('message')
   })
+  test('attaches input to new tool call from update', () => {
+    const updates: ParsedUpdate[] = [
+      {
+        type: 'tool_call',
+        title: 'Read',
+        status: 'pending',
+        input: { file_path: '/src/main.ts' },
+        timestamp: 500,
+        raw: {},
+      },
+    ]
+    const trajectory = extractTrajectory(updates, baseTime)
+    expect(trajectory).toHaveLength(1)
+    const step = trajectory[0]!
+    expect(step.type === 'tool_call' && step.input).toEqual({ file_path: '/src/main.ts' })
+  })
+  test('attaches output to tool call on completion', () => {
+    const updates: ParsedUpdate[] = [
+      {
+        type: 'tool_call',
+        title: 'Read',
+        status: 'pending',
+        input: { file_path: '/src/main.ts' },
+        timestamp: 500,
+        raw: {},
+      },
+      {
+        type: 'tool_call',
+        title: 'Read',
+        status: 'completed',
+        output: 'file contents here',
+        timestamp: 800,
+        raw: {},
+      },
+    ]
+    const trajectory = extractTrajectory(updates, baseTime)
+    expect(trajectory).toHaveLength(1)
+    const step = trajectory[0]!
+    expect(step.type).toBe('tool_call')
+    if (step.type === 'tool_call') {
+      expect(step.input).toEqual({ file_path: '/src/main.ts' })
+      expect(step.output).toBe('file contents here')
+      expect(step.status).toBe('completed')
+      expect(step.duration).toBe(300)
+    }
+  })
+  test('handles sequential same-named tool calls independently', () => {
+    const updates: ParsedUpdate[] = [
+      // First Read: pending → completed
+      {
+        type: 'tool_call',
+        title: 'Read',
+        status: 'pending',
+        input: { file_path: '/src/a.ts' },
+        timestamp: 100,
+        raw: {},
+      },
+      {
+        type: 'tool_call',
+        title: 'Read',
+        status: 'completed',
+        output: 'contents of a.ts',
+        timestamp: 300,
+        raw: {},
+      },
+      // Second Read: pending → completed (same tool name, different args)
+      {
+        type: 'tool_call',
+        title: 'Read',
+        status: 'pending',
+        input: { file_path: '/src/b.ts' },
+        timestamp: 500,
+        raw: {},
+      },
+      {
+        type: 'tool_call',
+        title: 'Read',
+        status: 'completed',
+        output: 'contents of b.ts',
+        timestamp: 700,
+        raw: {},
+      },
+    ]
+    const trajectory = extractTrajectory(updates, baseTime)
+    // Both calls should appear as separate trajectory steps
+    const toolCalls = trajectory.filter((s) => s.type === 'tool_call')
+    expect(toolCalls).toHaveLength(2)
+    const first = toolCalls[0]!
+    expect(first.type === 'tool_call' && first.input).toEqual({ file_path: '/src/a.ts' })
+    expect(first.type === 'tool_call' && first.output).toBe('contents of a.ts')
+    expect(first.type === 'tool_call' && first.status).toBe('completed')
+    const second = toolCalls[1]!
+    expect(second.type === 'tool_call' && second.input).toEqual({ file_path: '/src/b.ts' })
+    expect(second.type === 'tool_call' && second.output).toBe('contents of b.ts')
+    expect(second.type === 'tool_call' && second.status).toBe('completed')
+  })
 })
 // ============================================================================

package/src/core/tests/core.spec.ts CHANGED Viewed

@@ -123,7 +123,7 @@ describe('extractTrajectory', () => {
   const startTime = 1000
   test('extracts message updates', () => {
-    const updates: ParsedUpdate[] = [{ type: 'message', content: 'Hello', raw: {} }]
+    const updates: ParsedUpdate[] = [{ type: 'message', content: 'Hello', timestamp: 1100, raw: {} }]
     const trajectory = extractTrajectory(updates, startTime)
     expect(trajectory.length).toBe(1)
     expect(trajectory[0]?.type).toBe('message')
@@ -131,7 +131,7 @@ describe('extractTrajectory', () => {
   })
   test('extracts thought updates', () => {
-    const updates: ParsedUpdate[] = [{ type: 'thought', content: 'Thinking...', raw: {} }]
+    const updates: ParsedUpdate[] = [{ type: 'thought', content: 'Thinking...', timestamp: 1200, raw: {} }]
     const trajectory = extractTrajectory(updates, startTime)
     expect(trajectory.length).toBe(1)
     expect(trajectory[0]?.type).toBe('thought')
@@ -143,6 +143,7 @@ describe('extractTrajectory', () => {
         type: 'tool_call',
         title: 'Read',
         status: 'completed',
+        timestamp: 1300,
         raw: {},
       },
     ]

package/src/core/trajectory.ts CHANGED Viewed

@@ -30,7 +30,7 @@ export const extractTrajectory = (updates: ParsedUpdate[], startTime: number): T
   const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()
   for (const update of updates) {
-    const timestamp = Date.now() - startTime
+    const timestamp = update.timestamp - startTime
     if (update.type === 'thought') {
       trajectory.push({
@@ -45,19 +45,25 @@ export const extractTrajectory = (updates: ParsedUpdate[], startTime: number): T
         timestamp,
       })
     } else if (update.type === 'tool_call') {
-      const toolCallId = update.title ?? `tool_${Date.now()}`
+      const toolCallId = update.title ?? `tool_${timestamp}`
       const existing = toolCallMap.get(toolCallId)
       if (existing && update.status === 'completed') {
         // Update existing tool call with completion info
         existing.step.status = update.status
         existing.step.duration = timestamp - existing.start
+        if (update.output !== undefined) {
+          existing.step.output = update.output
+        }
+        // Remove from map so a subsequent call with the same name starts fresh
+        toolCallMap.delete(toolCallId)
       } else if (!existing) {
         // New tool call
         const step: TrajectoryStep & { type: 'tool_call' } = {
           type: 'tool_call',
           name: update.title ?? 'unknown',
           status: update.status ?? 'pending',
+          ...(update.input !== undefined && { input: update.input }),
           timestamp,
         }
         toolCallMap.set(toolCallId, { start: timestamp, step })

package/src/headless/headless-output-parser.ts CHANGED Viewed

@@ -23,6 +23,9 @@ export type ParsedUpdate = {
   content?: string
   title?: string
   status?: string
+  input?: unknown
+  output?: unknown
+  timestamp: number
   raw: unknown
 }
@@ -200,6 +203,9 @@ const parsePassthrough = (line: string, typeMap: PassthroughTypeMap): ParsedUpda
       content: typeof event.content === 'string' ? event.content : undefined,
       title: typeof event.name === 'string' ? event.name : typeof event.title === 'string' ? event.title : undefined,
       status: typeof event.status === 'string' ? event.status : undefined,
+      input: event.input,
+      output: event.output,
+      timestamp: Date.now(),
       raw: event,
     }
   }
@@ -210,6 +216,9 @@ const parsePassthrough = (line: string, typeMap: PassthroughTypeMap): ParsedUpda
     content: typeof event.content === 'string' ? event.content : undefined,
     title: typeof event.name === 'string' ? event.name : typeof event.title === 'string' ? event.title : undefined,
     status: typeof event.status === 'string' ? event.status : undefined,
+    input: event.input,
+    output: event.output,
+    timestamp: Date.now(),
     raw: event,
   }
 }
@@ -307,6 +316,7 @@ export const createOutputParser = (config: HeadlessAdapterConfig) => {
   const createUpdate = (event: unknown, mapping: OutputEventMapping): ParsedUpdate => {
     const update: ParsedUpdate = {
       type: mapping.emitAs,
+      timestamp: Date.now(),
       raw: event,
     }
@@ -320,6 +330,18 @@ export const createOutputParser = (config: HeadlessAdapterConfig) => {
       if (mapping.extract.status) {
         update.status = jsonPathString(event, mapping.extract.status)
       }
+      if (mapping.extract.input) {
+        const value = jsonPath(event, mapping.extract.input)
+        if (value !== undefined) {
+          update.input = value
+        }
+      }
+      if (mapping.extract.output) {
+        const value = jsonPath(event, mapping.extract.output)
+        if (value !== undefined) {
+          update.output = value
+        }
+      }
     }
     return update

package/src/headless/headless.schemas.ts CHANGED Viewed

@@ -36,18 +36,29 @@ export type OutputEventMatch = z.infer<typeof OutputEventMatchSchema>
  * Schema for extracting content from matched events.
  *
  * @remarks
+ * Known fields (`content`, `title`, `status`, `input`, `output`) are used by the
+ * output parser to populate `ParsedUpdate` properties. Additional string-valued
+ * fields are preserved during validation for forward compatibility but are not
+ * consumed by the parser.
+ *
  * Paths can be:
  * - JSONPath expressions (e.g., "$.message.text")
  * - Literal strings in single quotes (e.g., "'pending'")
  */
-export const OutputEventExtractSchema = z.object({
-  /** JSONPath to extract main content */
-  content: z.string().optional(),
-  /** JSONPath to extract title (for tool calls) */
-  title: z.string().optional(),
-  /** JSONPath to extract status (or literal like "'pending'") */
-  status: z.string().optional(),
-})
+export const OutputEventExtractSchema = z
+  .object({
+    /** JSONPath to extract main content */
+    content: z.string().optional(),
+    /** JSONPath to extract title (for tool calls) */
+    title: z.string().optional(),
+    /** JSONPath to extract status (or literal like "'pending'") */
+    status: z.string().optional(),
+    /** JSONPath to extract tool input arguments (e.g., "$.input") */
+    input: z.string().optional(),
+    /** JSONPath to extract tool output/result content (e.g., "$.content") */
+    output: z.string().optional(),
+  })
+  .catchall(z.string())
 /** Output event extract type */
 export type OutputEventExtract = z.infer<typeof OutputEventExtractSchema>

package/src/headless/tests/fixtures/claude-headless.json ADDED Viewed

@@ -0,0 +1,40 @@
+{
+  "version": 1,
+  "name": "claude-headless",
+  "command": ["claude"],
+  "sessionMode": "stream",
+  "prompt": {
+    "flag": "-p"
+  },
+  "output": {
+    "flag": "--output-format",
+    "value": "stream-json"
+  },
+  "autoApprove": ["--dangerously-skip-permissions", "--verbose"],
+  "resume": {
+    "flag": "--resume",
+    "sessionIdPath": "$.session_id"
+  },
+  "outputEvents": [
+    {
+      "match": { "path": "$.type", "value": "assistant" },
+      "emitAs": "message",
+      "extract": { "content": "$.message.content[0].text" }
+    },
+    {
+      "match": { "path": "$.type", "value": "tool_use" },
+      "emitAs": "tool_call",
+      "extract": { "title": "$.name", "status": "'pending'", "input": "$.input" }
+    },
+    {
+      "match": { "path": "$.type", "value": "tool_result" },
+      "emitAs": "tool_call",
+      "extract": { "title": "$.name", "status": "'completed'", "output": "$.content" }
+    }
+  ],
+  "result": {
+    "matchPath": "$.type",
+    "matchValue": "result",
+    "contentPath": "$.result"
+  }
+}

package/src/headless/tests/fixtures/gemini-headless.json ADDED Viewed

@@ -0,0 +1,37 @@
+{
+  "version": 1,
+  "name": "gemini-headless",
+  "command": ["gemini"],
+  "sessionMode": "iterative",
+  "prompt": {
+    "flag": ""
+  },
+  "output": {
+    "flag": "--output-format",
+    "value": "stream-json"
+  },
+  "autoApprove": ["--sandbox", "false"],
+  "outputEvents": [
+    {
+      "match": { "path": "$.type", "value": "message" },
+      "emitAs": "message",
+      "extract": { "content": "$.content" }
+    },
+    {
+      "match": { "path": "$.type", "value": "tool_use" },
+      "emitAs": "tool_call",
+      "extract": { "title": "$.tool_name", "status": "'pending'", "input": "$.args" }
+    },
+    {
+      "match": { "path": "$.type", "value": "tool_result" },
+      "emitAs": "tool_call",
+      "extract": { "title": "$.tool_name", "status": "'completed'", "output": "$.output" }
+    }
+  ],
+  "result": {
+    "matchPath": "$.type",
+    "matchValue": "result",
+    "contentPath": "$.content"
+  },
+  "historyTemplate": "User: {{input}}\nAssistant: {{output}}"
+}

package/src/headless/tests/headless.spec.ts CHANGED Viewed

@@ -36,7 +36,12 @@ const validClaudeSchema = {
     {
       match: { path: '$.type', value: 'tool_use' },
       emitAs: 'tool_call',
-      extract: { title: '$.name', status: "'pending'" },
+      extract: { title: '$.name', status: "'pending'", input: '$.input' },
+    },
+    {
+      match: { path: '$.type', value: 'tool_result' },
+      emitAs: 'tool_call',
+      extract: { title: '$.name', status: "'completed'", output: '$.content' },
     },
   ],
   result: {
@@ -86,21 +91,83 @@ describe('HeadlessAdapterSchema', () => {
   })
   describe('validates schema files from disk', () => {
-    const schemasDir = '.claude/skills/headless-adapters/schemas'
+    const fixturesDir = 'src/headless/tests/fixtures'
     test('validates claude-headless.json from disk', async () => {
-      const content = await Bun.file(`${schemasDir}/claude-headless.json`).json()
+      const content = await Bun.file(`${fixturesDir}/claude-headless.json`).json()
       const result = HeadlessAdapterSchema.safeParse(content)
       expect(result.success).toBe(true)
     })
     test('validates gemini-headless.json from disk', async () => {
-      const content = await Bun.file(`${schemasDir}/gemini-headless.json`).json()
+      const content = await Bun.file(`${fixturesDir}/gemini-headless.json`).json()
       const result = HeadlessAdapterSchema.safeParse(content)
       expect(result.success).toBe(true)
     })
   })
+  describe('extract input/output fields', () => {
+    test('validates schema with input and output in extract config', () => {
+      const schemaWithIO = {
+        ...validClaudeSchema,
+        outputEvents: [
+          ...validClaudeSchema.outputEvents,
+          {
+            match: { path: '$.type', value: 'custom' },
+            emitAs: 'tool_call',
+            extract: { title: '$.name', input: '$.args', output: '$.result' },
+          },
+        ],
+      }
+      const result = HeadlessAdapterSchema.safeParse(schemaWithIO)
+      expect(result.success).toBe(true)
+    })
+    test('preserves extra extract fields via catchall', () => {
+      const schemaWithExtras = {
+        ...validClaudeSchema,
+        outputEvents: [
+          {
+            match: { path: '$.type', value: 'tool_use' },
+            emitAs: 'tool_call',
+            extract: {
+              title: '$.name',
+              status: "'pending'",
+              input: '$.input',
+              toolName: '$.name',
+              mcpServer: '$.server',
+            },
+          },
+        ],
+      }
+      const result = HeadlessAdapterSchema.safeParse(schemaWithExtras)
+      expect(result.success).toBe(true)
+      if (result.success) {
+        const extract = result.data.outputEvents![0]!.extract!
+        expect(extract.title).toBe('$.name')
+        expect(extract.input).toBe('$.input')
+        // Catchall fields aren't in the inferred type — cast needed to access them
+        expect((extract as Record<string, string>).toolName).toBe('$.name')
+        expect((extract as Record<string, string>).mcpServer).toBe('$.server')
+      }
+    })
+    test('rejects non-string extra extract fields', () => {
+      const schemaWithBadExtras = {
+        ...validClaudeSchema,
+        outputEvents: [
+          {
+            match: { path: '$.type', value: 'tool_use' },
+            emitAs: 'tool_call',
+            extract: { title: '$.name', badField: 123 },
+          },
+        ],
+      }
+      const result = HeadlessAdapterSchema.safeParse(schemaWithBadExtras)
+      expect(result.success).toBe(false)
+    })
+  })
   describe('minimal valid schema', () => {
     test('validates minimal required fields', () => {
       const minimal = {
@@ -397,6 +464,70 @@ describe('createOutputParser', () => {
       const singleResult = Array.isArray(result) ? result[0] : result
       expect(singleResult?.raw).toEqual(event)
     })
+    test('extracts input from tool_use event', () => {
+      const line = JSON.stringify({ type: 'tool_use', name: 'Read', input: { file_path: '/test.ts' } })
+      const result = parser.parseLine(line)
+      const singleResult = Array.isArray(result) ? result[0] : result
+      expect(singleResult?.input).toEqual({ file_path: '/test.ts' })
+    })
+    test('extracts output from tool_result event', () => {
+      const line = JSON.stringify({ type: 'tool_result', name: 'Read', content: 'file contents' })
+      const result = parser.parseLine(line)
+      const singleResult = Array.isArray(result) ? result[0] : result
+      expect(singleResult?.output).toBe('file contents')
+    })
+    test('sets timestamp on parsed updates', () => {
+      const before = Date.now()
+      const line = JSON.stringify({ type: 'assistant', message: { text: 'Hello' } })
+      const result = parser.parseLine(line)
+      const after = Date.now()
+      const singleResult = Array.isArray(result) ? result[0] : result
+      expect(singleResult?.timestamp).toBeGreaterThanOrEqual(before)
+      expect(singleResult?.timestamp).toBeLessThanOrEqual(after)
+    })
+  })
+  describe('parseLine with extra extract fields', () => {
+    test('extra extract fields do not break parser', () => {
+      const configWithExtras = parseHeadlessConfig({
+        version: 1,
+        name: 'extras-test',
+        command: ['test'],
+        sessionMode: 'stream',
+        prompt: { flag: '-p' },
+        output: { flag: '--output', value: 'json' },
+        outputEvents: [
+          {
+            match: { path: '$.type', value: 'tool_use' },
+            emitAs: 'tool_call',
+            extract: {
+              title: '$.name',
+              status: "'pending'",
+              input: '$.input',
+              toolName: '$.name',
+              mcpServer: '$.server',
+            },
+          },
+        ],
+        result: { matchPath: '$.type', matchValue: 'done', contentPath: '$.text' },
+      })
+      const extrasParser = createOutputParser(configWithExtras)
+      const line = JSON.stringify({
+        type: 'tool_use',
+        name: 'WebSearch',
+        input: { query: 'test' },
+        server: 'mcp-search',
+      })
+      const result = extrasParser.parseLine(line)
+      const singleResult = Array.isArray(result) ? result[0] : result
+      expect(singleResult).not.toBeNull()
+      expect(singleResult?.type).toBe('tool_call')
+      expect(singleResult?.title).toBe('WebSearch')
+      expect(singleResult?.input).toEqual({ query: 'test' })
+    })
   })
   describe('parseLine with array wildcards', () => {
@@ -574,6 +705,70 @@ describe('createOutputParser', () => {
   })
 })
+// ============================================================================
+// Passthrough Mode Tests
+// ============================================================================
+describe('passthrough mode', () => {
+  const passthroughConfig = parseHeadlessConfig({
+    version: 1,
+    name: 'passthrough-test',
+    command: ['test-agent'],
+    sessionMode: 'stream',
+    prompt: { flag: '-p' },
+    output: { flag: '--output', value: 'json' },
+    outputMode: 'passthrough',
+    passthroughTypeMap: {
+      typeField: 'type',
+      typeValues: { tool_use: 'tool_call', tool_result: 'tool_call' },
+    },
+    result: { matchPath: '$.type', matchValue: 'result', contentPath: '$.content' },
+  })
+  const passthroughParser = createOutputParser(passthroughConfig)
+  test('extracts input from tool_call event', () => {
+    const line = JSON.stringify({ type: 'tool_use', name: 'Read', input: { file_path: '/test.ts' }, status: 'pending' })
+    const result = passthroughParser.parseLine(line)
+    const singleResult = Array.isArray(result) ? result[0] : result
+    expect(singleResult?.type).toBe('tool_call')
+    expect(singleResult?.input).toEqual({ file_path: '/test.ts' })
+  })
+  test('extracts output from tool_result event', () => {
+    const line = JSON.stringify({ type: 'tool_result', name: 'Read', output: 'file contents', status: 'completed' })
+    const result = passthroughParser.parseLine(line)
+    const singleResult = Array.isArray(result) ? result[0] : result
+    expect(singleResult?.type).toBe('tool_call')
+    expect(singleResult?.output).toBe('file contents')
+  })
+  test('preserves object input type', () => {
+    const line = JSON.stringify({ type: 'tool_use', name: 'Write', input: { path: '/a.ts', content: 'code' } })
+    const result = passthroughParser.parseLine(line)
+    const singleResult = Array.isArray(result) ? result[0] : result
+    expect(singleResult?.input).toEqual({ path: '/a.ts', content: 'code' })
+  })
+  test('sets timestamp on passthrough updates', () => {
+    const before = Date.now()
+    const line = JSON.stringify({ type: 'message', content: 'Hello' })
+    const result = passthroughParser.parseLine(line)
+    const after = Date.now()
+    const singleResult = Array.isArray(result) ? result[0] : result
+    expect(singleResult?.timestamp).toBeGreaterThanOrEqual(before)
+    expect(singleResult?.timestamp).toBeLessThanOrEqual(after)
+  })
+  test('handles absent input/output fields gracefully', () => {
+    const line = JSON.stringify({ type: 'tool_use', name: 'Bash', status: 'pending' })
+    const result = passthroughParser.parseLine(line)
+    const singleResult = Array.isArray(result) ? result[0] : result
+    expect(singleResult?.type).toBe('tool_call')
+    expect(singleResult?.input).toBeUndefined()
+    expect(singleResult?.output).toBeUndefined()
+  })
+})
 // ============================================================================
 // History Builder Tests
 // ============================================================================

package/src/integration_tests/claude.spec.ts CHANGED Viewed

@@ -3,7 +3,7 @@
  *
  * @remarks
  * Tests verify the headless session manager works correctly with Claude Code CLI
- * using the schema-driven approach from `.claude/skills/headless-adapters/schemas/`.
+ * using the schema-driven headless adapter approach.
  *
  * Run locally with API key:
  * ```bash
@@ -29,7 +29,7 @@ setDefaultTimeout(120000)
 const PROJECT_ROOT = process.cwd()
 // Schema path for Claude headless adapter
-const SCHEMA_PATH = join(PROJECT_ROOT, '.claude/skills/headless-adapters/schemas/claude-headless.json')
+const SCHEMA_PATH = join(PROJECT_ROOT, 'src/headless/tests/fixtures/claude-headless.json')
 // Get API key from environment
 const API_KEY = process.env.ANTHROPIC_API_KEY ?? ''

package/src/integration_tests/gemini.spec.ts CHANGED Viewed

@@ -3,7 +3,7 @@
  *
  * @remarks
  * Tests verify the headless session manager works correctly with Gemini CLI
- * using the schema-driven approach from `.claude/skills/headless-adapters/schemas/`.
+ * using the schema-driven headless adapter approach.
  *
  * Run locally with API key:
  * ```bash
@@ -29,7 +29,7 @@ setDefaultTimeout(120000)
 const PROJECT_ROOT = process.cwd()
 // Schema path for Gemini headless adapter
-const SCHEMA_PATH = join(PROJECT_ROOT, '.claude/skills/headless-adapters/schemas/gemini-headless.json')
+const SCHEMA_PATH = join(PROJECT_ROOT, 'src/headless/tests/fixtures/gemini-headless.json')
 // Get API key from environment
 const GEMINI_API_KEY = process.env.GEMINI_API_KEY ?? ''

package/src/pipeline/compare-trials.ts CHANGED Viewed

@@ -262,6 +262,7 @@ const computeTrialsQualityMetrics = (results: TrialResult[]): QualityComputeResu
   return {
     metrics: {
+      type: 'trial',
       avgScore: sum / rawScores.length,
       medianScore: percentile(sorted, 0.5),
       p25Score: percentile(sorted, 0.25),

package/src/pipeline/compare.ts CHANGED Viewed

@@ -365,6 +365,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
     const fails = results.length - passes
     quality[label] = {
+      type: 'run',
       avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
       passRate: results.length > 0 ? passes / results.length : 0,
       passCount: passes,

package/src/pipeline/tests/compare-statistical.spec.ts CHANGED Viewed

@@ -105,6 +105,10 @@ describe('runCompare statistical strategy', () => {
     // Verify reliability metrics include type discriminator
     expect(report.reliability.high?.type).toBe('run')
     expect(report.reliability.low?.type).toBe('run')
+    // Verify quality metrics include type discriminator
+    expect(report.quality.high?.type).toBe('run')
+    expect(report.quality.low?.type).toBe('run')
   })
   test('computes confidence intervals for performance metrics', async () => {

package/src/pipeline/tests/compare-trials.spec.ts CHANGED Viewed

@@ -477,6 +477,7 @@ describe('runTrialsCompare', () => {
     expect(report.quality?.run1).toBeDefined()
     const qual = report.quality?.run1
+    expect(qual?.type).toBe('trial')
     expect(qual?.avgScore).toBeGreaterThan(0)
     expect(qual?.medianScore).toBeGreaterThan(0)
     expect(qual?.p25Score).toBeDefined()

package/src/schemas/schemas.ts CHANGED Viewed

@@ -620,6 +620,8 @@ export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceInterva
  * Quality metrics for a single run in comparison.
  */
 export const QualityMetricsSchema = z.object({
+  /** Discriminator for run-level quality metrics */
+  type: z.literal('run'),
   /** Mean grader score (0-1) */
   avgScore: z.number(),
   /** Percentage of pass=true results */
@@ -942,6 +944,8 @@ export type TrialsQualityConfidenceIntervals = z.infer<typeof TrialsQualityConfi
  * Only present when a grader was used during trials capture.
  */
 export const TrialsQualityMetricsSchema = z.object({
+  /** Discriminator for trial-level quality metrics */
+  type: z.literal('trial'),
   /** Average score across all trials */
   avgScore: z.number(),
   /** Median score */