@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/LICENSE +15 -0
  2. package/README.md +273 -0
  3. package/bin/cli.ts +162 -0
  4. package/bin/tests/cli.spec.ts +529 -0
  5. package/package.json +67 -0
  6. package/src/commands/balance.ts +257 -0
  7. package/src/commands/calibrate.ts +313 -0
  8. package/src/commands/capture.ts +393 -0
  9. package/src/commands/summarize.ts +228 -0
  10. package/src/commands/tests/balance-helpers.spec.ts +279 -0
  11. package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
  12. package/src/commands/tests/capture-cli.spec.ts +190 -0
  13. package/src/commands/tests/capture-helpers.spec.ts +524 -0
  14. package/src/commands/tests/summarize-helpers.spec.ts +339 -0
  15. package/src/commands/tests/trials-calculations.spec.ts +209 -0
  16. package/src/commands/tests/trials-cli.spec.ts +147 -0
  17. package/src/commands/trials.ts +388 -0
  18. package/src/commands/validate-refs.ts +188 -0
  19. package/src/commands.ts +33 -0
  20. package/src/core/core.ts +25 -0
  21. package/src/core/loading.ts +96 -0
  22. package/src/core/output.ts +121 -0
  23. package/src/core/tests/core.spec.ts +309 -0
  24. package/src/core/trajectory.ts +166 -0
  25. package/src/core.ts +28 -0
  26. package/src/harness.ts +46 -0
  27. package/src/headless/headless-cli.ts +430 -0
  28. package/src/headless/headless-history-builder.ts +141 -0
  29. package/src/headless/headless-output-parser.ts +366 -0
  30. package/src/headless/headless-session-manager.ts +587 -0
  31. package/src/headless/headless.schemas.ts +310 -0
  32. package/src/headless/headless.types.ts +19 -0
  33. package/src/headless/tests/headless.spec.ts +678 -0
  34. package/src/headless.ts +72 -0
  35. package/src/integration_tests/claude.spec.ts +157 -0
  36. package/src/integration_tests/gemini.spec.ts +139 -0
  37. package/src/pipeline/compare.ts +325 -0
  38. package/src/pipeline/extract.ts +241 -0
  39. package/src/pipeline/format.ts +292 -0
  40. package/src/pipeline/grade.ts +169 -0
  41. package/src/pipeline/pipeline.ts +41 -0
  42. package/src/pipeline/pipeline.types.ts +241 -0
  43. package/src/pipeline/run.ts +412 -0
  44. package/src/pipeline/tests/pipeline.spec.ts +356 -0
  45. package/src/pipeline.ts +34 -0
  46. package/src/schemas/constants.ts +94 -0
  47. package/src/schemas/grader-loader.ts +174 -0
  48. package/src/schemas/schemas-cli.ts +239 -0
  49. package/src/schemas/schemas.ts +558 -0
  50. package/src/schemas/tests/constants.spec.ts +121 -0
  51. package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
  52. package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
  53. package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
  54. package/src/schemas/tests/fixtures/grader-exec.py +29 -0
  55. package/src/schemas/tests/fixtures/grader-module.ts +14 -0
  56. package/src/schemas/tests/grader-loader.spec.ts +153 -0
  57. package/src/schemas/tests/schemas-cli.spec.ts +142 -0
  58. package/src/schemas/tests/schemas.spec.ts +606 -0
  59. package/src/schemas.ts +90 -0
@@ -0,0 +1,96 @@
1
+ /**
2
+ * Shared loading utilities for JSONL files.
3
+ *
4
+ * @remarks
5
+ * Provides consistent loading and parsing of prompts and results files.
6
+ * Used by capture, trials, summarize, calibrate, and pipeline commands.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import type { CaptureResult, PromptCase } from '../schemas.ts'
12
+ import { CaptureResultSchema, PromptCaseSchema } from '../schemas.ts'
13
+
/**
 * Load prompts from a JSONL file.
 *
 * @remarks
 * Each non-blank line must be a JSON object matching PromptCaseSchema.
 * Supports both single-turn (string input) and multi-turn (string[] input) formats.
 * Blank and whitespace-only lines are skipped, and both LF and CRLF line
 * endings are accepted. Line numbers in error messages are 1-based positions
 * in the file itself, so they stay accurate when blank lines are present.
 *
 * @param path - Path to the prompts.jsonl file
 * @returns Parsed and validated prompt cases
 * @throws Error if file cannot be read or any line is invalid
 *
 * @public
 */
export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
  const content = await Bun.file(path).text()
  const prompts: PromptCase[] = []

  for (const [index, line] of content.split(/\r?\n/).entries()) {
    // Skip blanks inside the loop (not with a pre-filter) so `index` keeps
    // pointing at the real line in the file.
    if (line.trim() === '') continue
    try {
      prompts.push(PromptCaseSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : String(error)}`)
    }
  }

  return prompts
}
41
+
/**
 * Load capture results from a JSONL file.
 *
 * @remarks
 * Each non-blank line must be a JSON object matching CaptureResultSchema.
 * Used by summarize, calibrate, and compare commands.
 * Blank and whitespace-only lines are skipped, and both LF and CRLF line
 * endings are accepted. Line numbers in error messages are 1-based positions
 * in the file itself, so they stay accurate when blank lines are present.
 *
 * @param path - Path to the results.jsonl file
 * @returns Parsed and validated capture results
 * @throws Error if file cannot be read or any line is invalid
 *
 * @public
 */
export const loadResults = async (path: string): Promise<CaptureResult[]> => {
  const content = await Bun.file(path).text()
  const results: CaptureResult[] = []

  for (const [index, line] of content.split(/\r?\n/).entries()) {
    // Skip blanks inside the loop (not with a pre-filter) so `index` keeps
    // pointing at the real line in the file.
    if (line.trim() === '') continue
    try {
      results.push(CaptureResultSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : String(error)}`)
    }
  }

  return results
}
69
+
/**
 * Load raw JSONL file as parsed JSON objects.
 *
 * @remarks
 * Lower-level loading without schema validation: each parsed value is cast
 * to `T` without any runtime check.
 * Useful for pipeline commands that need flexible input handling.
 * Blank and whitespace-only lines are skipped, and both LF and CRLF line
 * endings are accepted. Line numbers in error messages are 1-based positions
 * in the file itself, so they stay accurate when blank lines are present.
 *
 * @param path - Path to JSONL file
 * @returns Array of parsed JSON objects
 * @throws Error if file cannot be read or any line is invalid JSON
 *
 * @public
 */
export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
  const content = await Bun.file(path).text()
  const rows: T[] = []

  for (const [index, line] of content.split(/\r?\n/).entries()) {
    // Skip blanks inside the loop (not with a pre-filter) so `index` keeps
    // pointing at the real line in the file.
    if (line.trim() === '') continue
    try {
      rows.push(JSON.parse(line) as T)
    } catch (error) {
      throw new Error(`Invalid JSON at line ${index + 1}: ${error instanceof Error ? error.message : String(error)}`)
    }
  }

  return rows
}
@@ -0,0 +1,121 @@
1
+ /**
2
+ * Shared output utilities for writing results and logging.
3
+ *
4
+ * @remarks
5
+ * Provides consistent output handling across all commands:
6
+ * - Writing to stdout or files
7
+ * - Progress logging to stderr
8
+ * - Path resolution
9
+ * - Content preview (head/tail)
10
+ *
11
+ * @packageDocumentation
12
+ */
13
+
import { appendFile } from 'node:fs/promises'
import { isAbsolute, join } from 'node:path'
import { HEAD_LINES, TAIL_LINES } from '../schemas/constants.ts'
16
+
/**
 * Emit one line of output, either to stdout or to a file.
 *
 * @remarks
 * Without an `outputPath` the line is printed with console.log.
 * With an `outputPath` the line plus a trailing newline is written to that
 * file: appended when `append` is truthy, otherwise written with `Bun.write`.
 *
 * @param line - Content to write (without trailing newline)
 * @param outputPath - Optional file path (stdout if undefined)
 * @param append - If true, append to file instead of overwrite
 *
 * @public
 */
export const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(line)
    return
  }

  const payload = `${line}\n`
  if (append) {
    await appendFile(outputPath, payload)
    return
  }
  await Bun.write(outputPath, payload)
}
42
+
/**
 * Print a progress message on stderr when progress output is enabled.
 *
 * @remarks
 * Uses console.error so progress text stays out of stdout, which keeps
 * piped command output clean.
 *
 * @param message - Progress message to display
 * @param showProgress - If false, message is suppressed
 *
 * @public
 */
export const logProgress = (message: string, showProgress: boolean): void => {
  if (!showProgress) return
  console.error(message)
}
60
+
/**
 * Resolve path relative to process.cwd().
 *
 * @remarks
 * Absolute paths, as decided by `isAbsolute` from `node:path` for the current
 * platform, are returned as-is.
 * Relative paths are joined with the current working directory; `join` also
 * normalizes the result, so `./file.txt` does not leave a `/./` segment.
 *
 * @param path - Path to resolve
 * @returns Absolute path
 *
 * @public
 */
export const resolvePath = (path: string): string => {
  if (isAbsolute(path)) return path
  return join(process.cwd(), path)
}
77
+
/**
 * Create head/tail preview of content.
 *
 * @remarks
 * Shows first N and last M lines with omission indicator in between.
 * Content with no more than N + M lines is returned unchanged.
 * Counts are floored and clamped to zero. When the tail count is zero the
 * preview ends at the omission indicator.
 *
 * @param content - Full content string
 * @param headLines - Number of lines from start (default from constants)
 * @param tailLines - Number of lines from end (default from constants)
 * @returns Truncated content with omission indicator
 *
 * @public
 */
export const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => {
  // Clamp to whole, non-negative counts so the slice() calls below cannot
  // wrap around: slice(0, -n) and slice(-0) both select unintended ranges.
  const headCount = Math.max(0, Math.floor(headLines))
  const tailCount = Math.max(0, Math.floor(tailLines))

  const lines = content.split('\n')
  if (lines.length <= headCount + tailCount) {
    return content
  }

  const omitted = lines.length - headCount - tailCount
  const head = lines.slice(0, headCount).join('\n')
  const preview = `${head}\n\n// ... ${omitted} lines omitted ...`

  // slice(-0) would return every line, so a zero-length tail is handled here.
  if (tailCount === 0) return preview

  const tail = lines.slice(-tailCount).join('\n')
  return `${preview}\n\n${tail}`
}
102
+
/**
 * Cut `text` to at most `limit` UTF-16 code units without leaving half of a
 * surrogate pair at the end of the result.
 */
const truncatePreview = (text: string, limit: number): string => {
  const cut = text.slice(0, limit)
  if (text.length <= limit) return cut
  const lastUnit = cut.charCodeAt(cut.length - 1)
  // A trailing high surrogate means its low-surrogate partner was cut off.
  const endsMidPair = lastUnit >= 0xd800 && lastUnit <= 0xdbff
  return endsMidPair ? cut.slice(0, -1) : cut
}

/**
 * Get preview text for input (handles string or array).
 *
 * @remarks
 * For arrays (multi-turn), shows turn count and up to 40 characters of the
 * first turn followed by `...` (an empty array previews as an empty first turn).
 * For strings, shows up to the first 50 characters.
 * A character that would be split at the cut-off point is dropped instead of
 * being emitted as a broken half.
 *
 * @param input - String or array input
 * @returns Preview text suitable for progress display
 *
 * @public
 */
export const getInputPreview = (input: string | string[]): string => {
  if (Array.isArray(input)) {
    const first = input[0] ?? ''
    return `[${input.length} turns] ${truncatePreview(first, 40)}...`
  }
  return truncatePreview(input, 50)
}
@@ -0,0 +1,309 @@
1
+ /**
2
+ * Unit tests for core utilities.
3
+ *
4
+ * @remarks
5
+ * Tests for shared utility functions in the core module:
6
+ * - loading: loadPrompts, loadResults, loadJsonl
7
+ * - trajectory: extractTrajectory, extractOutput, hasToolErrors, detectTrajectoryRichness
8
+ * - output: headTailPreview, resolvePath
9
+ *
10
+ * @packageDocumentation
11
+ */
12
+
13
+ import { afterEach, describe, expect, test } from 'bun:test'
14
+ import { unlink, writeFile } from 'node:fs/promises'
15
+ import type { ParsedUpdate } from '../../headless/headless-output-parser.ts'
16
+ import { loadJsonl, loadPrompts, loadResults } from '../loading.ts'
17
+ import { headTailPreview, resolvePath } from '../output.ts'
18
+ import { detectTrajectoryRichness, extractOutput, extractTrajectory, hasToolErrors } from '../trajectory.ts'
19
+
20
+ // ============================================================================
21
+ // Loading Tests
22
+ // ============================================================================
23
+
// loadJsonl: parsing of JSONL lines, blank-line skipping, and the empty-file case.
describe('loadJsonl', () => {
  const fixturePath = '/tmp/core-test-jsonl.jsonl'

  // Best-effort cleanup: the fixture may not exist if a test failed early.
  afterEach(() => unlink(fixturePath).catch(() => {}))

  test('loads and parses JSONL file', async () => {
    await writeFile(fixturePath, '{"a":1}\n{"a":2}\n{"a":3}')
    const rows = await loadJsonl<{ a: number }>(fixturePath)
    expect(rows.length).toBe(3)
    expect(rows[0]?.a).toBe(1)
    expect(rows[2]?.a).toBe(3)
  })

  test('skips empty lines', async () => {
    await writeFile(fixturePath, '{"a":1}\n\n{"a":2}\n')
    const rows = await loadJsonl<{ a: number }>(fixturePath)
    expect(rows.length).toBe(2)
  })

  test('handles empty file', async () => {
    await writeFile(fixturePath, '')
    const rows = await loadJsonl(fixturePath)
    expect(rows.length).toBe(0)
  })
})
55
+
// loadPrompts: single-turn (string input) and multi-turn (string[] input) prompt files.
describe('loadPrompts', () => {
  const promptsPath = '/tmp/core-test-prompts.jsonl'

  // Remove the fixture after each test; a missing file is not an error.
  afterEach(() => unlink(promptsPath).catch(() => {}))

  test('loads valid prompts', async () => {
    await writeFile(promptsPath, '{"id":"p1","input":"hello"}\n{"id":"p2","input":"world"}')
    const loaded = await loadPrompts(promptsPath)
    const first = loaded[0]
    expect(loaded.length).toBe(2)
    expect(first?.id).toBe('p1')
    expect(first?.input).toBe('hello')
  })

  test('loads multi-turn prompts', async () => {
    await writeFile(promptsPath, '{"id":"m1","input":["turn1","turn2"]}')
    const loaded = await loadPrompts(promptsPath)
    const turns = loaded[0]?.input
    expect(loaded.length).toBe(1)
    expect(Array.isArray(turns)).toBe(true)
    expect((turns as string[]).length).toBe(2)
  })
})
83
+
// loadResults: a single capture result carrying every field used by this fixture.
describe('loadResults', () => {
  const resultsPath = '/tmp/core-test-results.jsonl'

  // Remove the fixture after each test; a missing file is not an error.
  afterEach(() => unlink(resultsPath).catch(() => {}))

  test('loads capture results with full schema', async () => {
    const timing = { start: 0, end: 100, total: 100, sessionCreation: 10 }
    const captured = {
      id: 'r1',
      input: 'test',
      output: 'result',
      trajectory: [],
      metadata: {},
      toolErrors: false,
      timing,
    }
    await writeFile(resultsPath, JSON.stringify(captured))

    const loaded = await loadResults(resultsPath)
    const first = loaded[0]
    expect(loaded.length).toBe(1)
    expect(first?.id).toBe('r1')
    expect(first?.output).toBe('result')
  })
})
117
+
118
+ // ============================================================================
119
+ // Trajectory Tests
120
+ // ============================================================================
121
+
// extractTrajectory: one case per update type, plus the empty-input case.
describe('extractTrajectory', () => {
  const startTime = 1000

  test('extracts message updates', () => {
    const updates: ParsedUpdate[] = [{ type: 'message', content: 'Hello', raw: {} }]
    const steps = extractTrajectory(updates, startTime)
    const first = steps[0]
    expect(steps.length).toBe(1)
    expect(first?.type).toBe('message')
    expect(first?.type === 'message' && first?.content).toBe('Hello')
  })

  test('extracts thought updates', () => {
    const updates: ParsedUpdate[] = [{ type: 'thought', content: 'Thinking...', raw: {} }]
    const steps = extractTrajectory(updates, startTime)
    expect(steps.length).toBe(1)
    expect(steps[0]?.type).toBe('thought')
  })

  test('extracts tool_call with title', () => {
    const completedRead: ParsedUpdate = {
      type: 'tool_call',
      title: 'Read',
      status: 'completed',
      raw: {},
    }
    const steps = extractTrajectory([completedRead], startTime)
    const first = steps[0]
    expect(steps.length).toBe(1)
    expect(first?.type).toBe('tool_call')
    // The name is only checked once the step has narrowed to a tool call.
    if (first?.type === 'tool_call') {
      expect(first.name).toBe('Read')
    }
  })

  test('handles empty updates', () => {
    const steps = extractTrajectory([], startTime)
    expect(steps.length).toBe(0)
  })
})
163
+
// extractOutput: only message steps contribute to the output, joined by newlines.
describe('extractOutput', () => {
  test('concatenates all message content', () => {
    const thought = { type: 'thought' as const, content: 'Thinking', timestamp: 50 }
    const firstMessage = { type: 'message' as const, content: 'First message', timestamp: 100 }
    const finalMessage = { type: 'message' as const, content: 'Final answer', timestamp: 150 }
    // extractOutput joins all messages with newline
    expect(extractOutput([thought, firstMessage, finalMessage])).toBe('First message\nFinal answer')
  })

  test('returns empty string when no messages', () => {
    const thoughtOnly = [{ type: 'thought' as const, content: 'Thinking only', timestamp: 50 }]
    expect(extractOutput(thoughtOnly)).toBe('')
  })

  test('handles empty trajectory', () => {
    expect(extractOutput([])).toBe('')
  })
})
187
+
// hasToolErrors: only a tool call whose status is 'failed' counts as an error.
describe('hasToolErrors', () => {
  // Builds a one-step trajectory holding a 'Read' tool call with the given status.
  const readCallWith = (status: string) => [
    {
      type: 'tool_call' as const,
      name: 'Read',
      status,
      timestamp: 100,
    },
  ]

  test('returns false for successful tool calls', () => {
    expect(hasToolErrors(readCallWith('completed'))).toBe(false)
  })

  test('returns true for failed status', () => {
    // hasToolErrors checks for status === 'failed'
    expect(hasToolErrors(readCallWith('failed'))).toBe(true)
  })

  test('returns false for error status (not failed)', () => {
    // The implementation checks for 'failed', not 'error'
    expect(hasToolErrors(readCallWith('error'))).toBe(false)
  })

  test('returns false for empty trajectory', () => {
    expect(hasToolErrors([])).toBe(false)
  })
})
231
+
// detectTrajectoryRichness: 'full' for thoughts or tool calls, 'messages-only'
// for plain messages, 'minimal' for an empty trajectory.
describe('detectTrajectoryRichness', () => {
  const doneMessage = { type: 'message' as const, content: 'Done', timestamp: 150 }

  test('returns full when has thoughts', () => {
    const thought = { type: 'thought' as const, content: 'Let me think', timestamp: 50 }
    expect(detectTrajectoryRichness([thought, doneMessage])).toBe('full')
  })

  test('returns full when has tool_calls', () => {
    const readCall = {
      type: 'tool_call' as const,
      name: 'Read',
      status: 'completed',
      timestamp: 100,
    }
    // Any tool_call means 'full'
    expect(detectTrajectoryRichness([readCall, doneMessage])).toBe('full')
  })

  test('returns messages-only when only messages', () => {
    const onlyMessage = { type: 'message' as const, content: 'Just a message', timestamp: 100 }
    expect(detectTrajectoryRichness([onlyMessage])).toBe('messages-only')
  })

  test('returns minimal for empty trajectory', () => {
    // Empty trajectory returns 'minimal', not 'messages-only'
    expect(detectTrajectoryRichness([])).toBe('minimal')
  })
})
265
+
266
+ // ============================================================================
267
+ // Output Tests
268
+ // ============================================================================
269
+
// headTailPreview: short content passes through; long content is truncated
// around an omission marker.
describe('headTailPreview', () => {
  test('returns full content when short', () => {
    const shortContent = 'line1\nline2\nline3'
    expect(headTailPreview(shortContent, 5, 3)).toBe(shortContent)
  })

  test('truncates long content with omission indicator', () => {
    const twentyLines = Array.from({ length: 20 }, (_, i) => `line${i + 1}`).join('\n')
    const preview = headTailPreview(twentyLines, 3, 2)

    // Head lines, then the marker, then tail lines.
    for (const kept of ['line1', 'line2', 'line3']) {
      expect(preview).toContain(kept)
    }
    // Actual format uses "// ... N lines omitted ..."
    expect(preview).toContain('// ... 15 lines omitted ...')
    for (const kept of ['line19', 'line20']) {
      expect(preview).toContain(kept)
    }
  })

  test('handles exact boundary', () => {
    const fiveLines = 'line1\nline2\nline3\nline4\nline5'
    // 5 lines is exactly head(3) + tail(2), no truncation needed
    expect(headTailPreview(fiveLines, 3, 2)).toBe(fiveLines)
  })
})
297
+
// resolvePath: relative paths become absolute; absolute paths pass through.
describe('resolvePath', () => {
  test('resolves relative path from cwd', () => {
    const fromCwd = resolvePath('./test.txt')
    expect(fromCwd.endsWith('test.txt')).toBe(true)
    expect(fromCwd.startsWith('/')).toBe(true)
  })

  test('returns absolute path unchanged', () => {
    const absolute = '/absolute/path/file.txt'
    expect(resolvePath(absolute)).toBe(absolute)
  })
})
@@ -0,0 +1,166 @@
1
+ /**
2
+ * Shared trajectory utilities for extraction and analysis.
3
+ *
4
+ * @remarks
5
+ * Provides functions for extracting trajectory data from parsed updates,
6
+ * detecting richness levels, and checking for tool errors.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
12
+ import type { TrajectoryRichness, TrajectoryStep } from '../schemas.ts'
13
+ import { ToolInputSchema } from '../schemas.ts'
14
+
/**
 * Extract trajectory from parsed updates.
 *
 * @remarks
 * Converts ParsedUpdate stream into TrajectoryStep array.
 * Tool calls are deduplicated by title: the first update for a title creates
 * the step, and a later update for the same title with status `completed` or
 * `failed` updates that step's status and duration instead of adding a new one.
 * Tool calls without a title cannot be correlated, so each becomes its own
 * step named `unknown`.
 *
 * Each step's timestamp is `Date.now() - startTime`, taken when this function
 * processes the update.
 *
 * @param updates - Parsed updates from output parser
 * @param startTime - Reference time for timestamp calculation
 * @returns Array of trajectory steps with relative timestamps
 *
 * @public
 */
export const extractTrajectory = (updates: ParsedUpdate[], startTime: number): TrajectoryStep[] => {
  const trajectory: TrajectoryStep[] = []
  const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()
  // Counter for synthetic ids. A clock-based id can repeat inside this
  // synchronous loop and merge unrelated untitled calls.
  let untitledCount = 0

  for (const update of updates) {
    const timestamp = Date.now() - startTime

    if (update.type === 'thought') {
      trajectory.push({
        type: 'thought',
        content: update.content ?? '',
        timestamp,
      })
    } else if (update.type === 'message') {
      trajectory.push({
        type: 'message',
        content: update.content ?? '',
        timestamp,
      })
    } else if (update.type === 'tool_call') {
      // NOTE(review): title is the only correlation key used here, so two
      // separate calls to the same tool share one step — confirm whether
      // ParsedUpdate exposes a per-call id that should be used instead.
      const toolCallId = update.title ?? `tool_${untitledCount++}`
      const existing = toolCallMap.get(toolCallId)

      if (!existing) {
        // New tool call
        const step: TrajectoryStep & { type: 'tool_call' } = {
          type: 'tool_call',
          name: update.title ?? 'unknown',
          status: update.status ?? 'pending',
          timestamp,
        }
        toolCallMap.set(toolCallId, { start: timestamp, step })
        trajectory.push(step)
      } else if (update.status === 'completed' || update.status === 'failed') {
        // Record the outcome on the existing step. Failures must be kept too,
        // otherwise the step stays at its initial status.
        existing.step.status = update.status
        existing.step.duration = timestamp - existing.start
      }
    } else if (update.type === 'plan') {
      // NOTE(review): plan steps are recorded with an empty `entries` list;
      // nothing is copied from the update here — confirm this is intended.
      trajectory.push({
        type: 'plan',
        entries: [],
        timestamp,
      })
    }
  }

  return trajectory
}
77
+
/**
 * Build the final text output from a trajectory.
 *
 * @remarks
 * Collects the content of every `message` step, in order, and joins the
 * pieces with a newline. Steps of any other type are ignored.
 *
 * @param trajectory - Trajectory steps from capture
 * @returns Concatenated message content
 *
 * @public
 */
export const extractOutput = (trajectory: TrajectoryStep[]): string => {
  const messages: string[] = []
  for (const step of trajectory) {
    if (step.type === 'message') {
      messages.push(step.content)
    }
  }
  return messages.join('\n')
}
95
+
/**
 * Report whether the trajectory contains a failed tool call.
 *
 * @remarks
 * Only `tool_call` steps whose status is exactly `failed` count; any other
 * status value is not treated as an error.
 *
 * @param trajectory - Trajectory steps from capture
 * @returns True if any tool call has 'failed' status
 *
 * @public
 */
export const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => {
  for (const step of trajectory) {
    if (step.type === 'tool_call' && step.status === 'failed') {
      return true
    }
  }
  return false
}
107
+
/**
 * Classify how much detail a captured trajectory contains.
 *
 * @remarks
 * Richness levels:
 * - `full`: at least one thought, tool call, or plan step
 * - `messages-only`: no such step, but at least one message step
 * - `minimal`: neither of the above (including an empty trajectory)
 *
 * Walks the steps once and returns as soon as a `full` indicator is seen.
 *
 * @param trajectory - Trajectory steps from capture
 * @returns Detected richness level
 *
 * @public
 */
export const detectTrajectoryRichness = (trajectory: TrajectoryStep[]): TrajectoryRichness => {
  let sawMessage = false

  for (const step of trajectory) {
    switch (step.type) {
      case 'thought':
      case 'tool_call':
      case 'plan':
        // Any one of these is enough; no need to look further.
        return 'full'
      case 'message':
        sawMessage = true
        break
      default:
        break
    }
  }

  if (sawMessage) return 'messages-only'
  return 'minimal'
}
139
+
/**
 * Pull a file path out of a tool call's input, when one is present.
 *
 * @remarks
 * The input is validated with ToolInputSchema first. On success the
 * `file_path` field is preferred, with `path` as the fallback.
 *
 * @param input - Tool call input object
 * @returns File path string or undefined
 *
 * @public
 */
export const extractFilePath = (input: unknown): string | undefined => {
  const parsed = ToolInputSchema.safeParse(input)
  if (parsed.success) {
    const { file_path, path } = parsed.data
    return file_path ?? path
  }
  return undefined
}
153
+
/**
 * Pull written content out of a tool call's input, when present.
 *
 * @remarks
 * The input is validated with ToolInputSchema first. On success the
 * `content` field is preferred, with `new_string` as the fallback.
 *
 * @param input - Tool call input object
 * @returns Content string or undefined
 *
 * @public
 */
export const extractContent = (input: unknown): string | undefined => {
  const parsed = ToolInputSchema.safeParse(input)
  if (parsed.success) {
    const { content, new_string } = parsed.data
    return content ?? new_string
  }
  return undefined
}