@plaited/agent-eval-harness 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +273 -0
- package/bin/cli.ts +162 -0
- package/bin/tests/cli.spec.ts +529 -0
- package/package.json +67 -0
- package/src/commands/balance.ts +257 -0
- package/src/commands/calibrate.ts +313 -0
- package/src/commands/capture.ts +393 -0
- package/src/commands/summarize.ts +228 -0
- package/src/commands/tests/balance-helpers.spec.ts +279 -0
- package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
- package/src/commands/tests/capture-cli.spec.ts +190 -0
- package/src/commands/tests/capture-helpers.spec.ts +524 -0
- package/src/commands/tests/summarize-helpers.spec.ts +339 -0
- package/src/commands/tests/trials-calculations.spec.ts +209 -0
- package/src/commands/tests/trials-cli.spec.ts +147 -0
- package/src/commands/trials.ts +388 -0
- package/src/commands/validate-refs.ts +188 -0
- package/src/commands.ts +33 -0
- package/src/core/core.ts +25 -0
- package/src/core/loading.ts +96 -0
- package/src/core/output.ts +121 -0
- package/src/core/tests/core.spec.ts +309 -0
- package/src/core/trajectory.ts +166 -0
- package/src/core.ts +28 -0
- package/src/harness.ts +46 -0
- package/src/headless/headless-cli.ts +430 -0
- package/src/headless/headless-history-builder.ts +141 -0
- package/src/headless/headless-output-parser.ts +366 -0
- package/src/headless/headless-session-manager.ts +587 -0
- package/src/headless/headless.schemas.ts +310 -0
- package/src/headless/headless.types.ts +19 -0
- package/src/headless/tests/headless.spec.ts +678 -0
- package/src/headless.ts +72 -0
- package/src/integration_tests/claude.spec.ts +157 -0
- package/src/integration_tests/gemini.spec.ts +139 -0
- package/src/pipeline/compare.ts +325 -0
- package/src/pipeline/extract.ts +241 -0
- package/src/pipeline/format.ts +292 -0
- package/src/pipeline/grade.ts +169 -0
- package/src/pipeline/pipeline.ts +41 -0
- package/src/pipeline/pipeline.types.ts +241 -0
- package/src/pipeline/run.ts +412 -0
- package/src/pipeline/tests/pipeline.spec.ts +356 -0
- package/src/pipeline.ts +34 -0
- package/src/schemas/constants.ts +94 -0
- package/src/schemas/grader-loader.ts +174 -0
- package/src/schemas/schemas-cli.ts +239 -0
- package/src/schemas/schemas.ts +558 -0
- package/src/schemas/tests/constants.spec.ts +121 -0
- package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/schemas/tests/fixtures/grader-exec.py +29 -0
- package/src/schemas/tests/fixtures/grader-module.ts +14 -0
- package/src/schemas/tests/grader-loader.spec.ts +153 -0
- package/src/schemas/tests/schemas-cli.spec.ts +142 -0
- package/src/schemas/tests/schemas.spec.ts +606 -0
- package/src/schemas.ts +90 -0
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared loading utilities for JSONL files.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Provides consistent loading and parsing of prompts and results files.
|
|
6
|
+
* Used by capture, trials, summarize, calibrate, and pipeline commands.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { CaptureResult, PromptCase } from '../schemas.ts'
|
|
12
|
+
import { CaptureResultSchema, PromptCaseSchema } from '../schemas.ts'
|
|
13
|
+
|
|
14
|
+
/**
 * Load prompts from a JSONL file.
 *
 * @remarks
 * Each non-blank line must be a JSON object matching PromptCaseSchema.
 * Supports both single-turn (string input) and multi-turn (string[] input) formats.
 * Blank and whitespace-only lines are skipped, and both LF and CRLF line
 * endings are accepted. Line numbers in error messages refer to the physical
 * line in the file (skipped lines still count).
 *
 * @param path - Path to the prompts.jsonl file
 * @returns Parsed and validated prompt cases, in file order
 * @throws Error if file cannot be read or any non-blank line is invalid
 *
 * @public
 */
export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
  const content = await Bun.file(path).text()
  const prompts: PromptCase[] = []

  // Iterate over the unfiltered lines so `index` is the real position in the file.
  for (const [index, rawLine] of content.split(/\r?\n/).entries()) {
    const line = rawLine.trim()
    if (line === '') continue
    try {
      prompts.push(PromptCaseSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
    }
  }

  return prompts
}
|
|
41
|
+
|
|
42
|
+
/**
 * Load capture results from a JSONL file.
 *
 * @remarks
 * Each non-blank line must be a JSON object matching CaptureResultSchema.
 * Used by summarize, calibrate, and compare commands.
 * Blank and whitespace-only lines are skipped, and both LF and CRLF line
 * endings are accepted. Line numbers in error messages refer to the physical
 * line in the file (skipped lines still count).
 *
 * @param path - Path to the results.jsonl file
 * @returns Parsed and validated capture results, in file order
 * @throws Error if file cannot be read or any non-blank line is invalid
 *
 * @public
 */
export const loadResults = async (path: string): Promise<CaptureResult[]> => {
  const content = await Bun.file(path).text()
  const results: CaptureResult[] = []

  // Iterate over the unfiltered lines so `index` is the real position in the file.
  for (const [index, rawLine] of content.split(/\r?\n/).entries()) {
    const line = rawLine.trim()
    if (line === '') continue
    try {
      results.push(CaptureResultSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
    }
  }

  return results
}
|
|
69
|
+
|
|
70
|
+
/**
 * Load raw JSONL file as parsed JSON objects.
 *
 * @remarks
 * Lower-level loading without schema validation; the `T` type parameter is an
 * unchecked assertion about the shape of each line.
 * Useful for pipeline commands that need flexible input handling.
 * Blank and whitespace-only lines are skipped, and both LF and CRLF line
 * endings are accepted. Line numbers in error messages refer to the physical
 * line in the file (skipped lines still count).
 *
 * @param path - Path to JSONL file
 * @returns Array of parsed JSON objects, in file order
 * @throws Error if file cannot be read or any non-blank line is invalid JSON
 *
 * @public
 */
export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
  const content = await Bun.file(path).text()
  const records: T[] = []

  // Iterate over the unfiltered lines so `index` is the real position in the file.
  for (const [index, rawLine] of content.split(/\r?\n/).entries()) {
    const line = rawLine.trim()
    if (line === '') continue
    try {
      records.push(JSON.parse(line) as T)
    } catch (error) {
      throw new Error(`Invalid JSON at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
    }
  }

  return records
}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared output utilities for writing results and logging.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Provides consistent output handling across all commands:
|
|
6
|
+
* - Writing to stdout or files
|
|
7
|
+
* - Progress logging to stderr
|
|
8
|
+
* - Path resolution
|
|
9
|
+
* - Content preview (head/tail)
|
|
10
|
+
*
|
|
11
|
+
* @packageDocumentation
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { appendFile } from 'node:fs/promises'
import { isAbsolute, resolve } from 'node:path'
import { HEAD_LINES, TAIL_LINES } from '../schemas/constants.ts'
|
|
16
|
+
|
|
17
|
+
/**
 * Emit a single line of output, either to stdout or to a file.
 *
 * @remarks
 * Without an `outputPath` the line is printed via console.log.
 * With an `outputPath` the line (plus a trailing newline) is written to that
 * file: appended when `append` is truthy, otherwise the file is overwritten.
 *
 * @param line - Content to write (without trailing newline)
 * @param outputPath - Optional file path (stdout if undefined)
 * @param append - If true, append to file instead of overwrite
 *
 * @public
 */
export const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
  // No destination file: print and finish.
  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(line)
    return
  }

  const payload = `${line}\n`
  if (append) {
    await appendFile(outputPath, payload)
    return
  }
  await Bun.write(outputPath, payload)
}
|
|
42
|
+
|
|
43
|
+
/**
 * Print a progress message on stderr.
 *
 * @remarks
 * stderr is used so that progress chatter never mixes with command output
 * that is being piped from stdout.
 *
 * @param message - Progress message to display
 * @param showProgress - If false, message is suppressed
 *
 * @public
 */
export const logProgress = (message: string, showProgress: boolean): void => {
  if (!showProgress) return
  console.error(message)
}
|
|
60
|
+
|
|
61
|
+
/**
 * Resolve path relative to process.cwd().
 *
 * @remarks
 * Absolute paths (as recognised by `node:path` for the current platform,
 * so Windows drive and UNC paths are handled too) are returned as-is.
 * Relative paths are resolved against the current working directory and
 * normalized, so `./file` and `../file` do not leave `.`/`..` segments
 * in the result.
 *
 * @param path - Path to resolve
 * @returns Absolute path
 *
 * @public
 */
export const resolvePath = (path: string): string => {
  if (isAbsolute(path)) return path
  return resolve(process.cwd(), path)
}
|
|
77
|
+
|
|
78
|
+
/**
 * Create head/tail preview of content.
 *
 * @remarks
 * Shows first N and last M lines with omission indicator in between.
 * Useful for large files/content in markdown output.
 * Content with at most N + M lines is returned unchanged.
 * Negative counts are treated as 0 and fractional counts are truncated;
 * a count of 0 yields an empty head or tail section.
 *
 * @param content - Full content string
 * @param headLines - Number of lines from start (default from constants)
 * @param tailLines - Number of lines from end (default from constants)
 * @returns Truncated content with omission indicator
 *
 * @public
 */
export const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => {
  const headCount = Math.max(0, Math.trunc(headLines))
  const tailCount = Math.max(0, Math.trunc(tailLines))

  const lines = content.split('\n')
  if (lines.length <= headCount + tailCount) {
    return content
  }

  const head = lines.slice(0, headCount).join('\n')
  // Index from the start: `slice(-0)` is `slice(0)` and would return every line.
  const tail = lines.slice(lines.length - tailCount).join('\n')
  const omitted = lines.length - headCount - tailCount
  return `${head}\n\n// ... ${omitted} lines omitted ...\n\n${tail}`
}
|
|
102
|
+
|
|
103
|
+
/**
 * Build a short, single-value preview of a prompt input.
 *
 * @remarks
 * A plain string is cut to its first 50 characters.
 * A multi-turn array is rendered as the turn count followed by the first
 * 40 characters of the first turn and a trailing `...` (an empty array
 * previews as an empty first turn).
 *
 * @param input - String or array input
 * @returns Preview text suitable for progress display
 *
 * @public
 */
export const getInputPreview = (input: string | string[]): string => {
  if (!Array.isArray(input)) {
    return input.slice(0, 50)
  }

  const openingTurn = input[0] ?? ''
  const snippet = openingTurn.slice(0, 40)
  return `[${input.length} turns] ${snippet}...`
}
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for core utilities.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Tests for shared utility functions in the core module:
|
|
6
|
+
* - loading: loadPrompts, loadResults, loadJsonl
|
|
7
|
+
* - trajectory: extractTrajectory, extractOutput, hasToolErrors
|
|
8
|
+
* - output: writeOutput, logProgress, headTailPreview
|
|
9
|
+
*
|
|
10
|
+
* @packageDocumentation
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { afterEach, describe, expect, test } from 'bun:test'
|
|
14
|
+
import { unlink, writeFile } from 'node:fs/promises'
|
|
15
|
+
import type { ParsedUpdate } from '../../headless/headless-output-parser.ts'
|
|
16
|
+
import { loadJsonl, loadPrompts, loadResults } from '../loading.ts'
|
|
17
|
+
import { headTailPreview, resolvePath } from '../output.ts'
|
|
18
|
+
import { detectTrajectoryRichness, extractOutput, extractTrajectory, hasToolErrors } from '../trajectory.ts'
|
|
19
|
+
|
|
20
|
+
// ============================================================================
|
|
21
|
+
// Loading Tests
|
|
22
|
+
// ============================================================================
|
|
23
|
+
|
|
24
|
+
describe('loadJsonl', () => {
  const fixturePath = '/tmp/core-test-jsonl.jsonl'

  // Remove the fixture after each test; a missing file is not a failure.
  afterEach(async () => {
    await unlink(fixturePath).catch(() => {})
  })

  test('loads and parses JSONL file', async () => {
    await writeFile(fixturePath, '{"a":1}\n{"a":2}\n{"a":3}')

    const rows = await loadJsonl<{ a: number }>(fixturePath)

    expect(rows).toHaveLength(3)
    expect(rows[0]?.a).toBe(1)
    expect(rows[2]?.a).toBe(3)
  })

  test('skips empty lines', async () => {
    await writeFile(fixturePath, '{"a":1}\n\n{"a":2}\n')

    const rows = await loadJsonl<{ a: number }>(fixturePath)

    expect(rows).toHaveLength(2)
  })

  test('handles empty file', async () => {
    await writeFile(fixturePath, '')

    const rows = await loadJsonl(fixturePath)

    expect(rows).toHaveLength(0)
  })
})
|
|
55
|
+
|
|
56
|
+
describe('loadPrompts', () => {
  const fixturePath = '/tmp/core-test-prompts.jsonl'

  // Remove the fixture after each test; a missing file is not a failure.
  afterEach(async () => {
    await unlink(fixturePath).catch(() => {})
  })

  test('loads valid prompts', async () => {
    await writeFile(fixturePath, '{"id":"p1","input":"hello"}\n{"id":"p2","input":"world"}')

    const loaded = await loadPrompts(fixturePath)
    const [firstPrompt] = loaded

    expect(loaded).toHaveLength(2)
    expect(firstPrompt?.id).toBe('p1')
    expect(firstPrompt?.input).toBe('hello')
  })

  test('loads multi-turn prompts', async () => {
    await writeFile(fixturePath, '{"id":"m1","input":["turn1","turn2"]}')

    const loaded = await loadPrompts(fixturePath)
    const turns = loaded[0]?.input

    expect(loaded).toHaveLength(1)
    expect(Array.isArray(turns)).toBe(true)
    expect((turns as string[]).length).toBe(2)
  })
})
|
|
83
|
+
|
|
84
|
+
describe('loadResults', () => {
  const fixturePath = '/tmp/core-test-results.jsonl'

  // Remove the fixture after each test; a missing file is not a failure.
  afterEach(async () => {
    await unlink(fixturePath).catch(() => {})
  })

  test('loads capture results with full schema', async () => {
    // One result record carrying every field the test exercises.
    const timing = { start: 0, end: 100, total: 100, sessionCreation: 10 }
    const record = {
      id: 'r1',
      input: 'test',
      output: 'result',
      trajectory: [],
      metadata: {},
      toolErrors: false,
      timing,
    }
    await writeFile(fixturePath, JSON.stringify(record))

    const loaded = await loadResults(fixturePath)
    const [firstResult] = loaded

    expect(loaded).toHaveLength(1)
    expect(firstResult?.id).toBe('r1')
    expect(firstResult?.output).toBe('result')
  })
})
|
|
117
|
+
|
|
118
|
+
// ============================================================================
|
|
119
|
+
// Trajectory Tests
|
|
120
|
+
// ============================================================================
|
|
121
|
+
|
|
122
|
+
describe('extractTrajectory', () => {
  // Arbitrary reference time; none of the assertions inspect timestamps.
  const startTime = 1000

  test('extracts message updates', () => {
    const messageUpdate: ParsedUpdate = { type: 'message', content: 'Hello', raw: {} }

    const steps = extractTrajectory([messageUpdate], startTime)
    const [step] = steps

    expect(steps).toHaveLength(1)
    expect(step?.type).toBe('message')
    expect(step?.type === 'message' && step.content).toBe('Hello')
  })

  test('extracts thought updates', () => {
    const thoughtUpdate: ParsedUpdate = { type: 'thought', content: 'Thinking...', raw: {} }

    const steps = extractTrajectory([thoughtUpdate], startTime)

    expect(steps).toHaveLength(1)
    expect(steps[0]?.type).toBe('thought')
  })

  test('extracts tool_call with title', () => {
    const toolUpdate: ParsedUpdate = { type: 'tool_call', title: 'Read', status: 'completed', raw: {} }

    const steps = extractTrajectory([toolUpdate], startTime)
    const [step] = steps

    expect(steps).toHaveLength(1)
    expect(step?.type).toBe('tool_call')
    // Narrow to the tool_call variant before reading its name.
    if (step?.type === 'tool_call') {
      expect(step.name).toBe('Read')
    }
  })

  test('handles empty updates', () => {
    expect(extractTrajectory([], startTime)).toHaveLength(0)
  })
})
|
|
163
|
+
|
|
164
|
+
describe('extractOutput', () => {
  test('concatenates all message content', () => {
    const thought = { type: 'thought' as const, content: 'Thinking', timestamp: 50 }
    const firstMessage = { type: 'message' as const, content: 'First message', timestamp: 100 }
    const finalMessage = { type: 'message' as const, content: 'Final answer', timestamp: 150 }

    // Only message steps contribute, joined with a newline.
    expect(extractOutput([thought, firstMessage, finalMessage])).toBe('First message\nFinal answer')
  })

  test('returns empty string when no messages', () => {
    const thoughtOnly = [{ type: 'thought' as const, content: 'Thinking only', timestamp: 50 }]

    expect(extractOutput(thoughtOnly)).toBe('')
  })

  test('handles empty trajectory', () => {
    expect(extractOutput([])).toBe('')
  })
})
|
|
187
|
+
|
|
188
|
+
describe('hasToolErrors', () => {
  // Builds a one-step trajectory holding a single `Read` tool call with the given status.
  const readCallWithStatus = (status: string) => [
    {
      type: 'tool_call' as const,
      name: 'Read',
      status,
      timestamp: 100,
    },
  ]

  test('returns false for successful tool calls', () => {
    expect(hasToolErrors(readCallWithStatus('completed'))).toBe(false)
  })

  test('returns true for failed status', () => {
    // hasToolErrors checks for status === 'failed'
    expect(hasToolErrors(readCallWithStatus('failed'))).toBe(true)
  })

  test('returns false for error status (not failed)', () => {
    // The implementation checks for 'failed', not 'error'
    expect(hasToolErrors(readCallWithStatus('error'))).toBe(false)
  })

  test('returns false for empty trajectory', () => {
    expect(hasToolErrors([])).toBe(false)
  })
})
|
|
231
|
+
|
|
232
|
+
describe('detectTrajectoryRichness', () => {
  test('returns full when has thoughts', () => {
    const withThought = [
      { type: 'thought' as const, content: 'Let me think', timestamp: 50 },
      { type: 'message' as const, content: 'Done', timestamp: 150 },
    ]

    const richness = detectTrajectoryRichness(withThought)

    expect(richness).toBe('full')
  })

  test('returns full when has tool_calls', () => {
    const readCall = { type: 'tool_call' as const, name: 'Read', status: 'completed', timestamp: 100 }
    const closingMessage = { type: 'message' as const, content: 'Done', timestamp: 150 }

    // Any tool_call means 'full'
    const richness = detectTrajectoryRichness([readCall, closingMessage])

    expect(richness).toBe('full')
  })

  test('returns messages-only when only messages', () => {
    const messagesOnly = [{ type: 'message' as const, content: 'Just a message', timestamp: 100 }]

    const richness = detectTrajectoryRichness(messagesOnly)

    expect(richness).toBe('messages-only')
  })

  test('returns minimal for empty trajectory', () => {
    // Empty trajectory returns 'minimal', not 'messages-only'
    const richness = detectTrajectoryRichness([])

    expect(richness).toBe('minimal')
  })
})
|
|
265
|
+
|
|
266
|
+
// ============================================================================
|
|
267
|
+
// Output Tests
|
|
268
|
+
// ============================================================================
|
|
269
|
+
|
|
270
|
+
describe('headTailPreview', () => {
  test('returns full content when short', () => {
    const shortContent = 'line1\nline2\nline3'

    expect(headTailPreview(shortContent, 5, 3)).toBe(shortContent)
  })

  test('truncates long content with omission indicator', () => {
    const twentyLines = Array.from({ length: 20 }, (_, i) => `line${i + 1}`).join('\n')

    const preview = headTailPreview(twentyLines, 3, 2)

    // Head lines, the "// ... N lines omitted ..." marker, then tail lines.
    const expectedFragments = ['line1', 'line2', 'line3', '// ... 15 lines omitted ...', 'line19', 'line20']
    for (const fragment of expectedFragments) {
      expect(preview).toContain(fragment)
    }
  })

  test('handles exact boundary', () => {
    const fiveLines = 'line1\nline2\nline3\nline4\nline5'

    // 5 lines is exactly head(3) + tail(2), no truncation needed
    expect(headTailPreview(fiveLines, 3, 2)).toBe(fiveLines)
  })
})
|
|
297
|
+
|
|
298
|
+
describe('resolvePath', () => {
  test('resolves relative path from cwd', () => {
    const absolute = resolvePath('./test.txt')

    // The result keeps the file name and is rooted.
    expect(absolute.endsWith('test.txt')).toBe(true)
    expect(absolute.startsWith('/')).toBe(true)
  })

  test('returns absolute path unchanged', () => {
    const alreadyAbsolute = '/absolute/path/file.txt'

    expect(resolvePath(alreadyAbsolute)).toBe(alreadyAbsolute)
  })
})
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared trajectory utilities for extraction and analysis.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Provides functions for extracting trajectory data from parsed updates,
|
|
6
|
+
* detecting richness levels, and checking for tool errors.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
|
|
12
|
+
import type { TrajectoryRichness, TrajectoryStep } from '../schemas.ts'
|
|
13
|
+
import { ToolInputSchema } from '../schemas.ts'
|
|
14
|
+
|
|
15
|
+
/**
 * Extract trajectory from parsed updates.
 *
 * @remarks
 * Converts ParsedUpdate stream into TrajectoryStep array.
 * Tool calls are deduplicated by title: the first update for a title creates
 * the step, and a later update with a terminal status (`completed` or
 * `failed`) for the same title updates that step's status and duration
 * instead of adding a new one. Other follow-up statuses are ignored.
 * Tool calls without a title cannot be correlated, so each one becomes its
 * own step named `unknown`.
 * Plan updates are recorded with an empty `entries` list.
 * Timestamps are taken from the clock at extraction time, relative to
 * `startTime`; they are not read from the updates themselves.
 *
 * @param updates - Parsed updates from output parser
 * @param startTime - Reference time for timestamp calculation
 * @returns Array of trajectory steps with relative timestamps
 *
 * @public
 */
export const extractTrajectory = (updates: ParsedUpdate[], startTime: number): TrajectoryStep[] => {
  const trajectory: TrajectoryStep[] = []
  const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()
  // Counter for untitled tool calls: a clock-based key collides within one millisecond.
  let untitledToolCalls = 0

  for (const update of updates) {
    const timestamp = Date.now() - startTime

    if (update.type === 'thought') {
      trajectory.push({
        type: 'thought',
        content: update.content ?? '',
        timestamp,
      })
    } else if (update.type === 'message') {
      trajectory.push({
        type: 'message',
        content: update.content ?? '',
        timestamp,
      })
    } else if (update.type === 'tool_call') {
      const toolCallId = update.title ?? `tool_untitled_${untitledToolCalls++}`
      const existing = toolCallMap.get(toolCallId)

      if (!existing) {
        // New tool call
        const step: TrajectoryStep & { type: 'tool_call' } = {
          type: 'tool_call',
          name: update.title ?? 'unknown',
          status: update.status ?? 'pending',
          timestamp,
        }
        toolCallMap.set(toolCallId, { start: timestamp, step })
        trajectory.push(step)
      } else if (update.status === 'completed' || update.status === 'failed') {
        // Terminal update for a known call: record the outcome so failures
        // are visible to hasToolErrors, and how long the call took.
        existing.step.status = update.status
        existing.step.duration = timestamp - existing.start
      }
    } else if (update.type === 'plan') {
      trajectory.push({
        type: 'plan',
        entries: [],
        timestamp,
      })
    }
  }

  return trajectory
}
|
|
77
|
+
|
|
78
|
+
/**
 * Build the final text output of a trajectory.
 *
 * @remarks
 * Collects the content of every `message` step, in order, and joins the
 * pieces with a newline. Steps of any other type are ignored, so a
 * trajectory without messages yields an empty string.
 *
 * @param trajectory - Trajectory steps from capture
 * @returns Concatenated message content
 *
 * @public
 */
export const extractOutput = (trajectory: TrajectoryStep[]): string => {
  const messageTexts: string[] = []
  for (const step of trajectory) {
    if (step.type === 'message') {
      messageTexts.push(step.content)
    }
  }
  return messageTexts.join('\n')
}
|
|
95
|
+
|
|
96
|
+
/**
 * Report whether a trajectory contains a failed tool call.
 *
 * @remarks
 * Only `tool_call` steps whose status is exactly `'failed'` count; any other
 * status value is not treated as an error.
 *
 * @param trajectory - Trajectory steps from capture
 * @returns True if any tool call has 'failed' status
 *
 * @public
 */
export const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => {
  for (const step of trajectory) {
    if (step.type !== 'tool_call') continue
    if (step.status === 'failed') return true
  }
  return false
}
|
|
107
|
+
|
|
108
|
+
/**
 * Classify how much detail a captured trajectory carries.
 *
 * @remarks
 * Different adapters provide varying levels of detail:
 * - `full`: Has thoughts, tool calls, or plans (e.g., Claude Code)
 * - `messages-only`: Only message steps present
 * - `minimal`: Empty or unknown content
 *
 * The steps are walked once; the first thought, tool call, or plan ends the
 * walk immediately with `full`.
 *
 * @param trajectory - Trajectory steps from capture
 * @returns Detected richness level
 *
 * @public
 */
export const detectTrajectoryRichness = (trajectory: TrajectoryStep[]): TrajectoryRichness => {
  let sawMessage = false

  for (const step of trajectory) {
    switch (step.type) {
      case 'thought':
      case 'tool_call':
      case 'plan':
        // Any of these is enough to call the trajectory 'full'.
        return 'full'
      case 'message':
        sawMessage = true
        break
      default:
        break
    }
  }

  if (sawMessage) {
    return 'messages-only'
  }
  return 'minimal'
}
|
|
139
|
+
|
|
140
|
+
/**
 * Pull a file path out of a tool call's input, when one is present.
 *
 * @remarks
 * The input is validated with ToolInputSchema; input that fails validation
 * yields `undefined`. On success `file_path` is preferred, with `path` as
 * the fallback.
 *
 * @param input - Tool call input object
 * @returns File path string or undefined
 *
 * @public
 */
export const extractFilePath = (input: unknown): string | undefined => {
  const parsed = ToolInputSchema.safeParse(input)
  if (parsed.success) {
    const { file_path: filePath, path } = parsed.data
    return filePath ?? path
  }
  return undefined
}
|
|
153
|
+
|
|
154
|
+
/**
 * Pull written content out of a tool call's input, when present.
 *
 * @remarks
 * The input is validated with ToolInputSchema; input that fails validation
 * yields `undefined`. On success `content` is preferred, with `new_string`
 * as the fallback.
 *
 * @param input - Tool call input object
 * @returns Content string or undefined
 *
 * @public
 */
export const extractContent = (input: unknown): string | undefined => {
  const parsed = ToolInputSchema.safeParse(input)
  if (parsed.success) {
    const { content, new_string: replacement } = parsed.data
    return content ?? replacement
  }
  return undefined
}
|