@plaited/acp-harness 0.2.6 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +120 -16
  3. package/bin/cli.ts +105 -636
  4. package/bin/tests/cli.spec.ts +218 -51
  5. package/package.json +20 -4
  6. package/src/acp-client.ts +5 -4
  7. package/src/acp-transport.ts +14 -7
  8. package/src/adapter-check.ts +542 -0
  9. package/src/adapter-scaffold.ts +934 -0
  10. package/src/balance.ts +232 -0
  11. package/src/calibrate.ts +300 -0
  12. package/src/capture.ts +457 -0
  13. package/src/constants.ts +94 -0
  14. package/src/grader-loader.ts +174 -0
  15. package/src/harness.ts +35 -0
  16. package/src/schemas-cli.ts +239 -0
  17. package/src/schemas.ts +567 -0
  18. package/src/summarize.ts +245 -0
  19. package/src/tests/adapter-check.spec.ts +70 -0
  20. package/src/tests/adapter-scaffold.spec.ts +112 -0
  21. package/src/tests/fixtures/grader-bad-module.ts +5 -0
  22. package/src/tests/fixtures/grader-exec-fail.py +9 -0
  23. package/src/tests/fixtures/grader-exec-invalid.py +6 -0
  24. package/src/tests/fixtures/grader-exec.py +29 -0
  25. package/src/tests/fixtures/grader-module.ts +14 -0
  26. package/src/tests/grader-loader.spec.ts +153 -0
  27. package/src/trials.ts +395 -0
  28. package/src/validate-refs.ts +188 -0
  29. package/.claude/rules/accuracy.md +0 -43
  30. package/.claude/rules/bun-apis.md +0 -80
  31. package/.claude/rules/code-review.md +0 -254
  32. package/.claude/rules/git-workflow.md +0 -37
  33. package/.claude/rules/github.md +0 -154
  34. package/.claude/rules/testing.md +0 -172
  35. package/.claude/skills/acp-harness/SKILL.md +0 -310
  36. package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
  37. package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
  38. package/.claude/skills/acp-harness/references/downstream.md +0 -288
  39. package/.claude/skills/acp-harness/references/output-formats.md +0 -221
  40. package/.claude-plugin/marketplace.json +0 -15
  41. package/.claude-plugin/plugin.json +0 -16
  42. package/.github/CODEOWNERS +0 -6
  43. package/.github/workflows/ci.yml +0 -63
  44. package/.github/workflows/publish.yml +0 -146
  45. package/.mcp.json +0 -20
  46. package/CLAUDE.md +0 -92
  47. package/Dockerfile.test +0 -23
  48. package/biome.json +0 -96
  49. package/bun.lock +0 -513
  50. package/docker-compose.test.yml +0 -21
  51. package/scripts/bun-test-wrapper.sh +0 -46
  52. package/src/acp.constants.ts +0 -56
  53. package/src/acp.schemas.ts +0 -161
  54. package/src/acp.types.ts +0 -28
  55. package/src/tests/fixtures/.claude/settings.local.json +0 -8
  56. package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
  57. package/tsconfig.json +0 -32
package/src/capture.ts ADDED
@@ -0,0 +1,457 @@
1
+ /**
2
+ * Core trajectory capture command.
3
+ *
4
+ * @remarks
5
+ * Executes prompts against an ACP agent and captures full trajectories.
6
+ * This is the foundational command - all other views derive from its output.
7
+ *
8
+ * Output format is always full trajectory JSONL (`CaptureResultSchema`).
9
+ * Use `summarize` command to derive compact views.
10
+ *
11
+ * @packageDocumentation
12
+ */
13
+
14
+ import { appendFile } from 'node:fs/promises'
15
+ import { parseArgs } from 'node:util'
16
+ import type { SessionNotification, ToolCall } from '@agentclientprotocol/sdk'
17
+ import { createACPClient } from './acp-client.ts'
18
+ import { createPrompt } from './acp-helpers.ts'
19
+ import { DEFAULT_HARNESS_TIMEOUT, HEAD_LINES, TAIL_LINES } from './constants.ts'
20
+ import { loadGrader } from './grader-loader.ts'
21
+ import type { CaptureResult, Grader, PromptCase, TrajectoryStep } from './schemas.ts'
22
+ import { McpServerSchema, PromptCaseSchema, ToolInputSchema } from './schemas.ts'
23
+
24
+ // ============================================================================
25
+ // Types
26
+ // ============================================================================
27
+
28
+ /** Configuration object accepted by `runCapture`; the `capture` CLI handler builds one from its parsed arguments */
29
+ export type CaptureConfig = {
30
+ /** Path to the prompts JSONL file; each line is parsed as JSON and validated with `PromptCaseSchema` */
31
+ promptsPath: string
32
+ /** ACP agent command and its arguments (e.g., ['bunx', 'claude-code-acp']); passed to the ACP client as `command` */
33
+ agentCommand: string[]
34
+ /** Output file path; a path not starting with '/' is resolved against process.cwd(). When undefined, results go to stdout */
35
+ outputPath?: string
36
+ /** Working directory for agent; the session falls back to process.cwd() when omitted */
37
+ cwd?: string
38
+ /** Timeout in milliseconds handed to the ACP client; defaults to DEFAULT_HARNESS_TIMEOUT */
39
+ timeout?: number
40
+ /** Show progress messages on stderr so stdout stays clean for results; defaults to false */
41
+ progress?: boolean
42
+ /** Append to output file instead of overwriting; when false (default) the file is emptied before the run */
43
+ append?: boolean
44
+ /** MCP server configurations; typed `unknown` because each entry is validated with `McpServerSchema` inside `runCapture` */
45
+ mcpServers?: unknown[]
46
+ /** Optional grader; called per prompt with input, output, expected and trajectory, and its result is stored as `score` */
47
+ grader?: Grader
48
+ }
49
+
50
+ // ============================================================================
51
+ // Helpers
52
+ // ============================================================================
53
+
54
/**
 * Load prompt cases from a JSONL file.
 *
 * @remarks
 * Each non-blank line is parsed as JSON and validated with `PromptCaseSchema`.
 * Lines are split on `\n` or `\r\n`, and whitespace-only lines are skipped.
 * Error messages report the 1-based line number within the file itself, so
 * blank lines earlier in the file do not shift the reported position.
 *
 * @param path - Path to the prompts JSONL file
 * @returns Validated prompt cases in file order
 * @throws Error naming the offending line when a line is not valid JSON or fails validation
 */
export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
  const content = await Bun.file(path).text()
  const prompts: PromptCase[] = []

  for (const [index, rawLine] of content.split(/\r?\n/).entries()) {
    const line = rawLine.trim()
    // Skip blank lines without losing track of the real line number
    if (!line) continue

    try {
      prompts.push(PromptCaseSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
    }
  }

  return prompts
}
69
+
70
/**
 * Convert collected session notifications into an ordered trajectory.
 *
 * @remarks
 * - Text `agent_thought_chunk` updates become `thought` steps and text
 *   `agent_message_chunk` updates become `message` steps; non-text chunks are ignored.
 * - A `tool_call` update with a new `toolCallId` creates a `tool_call` step.
 * - A later `tool_call` or `tool_call_update` carrying an already-seen `toolCallId`
 *   updates that step in place (status, output, duration) instead of adding a step.
 *   A `tool_call_update` for an unknown id is ignored because it has no step to amend.
 * - `plan` updates become `plan` steps.
 *
 * NOTE: `timestamp` is computed as `Date.now() - startTime` while this function
 * iterates, i.e. at extraction time, not at the moment each notification arrived.
 *
 * @param notifications - Session notifications collected for one prompt
 * @param startTime - Epoch milliseconds at which the prompt was started
 * @returns Trajectory steps in notification order
 */
export const extractTrajectory = (notifications: SessionNotification[], startTime: number): TrajectoryStep[] => {
  const trajectory: TrajectoryStep[] = []
  const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()

  for (const notification of notifications) {
    const timestamp = Date.now() - startTime
    const update = notification.update

    if (update.sessionUpdate === 'agent_thought_chunk' && update.content.type === 'text') {
      trajectory.push({
        type: 'thought',
        content: update.content.text,
        timestamp,
      })
    } else if (update.sessionUpdate === 'agent_message_chunk' && update.content.type === 'text') {
      trajectory.push({
        type: 'message',
        content: update.content.text,
        timestamp,
      })
    } else if (update.sessionUpdate === 'tool_call' || update.sessionUpdate === 'tool_call_update') {
      const toolCall = update as ToolCall
      const existing = toolCallMap.get(toolCall.toolCallId)

      if (existing) {
        // Follow-up for a known tool call: only overwrite what the update actually carries,
        // so a status-less progress update does not reset the step back to 'pending'
        if (toolCall.status) {
          existing.step.status = toolCall.status
        }
        if (toolCall.content) {
          existing.step.output = toolCall.content
        }
        // rawOutput takes precedence over content; falsy values (0, '', false) are still valid output
        if (toolCall.rawOutput !== undefined && toolCall.rawOutput !== null) {
          existing.step.output = toolCall.rawOutput
        }
        existing.step.duration = timestamp - existing.start
      } else if (update.sessionUpdate === 'tool_call') {
        // First sighting of this tool call
        const step: TrajectoryStep & { type: 'tool_call' } = {
          type: 'tool_call',
          name: toolCall.title,
          status: toolCall.status ?? 'pending',
          input: toolCall.rawInput,
          timestamp,
        }
        toolCallMap.set(toolCall.toolCallId, { start: timestamp, step })
        trajectory.push(step)
      }
    } else if (update.sessionUpdate === 'plan') {
      trajectory.push({
        type: 'plan',
        entries: update.entries,
        timestamp,
      })
    }
  }

  return trajectory
}
128
+
129
/**
 * Build the final text output of a run.
 *
 * @param trajectory - Steps captured for one prompt
 * @returns The `content` of every `message` step, in order, joined with newlines
 *   (empty string when there are no message steps)
 */
export const extractOutput = (trajectory: TrajectoryStep[]): string => {
  const messages: string[] = []
  for (const step of trajectory) {
    if (step.type === 'message') {
      messages.push(step.content)
    }
  }
  return messages.join('\n')
}
136
+
137
/**
 * Report whether the trajectory contains a failed tool call.
 *
 * @param trajectory - Steps captured for one prompt
 * @returns `true` when at least one `tool_call` step has status `'failed'`
 */
export const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => {
  for (const step of trajectory) {
    if (step.type !== 'tool_call') continue
    if (step.status === 'failed') return true
  }
  return false
}
141
+
142
/**
 * Shorten multi-line content to its first and last lines.
 *
 * @remarks
 * Content with at most `headLines + tailLines` lines is returned unchanged.
 * Otherwise the result is the head, a `// ... N lines omitted ...` marker, and the tail.
 * Negative or fractional counts are clamped to non-negative integers.
 *
 * @param content - Text to preview
 * @param headLines - Number of leading lines to keep (defaults to `HEAD_LINES`)
 * @param tailLines - Number of trailing lines to keep (defaults to `TAIL_LINES`)
 * @returns The original content or its head/tail preview
 */
export const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => {
  const headCount = Math.max(0, Math.trunc(headLines))
  const tailCount = Math.max(0, Math.trunc(tailLines))
  const lines = content.split('\n')
  if (lines.length <= headCount + tailCount) {
    return content
  }
  const head = lines.slice(0, headCount).join('\n')
  // slice(-0) is slice(0) and would return every line, so a zero tail needs an explicit guard
  const tail = tailCount > 0 ? lines.slice(-tailCount).join('\n') : ''
  const omitted = lines.length - headCount - tailCount
  return `${head}\n\n// ... ${omitted} lines omitted ...\n\n${tail}`
}
153
+
154
/**
 * Pull a file path out of a tool input.
 *
 * @param input - Raw tool input of unknown shape
 * @returns `file_path` if set, otherwise `path`; `undefined` when the input
 *   does not validate against `ToolInputSchema` or neither field is present
 */
export const extractFilePath = (input: unknown): string | undefined => {
  const parsed = ToolInputSchema.safeParse(input)
  return parsed.success ? (parsed.data.file_path ?? parsed.data.path) : undefined
}
160
+
161
/**
 * Pull written/edited content out of a tool input.
 *
 * @param input - Raw tool input of unknown shape
 * @returns `content` if set, otherwise `new_string`; `undefined` when the input
 *   does not validate against `ToolInputSchema` or neither field is present
 */
export const extractContent = (input: unknown): string | undefined => {
  const parsed = ToolInputSchema.safeParse(input)
  return parsed.success ? (parsed.data.content ?? parsed.data.new_string) : undefined
}
167
+
168
/**
 * Emit one output line followed by a newline.
 *
 * @param line - Line to write (without trailing newline)
 * @param outputPath - Destination file; when omitted the line is printed to stdout
 * @param append - When truthy the line is appended to the file, otherwise the
 *   file is replaced with just this line
 */
const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(line)
    return
  }

  const payload = `${line}\n`
  if (append) {
    await appendFile(outputPath, payload)
    return
  }
  await Bun.write(outputPath, payload)
}
181
+
182
/**
 * Print a progress message on stderr so stdout stays reserved for results.
 *
 * @param message - Text to print
 * @param showProgress - Nothing is printed unless this is true
 */
const logProgress = (message: string, showProgress: boolean): void => {
  if (!showProgress) return
  console.error(message)
}
188
+
189
/**
 * Resolve a path against the current working directory.
 *
 * @param path - Path to resolve
 * @returns `path` unchanged when it starts with '/', otherwise `process.cwd()` + '/' + `path`
 */
const resolvePath = (path: string): string => (path.startsWith('/') ? path : `${process.cwd()}/${path}`)
194
+
195
+ // ============================================================================
196
+ // Capture Implementation
197
+ // ============================================================================
198
+
199
/**
 * Execute capture with configuration object.
 *
 * @remarks
 * Validates the MCP server configs, loads the prompts, connects to the agent,
 * creates a single session and then runs every prompt sequentially through it.
 * Each result is written as one JSON line as soon as it is available.
 *
 * A failure while prompting or grading is not fatal: it is recorded as a result
 * with an empty trajectory, `toolErrors: true` and the message in `errors`.
 * Errors from MCP config validation, prompt loading, connecting or session
 * creation propagate to the caller. The client is always disconnected.
 *
 * Output file handling: when `append` is false the file is emptied once before
 * the run; every result is then appended, so existing content is preserved
 * when `append` is true.
 *
 * @param config - Capture configuration
 * @returns Array of capture results
 */
export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
  const {
    promptsPath,
    agentCommand,
    outputPath,
    cwd,
    timeout = DEFAULT_HARNESS_TIMEOUT,
    progress = false,
    append = false,
    mcpServers = [],
    grader,
  } = config

  // Parse MCP server configurations
  const parsedMcpServers = mcpServers.map((s) => McpServerSchema.parse(s))

  // Load prompts
  const prompts = await loadPrompts(promptsPath)

  // Resolve output path
  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined

  // Log progress info
  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
  logProgress(`Command: ${agentCommand.join(' ')}`, progress)
  if (resolvedOutputPath) {
    logProgress(`Output: ${resolvedOutputPath}`, progress)
  }
  if (parsedMcpServers.length > 0) {
    logProgress(`MCP Servers: ${parsedMcpServers.map((s) => s.name).join(', ')}`, progress)
  }

  // Create ACP client
  const client = createACPClient({
    command: agentCommand,
    cwd,
    timeout,
  })

  // Clear output file if not appending
  if (resolvedOutputPath && !append) {
    await Bun.write(resolvedOutputPath, '')
  }

  // Session params with MCP servers
  const sessionParams = {
    cwd: cwd ?? process.cwd(),
    mcpServers: parsedMcpServers,
  }

  const results: CaptureResult[] = []

  try {
    logProgress('Connecting to agent...', progress)
    await client.connect()
    logProgress('Connected!', progress)

    // Create session with MCP servers
    const session = await client.createSession(sessionParams)
    logProgress(`Session: ${session.id}`, progress)

    // Run evaluations sequentially
    for (let i = 0; i < prompts.length; i++) {
      const promptCase = prompts[i]
      if (!promptCase) continue

      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${promptCase.input.slice(0, 50)}...`, progress)

      const startTime = Date.now()
      let result: CaptureResult

      try {
        const prompt = createPrompt(promptCase.input)
        const { updates } = await client.promptSync(session.id, prompt)

        const endTime = Date.now()
        const trajectory = extractTrajectory(updates, startTime)
        const output = extractOutput(trajectory)
        const toolErrors = hasToolErrors(trajectory)

        result = {
          id: promptCase.id,
          input: promptCase.input,
          output,
          ...(promptCase.expected && { expected: promptCase.expected }),
          trajectory,
          metadata: {
            ...promptCase.metadata,
            agent: agentCommand.join(' '),
          },
          timing: {
            start: startTime,
            end: endTime,
            firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined,
          },
          toolErrors,
        }

        // Apply grader if provided
        if (grader) {
          result.score = await grader({
            input: promptCase.input,
            output,
            expected: promptCase.expected,
            trajectory,
          })
        }
      } catch (error) {
        const endTime = Date.now()
        const message = error instanceof Error ? error.message : String(error)

        result = {
          id: promptCase.id,
          input: promptCase.input,
          output: '',
          trajectory: [],
          metadata: {
            ...promptCase.metadata,
            agent: agentCommand.join(' '),
          },
          timing: {
            start: startTime,
            end: endTime,
          },
          toolErrors: true,
          errors: [message],
        }
      }

      results.push(result)

      // Write result immediately. Always append: the file was already emptied above
      // when not in append mode, and overwriting on the first result would destroy
      // the existing content that append mode is supposed to keep.
      const formatted = JSON.stringify(result)
      await writeOutput(formatted, resolvedOutputPath, true)

      const statusIcon = result.toolErrors ? '!' : '✓'
      logProgress(` ${statusIcon} (${result.timing.end - result.timing.start}ms)`, progress)
    }
  } finally {
    logProgress('Disconnecting...', progress)
    await client.disconnect()
  }

  logProgress('Done!', progress)
  return results
}
353
+
354
+ // ============================================================================
355
+ // CLI Entry Point
356
+ // ============================================================================
357
+
358
/**
 * Capture command CLI handler.
 *
 * @remarks
 * Parses the arguments, prints help when requested, validates the required
 * positionals (`prompts.jsonl` followed by the agent command), the timeout and
 * the MCP server JSON, optionally loads a grader, then delegates to `runCapture`.
 * Invalid input is reported on stderr and terminates the process with exit code 1.
 *
 * @param args - Command line arguments (after 'capture')
 */
export const capture = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      cwd: { type: 'string', short: 'c' },
      timeout: { type: 'string', short: 't', default: String(DEFAULT_HARNESS_TIMEOUT) },
      progress: { type: 'boolean', default: false },
      append: { type: 'boolean', default: false },
      'mcp-server': { type: 'string', multiple: true },
      grader: { type: 'string', short: 'g' },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: acp-harness capture <prompts.jsonl> <command> [args...] [options]

Arguments:
prompts.jsonl Input file with evaluation prompts
command [args] ACP agent command to execute

Options:
-o, --output Output file (default: stdout)
-c, --cwd Working directory for agent
-t, --timeout Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
--progress Show progress to stderr
--append Append to output file instead of overwriting
--mcp-server MCP server config JSON (repeatable)
-g, --grader Path to grader (.ts/.js module or executable script)
-h, --help Show this help message

Output Format:
Full trajectory JSONL with toolErrors indicator.
Use 'acp-harness summarize' to derive compact views.

Graders:
TS/JS modules must export a 'grade' function.
Executable scripts (Python, etc.) use stdin/stdout JSON protocol.

Examples:
# Basic capture
acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl

# With TypeScript grader
acp-harness capture prompts.jsonl bunx claude-code-acp --grader ./grader.ts -o results.jsonl

# With Python grader
acp-harness capture prompts.jsonl bunx claude-code-acp --grader ./grader.py -o results.jsonl
`)
    return
  }

  const promptsPath = positionals[0]
  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  const agentCommand = positionals.slice(1)
  if (agentCommand.length === 0) {
    console.error('Error: ACP agent command is required')
    console.error('Example: acp-harness capture prompts.jsonl bunx claude-code-acp')
    process.exit(1)
  }

  // Reject a non-numeric or non-positive timeout instead of passing NaN to the client
  const timeout = Number.parseInt(values.timeout ?? String(DEFAULT_HARNESS_TIMEOUT), 10)
  if (!Number.isFinite(timeout) || timeout <= 0) {
    console.error(`Error: --timeout must be a positive number of milliseconds, got '${values.timeout}'`)
    process.exit(1)
  }

  // Load grader if specified
  let grader: Grader | undefined
  if (values.grader) {
    try {
      grader = await loadGrader(values.grader)
    } catch (error) {
      console.error(`Error: ${error instanceof Error ? error.message : error}`)
      process.exit(1)
    }
  }

  // Parse MCP server configurations; malformed JSON is a usage error, not a crash
  const mcpServers: unknown[] = []
  for (const json of values['mcp-server'] ?? []) {
    try {
      mcpServers.push(JSON.parse(json))
    } catch (error) {
      console.error(`Error: invalid --mcp-server JSON: ${error instanceof Error ? error.message : error}`)
      process.exit(1)
    }
  }

  await runCapture({
    promptsPath,
    agentCommand,
    outputPath: values.output,
    cwd: values.cwd,
    timeout,
    progress: values.progress ?? false,
    append: values.append ?? false,
    mcpServers,
    grader,
  })
}
package/src/constants.ts ADDED
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Constants for ACP client and harness operations.
3
+ *
4
+ * @remarks
5
+ * Contains all constant values used across the implementation:
6
+ * - ACP protocol method names and version
7
+ * - JSON-RPC error codes
8
+ * - Harness defaults (timeouts, preview limits)
9
+ *
10
+ * @packageDocumentation
11
+ */
12
+
13
+ // ============================================================================
14
+ // ACP Protocol Methods
15
+ // ============================================================================
16
+
17
+ /** ACP method name strings, grouped by purpose; `as const` keeps every value as a string-literal type */
18
+ export const ACP_METHODS = {
19
+ // Lifecycle
20
+ INITIALIZE: 'initialize',
21
+ SHUTDOWN: 'shutdown',
22
+
23
+ // Sessions (every method name here uses the `session/` prefix)
24
+ CREATE_SESSION: 'session/new',
25
+ LOAD_SESSION: 'session/load',
26
+ PROMPT: 'session/prompt',
27
+ CANCEL: 'session/cancel',
28
+ UPDATE: 'session/update',
29
+ REQUEST_PERMISSION: 'session/request_permission',
30
+ SET_MODEL: 'session/set_model',
31
+
32
+ // Protocol-level (uses the `$/` prefix rather than `session/`)
33
+ CANCEL_REQUEST: '$/cancel_request',
34
+ } as const
35
+
36
+ // ============================================================================
37
+ // ACP Protocol Version
38
+ // ============================================================================
39
+
40
+ /** Current ACP protocol version; kept as a numeric literal because the SDK uses a number type for it */
41
+ export const ACP_PROTOCOL_VERSION = 1 as const
42
+
43
+ // ============================================================================
44
+ // JSON-RPC Error Codes
45
+ // ============================================================================
46
+
47
+ /** JSON-RPC error codes: the five codes predefined by JSON-RPC 2.0, plus REQUEST_CANCELLED (-32800), which is not one of the spec's predefined codes. NOTE(review): -32800 matches the LSP RequestCancelled code; confirm that is the intent */
48
+ export const JSON_RPC_ERRORS = {
49
+ PARSE_ERROR: -32700,
50
+ INVALID_REQUEST: -32600,
51
+ METHOD_NOT_FOUND: -32601,
52
+ INVALID_PARAMS: -32602,
53
+ INTERNAL_ERROR: -32603,
54
+ REQUEST_CANCELLED: -32800,
55
+ } as const
56
+
57
+ // ============================================================================
58
+ // ACP Client Defaults
59
+ // ============================================================================
60
+
61
+ /** Default ACP client name */
62
+ export const DEFAULT_ACP_CLIENT_NAME = 'plaited-acp-client'
63
+
64
+ /** Default timeout for ACP operations in milliseconds (30 seconds) */
65
+ export const DEFAULT_ACP_TIMEOUT = 30000
66
+
67
+ /** Default polling interval for streaming updates in milliseconds (50 ms) */
68
+ export const DEFAULT_POLLING_INTERVAL = 50
69
+
70
+ // ============================================================================
71
+ // Harness Preview Configuration
72
+ // ============================================================================
73
+
74
+ /** Number of lines to show at the head of content previews (default `headLines` of `headTailPreview`) */
75
+ export const HEAD_LINES = 8
76
+
77
+ /** Number of lines to show at the tail of content previews (default `tailLines` of `headTailPreview`) */
78
+ export const TAIL_LINES = 4
79
+
80
+ /** Maximum content length before applying head/tail preview. NOTE(review): the unit (characters vs. lines) is not visible here; confirm at the usage site */
81
+ export const MAX_CONTENT_LENGTH = 500
82
+
83
+ // ============================================================================
84
+ // Harness Defaults
85
+ // ============================================================================
86
+
87
+ /** Default timeout for prompt evaluation in milliseconds (60 seconds); the capture command uses it when no timeout is given */
88
+ export const DEFAULT_HARNESS_TIMEOUT = 60000
89
+
90
+ /** Default number of trials for pass@k analysis */
91
+ export const DEFAULT_TRIAL_COUNT = 5
92
+
93
+ /** Default sample size for calibration */
94
+ export const DEFAULT_CALIBRATION_SAMPLE_SIZE = 10