@plaited/acp-harness 0.2.6 → 0.3.1

This diff shows the changes between two publicly released versions of this package, as those versions appear in the public registry they were published to. It is provided for informational purposes only.
Files changed (57)
  1. package/LICENSE +1 -1
  2. package/README.md +120 -16
  3. package/bin/cli.ts +105 -636
  4. package/bin/tests/cli.spec.ts +218 -51
  5. package/package.json +20 -4
  6. package/src/acp-client.ts +5 -4
  7. package/src/acp-transport.ts +14 -7
  8. package/src/adapter-check.ts +542 -0
  9. package/src/adapter-scaffold.ts +934 -0
  10. package/src/balance.ts +232 -0
  11. package/src/calibrate.ts +300 -0
  12. package/src/capture.ts +457 -0
  13. package/src/constants.ts +94 -0
  14. package/src/grader-loader.ts +174 -0
  15. package/src/harness.ts +35 -0
  16. package/src/schemas-cli.ts +239 -0
  17. package/src/schemas.ts +567 -0
  18. package/src/summarize.ts +245 -0
  19. package/src/tests/adapter-check.spec.ts +70 -0
  20. package/src/tests/adapter-scaffold.spec.ts +112 -0
  21. package/src/tests/fixtures/grader-bad-module.ts +5 -0
  22. package/src/tests/fixtures/grader-exec-fail.py +9 -0
  23. package/src/tests/fixtures/grader-exec-invalid.py +6 -0
  24. package/src/tests/fixtures/grader-exec.py +29 -0
  25. package/src/tests/fixtures/grader-module.ts +14 -0
  26. package/src/tests/grader-loader.spec.ts +153 -0
  27. package/src/trials.ts +395 -0
  28. package/src/validate-refs.ts +188 -0
  29. package/.claude/rules/accuracy.md +0 -43
  30. package/.claude/rules/bun-apis.md +0 -80
  31. package/.claude/rules/code-review.md +0 -254
  32. package/.claude/rules/git-workflow.md +0 -37
  33. package/.claude/rules/github.md +0 -154
  34. package/.claude/rules/testing.md +0 -172
  35. package/.claude/skills/acp-harness/SKILL.md +0 -310
  36. package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
  37. package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
  38. package/.claude/skills/acp-harness/references/downstream.md +0 -288
  39. package/.claude/skills/acp-harness/references/output-formats.md +0 -221
  40. package/.claude-plugin/marketplace.json +0 -15
  41. package/.claude-plugin/plugin.json +0 -16
  42. package/.github/CODEOWNERS +0 -6
  43. package/.github/workflows/ci.yml +0 -63
  44. package/.github/workflows/publish.yml +0 -146
  45. package/.mcp.json +0 -20
  46. package/CLAUDE.md +0 -92
  47. package/Dockerfile.test +0 -23
  48. package/biome.json +0 -96
  49. package/bun.lock +0 -513
  50. package/docker-compose.test.yml +0 -21
  51. package/scripts/bun-test-wrapper.sh +0 -46
  52. package/src/acp.constants.ts +0 -56
  53. package/src/acp.schemas.ts +0 -161
  54. package/src/acp.types.ts +0 -28
  55. package/src/tests/fixtures/.claude/settings.local.json +0 -8
  56. package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
  57. package/tsconfig.json +0 -32
package/bin/cli.ts CHANGED
@@ -1,670 +1,139 @@
1
1
  #!/usr/bin/env bun
2
2
 
3
3
  /**
4
- * Execute evaluation prompts against an ACP agent.
4
+ * ACP Harness CLI - Agent evaluation toolkit.
5
5
  *
6
6
  * @remarks
7
- * Connects to an ACP-compatible agent (Claude Code, Droid, etc.) and
8
- * runs evaluation prompts, capturing full trajectories for analysis.
7
+ * Router for harness commands. Thin wrapper that delegates to command modules.
9
8
  *
10
- * Usage:
11
- * acp-harness <prompts.jsonl> --command <cmd> -o <results.jsonl>
9
+ * Commands:
10
+ * - capture: Core trajectory capture
11
+ * - trials: Multi-run pass@k/pass^k analysis
12
+ * - summarize: Derive compact views from results
13
+ * - calibrate: Sample failures for grader review
14
+ * - validate-refs: Check reference solutions
15
+ * - balance: Analyze test set coverage
16
+ * - schemas: Export JSON schemas for non-TS users
17
+ * - adapter:scaffold: Scaffold new ACP adapter project
18
+ * - adapter:check: Validate adapter ACP compliance
12
19
  */
13
20
 
14
- import { appendFile } from 'node:fs/promises'
15
- import { parseArgs } from 'node:util'
16
- import type { PlanEntry, SessionNotification, ToolCall } from '@agentclientprotocol/sdk'
17
- import { z } from 'zod'
18
- import { createACPClient, createPrompt } from '../src/acp.ts'
21
+ import { adapterCheck } from '../src/adapter-check.ts'
22
+ import { adapterScaffold } from '../src/adapter-scaffold.ts'
23
+ import { balance } from '../src/balance.ts'
24
+ import { calibrate } from '../src/calibrate.ts'
25
+ import { capture } from '../src/capture.ts'
26
+ import { schemasCli } from '../src/schemas-cli.ts'
27
+ import { summarize } from '../src/summarize.ts'
28
+ import { trials } from '../src/trials.ts'
29
+ import { validateRefs } from '../src/validate-refs.ts'
19
30
 
20
- // ============================================================================
21
- // Schemas (SDK-compatible MCP server format)
22
- // ============================================================================
31
+ const [command, ...args] = Bun.argv.slice(2)
23
32
 
24
- const EnvVariableSchema = z.object({
25
- name: z.string(),
26
- value: z.string(),
27
- })
28
-
29
- const HttpHeaderSchema = z.object({
30
- name: z.string(),
31
- value: z.string(),
32
- })
33
-
34
- const McpServerStdioSchema = z.object({
35
- type: z.literal('stdio').optional(),
36
- name: z.string(),
37
- command: z.string(),
38
- args: z.array(z.string()),
39
- env: z.array(EnvVariableSchema),
40
- })
41
-
42
- const McpServerHttpSchema = z.object({
43
- type: z.literal('http'),
44
- name: z.string(),
45
- url: z.string(),
46
- headers: z.array(HttpHeaderSchema),
47
- })
48
-
49
- const McpServerSchema = z.union([McpServerStdioSchema, McpServerHttpSchema])
50
-
51
- const PromptCaseSchema = z.object({
52
- id: z.string(),
53
- input: z.string(),
54
- expected: z.string().optional(),
55
- metadata: z.record(z.string(), z.unknown()).optional(),
56
- timeout: z.number().optional(),
57
- })
58
-
59
- const ToolInputSchema = z
60
- .object({
61
- file_path: z.string().optional(),
62
- path: z.string().optional(),
63
- content: z.string().optional(),
64
- new_string: z.string().optional(),
65
- })
66
- .passthrough()
67
-
68
- // ============================================================================
69
- // Types
70
- // ============================================================================
71
-
72
- type McpServerConfig = z.infer<typeof McpServerSchema>
73
- type PromptCase = z.infer<typeof PromptCaseSchema>
74
-
75
- /** Trajectory step types */
76
- type TrajectoryStep =
77
- | { type: 'thought'; content: string; timestamp: number }
78
- | { type: 'message'; content: string; timestamp: number }
79
- | {
80
- type: 'tool_call'
81
- name: string
82
- status: string
83
- input?: unknown
84
- output?: unknown
85
- duration?: number
86
- timestamp: number
87
- }
88
- | { type: 'plan'; entries: PlanEntry[]; timestamp: number }
89
-
90
- /** Full output format */
91
- type FullResult = {
92
- id: string
93
- input: string
94
- output: string
95
- expected?: string
96
- trajectory: TrajectoryStep[]
97
- metadata: Record<string, unknown>
98
- timing: {
99
- start: number
100
- end: number
101
- firstResponse?: number
102
- }
103
- status: 'passed' | 'failed' | 'error' | 'timeout'
104
- errors?: string[]
105
- }
106
-
107
- /** Summary output format */
108
- type SummaryResult = {
109
- id: string
110
- input: string
111
- output: string
112
- toolCalls: string[]
113
- status: 'passed' | 'failed' | 'error' | 'timeout'
114
- duration: number
115
- }
116
-
117
- type OutputFormat = 'summary' | 'judge'
118
-
119
- /** Step with unique ID for judge format correlation */
120
- type IndexedStep = TrajectoryStep & { stepId: string }
121
-
122
- // ============================================================================
123
- // Argument Parsing
124
- // ============================================================================
125
-
126
- const { values, positionals } = parseArgs({
127
- args: Bun.argv.slice(2),
128
- options: {
129
- command: {
130
- type: 'string',
131
- },
132
- cmd: {
133
- type: 'string',
134
- },
135
- output: {
136
- type: 'string',
137
- short: 'o',
138
- },
139
- cwd: {
140
- type: 'string',
141
- short: 'c',
142
- },
143
- timeout: {
144
- type: 'string',
145
- short: 't',
146
- default: '60000',
147
- },
148
- format: {
149
- type: 'string',
150
- short: 'f',
151
- default: 'summary',
152
- },
153
- progress: {
154
- type: 'boolean',
155
- default: false,
156
- },
157
- append: {
158
- type: 'boolean',
159
- default: false,
160
- },
161
- 'mcp-server': {
162
- type: 'string',
163
- multiple: true,
164
- },
165
- help: {
166
- type: 'boolean',
167
- short: 'h',
168
- },
169
- },
170
- allowPositionals: true,
171
- })
172
-
173
- if (values.help || positionals.length === 0) {
33
+ const printHelp = () => {
174
34
  // biome-ignore lint/suspicious/noConsole: CLI help output
175
35
  console.log(`
176
- Usage: acp-harness <prompts.jsonl> [options]
177
-
178
- Arguments:
179
- prompts.jsonl Input file with evaluation prompts
36
+ acp-harness - CLI tool for agent evaluation
180
37
 
181
- Options:
182
- --cmd, --command ACP agent command (default: "claude-code-acp")
183
- -o, --output Output file (default: stdout)
184
- -c, --cwd Working directory for agent
185
- -t, --timeout Request timeout in ms (default: 60000)
186
- -f, --format Output format: summary, judge (default: summary)
187
- --progress Show progress to stderr
188
- --append Append to output file instead of overwriting
189
- --mcp-server MCP server config JSON (repeatable)
190
- -h, --help Show this help message
38
+ Commands:
39
+ capture Capture trajectories from ACP agent
40
+ trials Run prompts multiple times for pass@k/pass^k metrics
41
+ summarize Derive compact views from results
42
+ calibrate Sample failures for grader review
43
+ validate-refs Check reference solutions against grader
44
+ balance Analyze test set coverage
45
+ schemas Export JSON schemas for non-TypeScript users
46
+ adapter:scaffold Scaffold a new ACP adapter project
47
+ adapter:check Validate adapter ACP compliance
191
48
 
192
- Input Format (JSONL):
193
- {"id":"test-001","input":"Create a button","expected":"should contain <button>","metadata":{"category":"ui"}}
194
-
195
- Output Formats:
196
- summary - Minimal JSONL: id, input, output, toolCalls, status, duration
197
- judge - Two-tier output:
198
- 1. Markdown with step IDs and head/tail previews → <output>.md
199
- 2. Full trajectory JSONL for reference → <output>.full.jsonl
49
+ Run 'acp-harness <command> --help' for command-specific help.
200
50
 
201
51
  Examples:
202
- # Using the default claude-code-acp adapter
203
- acp-harness prompts.jsonl -o results.jsonl
52
+ # Basic capture
53
+ acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
204
54
 
205
- # Using bunx to run an adapter
206
- acp-harness prompts.jsonl --cmd "bunx claude-code-acp" -o results.jsonl
55
+ # With grader
56
+ acp-harness capture prompts.jsonl bunx claude-code-acp --grader ./grader.ts -o results.jsonl
207
57
 
208
- # Using a local adapter script
209
- acp-harness prompts.jsonl --cmd "bun ./my-adapter.ts" -o results.jsonl
58
+ # Multi-run trials
59
+ acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 --grader ./grader.ts -o trials.jsonl
210
60
 
211
- # With judge format for LLM evaluation
212
- acp-harness prompts.jsonl --cmd "bunx claude-code-acp" --format judge -o results
61
+ # Derive summary view
62
+ acp-harness summarize results.jsonl -o summary.jsonl
213
63
 
214
- Note: Requires an ACP-compatible agent. For Claude Code, install the adapter:
215
- npm install -g @zed-industries/claude-code-acp
216
- ANTHROPIC_API_KEY=sk-... acp-harness prompts.jsonl -o results.jsonl
217
- `)
218
- process.exit(values.help ? 0 : 1)
219
- }
64
+ # Export schemas
65
+ acp-harness schemas --json -o schemas.json
220
66
 
221
- // ============================================================================
222
- // Helpers
223
- // ============================================================================
224
-
225
- /** Parse command string into command array */
226
- const parseCommand = (cmd: string): string[] => {
227
- return cmd.split(/\s+/).filter(Boolean)
228
- }
229
-
230
- /** Parse MCP server config from JSON string (SDK-compatible format) */
231
- const parseMcpServerConfig = (json: string): McpServerConfig => {
232
- return McpServerSchema.parse(JSON.parse(json))
233
- }
234
-
235
- /** Load prompts from JSONL file */
236
- const loadPrompts = async (path: string): Promise<PromptCase[]> => {
237
- const content = await Bun.file(path).text()
238
- return content
239
- .trim()
240
- .split('\n')
241
- .filter(Boolean)
242
- .map((line, index) => {
243
- try {
244
- return PromptCaseSchema.parse(JSON.parse(line))
245
- } catch (error) {
246
- throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
247
- }
248
- })
249
- }
250
-
251
- /** Extract trajectory from session notifications */
252
- const extractTrajectory = (notifications: SessionNotification[], startTime: number): TrajectoryStep[] => {
253
- const trajectory: TrajectoryStep[] = []
254
- const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()
255
-
256
- for (const notification of notifications) {
257
- const timestamp = Date.now() - startTime
258
- const update = notification.update
259
-
260
- if (update.sessionUpdate === 'agent_thought_chunk' && update.content.type === 'text') {
261
- trajectory.push({
262
- type: 'thought',
263
- content: update.content.text,
264
- timestamp,
265
- })
266
- } else if (update.sessionUpdate === 'agent_message_chunk' && update.content.type === 'text') {
267
- trajectory.push({
268
- type: 'message',
269
- content: update.content.text,
270
- timestamp,
271
- })
272
- } else if (update.sessionUpdate === 'tool_call') {
273
- const toolCall = update as ToolCall
274
- const existing = toolCallMap.get(toolCall.toolCallId)
275
-
276
- if (existing) {
277
- // Update existing tool call with completion info
278
- existing.step.status = toolCall.status ?? 'pending'
279
- if (toolCall.content) {
280
- existing.step.output = toolCall.content
281
- }
282
- if (toolCall.rawOutput) {
283
- existing.step.output = toolCall.rawOutput
284
- }
285
- existing.step.duration = timestamp - existing.start
286
- } else {
287
- // New tool call
288
- const step: TrajectoryStep & { type: 'tool_call' } = {
289
- type: 'tool_call',
290
- name: toolCall.title,
291
- status: toolCall.status ?? 'pending',
292
- input: toolCall.rawInput,
293
- timestamp,
294
- }
295
- toolCallMap.set(toolCall.toolCallId, { start: timestamp, step })
296
- trajectory.push(step)
297
- }
298
- } else if (update.sessionUpdate === 'plan') {
299
- trajectory.push({
300
- type: 'plan',
301
- entries: update.entries,
302
- timestamp,
303
- })
304
- }
305
- }
306
-
307
- return trajectory
308
- }
309
-
310
- /** Extract final text output from trajectory */
311
- const extractOutput = (trajectory: TrajectoryStep[]): string => {
312
- return trajectory
313
- .filter((step): step is TrajectoryStep & { type: 'message' } => step.type === 'message')
314
- .map((step) => step.content)
315
- .join('\n')
316
- }
317
-
318
- /** Check if any tool calls failed */
319
- const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => {
320
- return trajectory.some((step) => step.type === 'tool_call' && step.status === 'failed')
321
- }
322
-
323
- /** Head/tail preview configuration */
324
- const HEAD_LINES = 8
325
- const TAIL_LINES = 4
326
- const MAX_CONTENT_LENGTH = 500
327
-
328
- /** Extract head and tail lines from content */
329
- const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => {
330
- const lines = content.split('\n')
331
- if (lines.length <= headLines + tailLines) {
332
- return content
333
- }
334
- const head = lines.slice(0, headLines).join('\n')
335
- const tail = lines.slice(-tailLines).join('\n')
336
- const omitted = lines.length - headLines - tailLines
337
- return `${head}\n\n// ... ${omitted} lines omitted ...\n\n${tail}`
338
- }
339
-
340
- /** Extract file path from tool input if present */
341
- const extractFilePath = (input: unknown): string | undefined => {
342
- const result = ToolInputSchema.safeParse(input)
343
- if (!result.success) return undefined
344
- return result.data.file_path ?? result.data.path
345
- }
346
-
347
- /** Extract content from tool input if present */
348
- const extractContent = (input: unknown): string | undefined => {
349
- const result = ToolInputSchema.safeParse(input)
350
- if (!result.success) return undefined
351
- return result.data.content ?? result.data.new_string
352
- }
353
-
354
- /** Format result as summary JSONL */
355
- const formatSummary = (result: FullResult): string => {
356
- const summary: SummaryResult = {
357
- id: result.id,
358
- input: result.input,
359
- output: result.output,
360
- toolCalls: result.trajectory.filter((s) => s.type === 'tool_call').map((s) => (s as { name: string }).name),
361
- status: result.status,
362
- duration: result.timing.end - result.timing.start,
363
- }
364
- return JSON.stringify(summary)
365
- }
67
+ # Scaffold new adapter
68
+ acp-harness adapter:scaffold my-agent -o ./adapters/my-agent
366
69
 
367
- /** Format result as judge markdown with step IDs */
368
- const formatJudgeMarkdown = (result: FullResult): string => {
369
- const lines: string[] = [
370
- `## Evaluation Record: ${result.id}`,
371
- '',
372
- `**Input:** ${result.input}`,
373
- '',
374
- '**Trajectory:**',
375
- ]
70
+ # Validate adapter compliance
71
+ acp-harness adapter:check bun ./my-adapter/src/index.ts
376
72
 
377
- let stepNum = 1
378
- for (const step of result.trajectory) {
379
- const stepId = `${result.id}-step-${stepNum}`
380
-
381
- if (step.type === 'thought') {
382
- const preview = step.content.slice(0, 100)
383
- const truncated = step.content.length > 100 ? '...' : ''
384
- lines.push(`${stepNum}. [THOUGHT] ${preview}${truncated} [→${stepId}]`)
385
- stepNum++
386
- } else if (step.type === 'tool_call') {
387
- const duration = step.duration ? ` (${step.duration}ms)` : ''
388
- const filePath = extractFilePath(step.input)
389
- const content = extractContent(step.input)
390
-
391
- lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)
392
-
393
- // Add file path if present
394
- if (filePath) {
395
- const charCount = content?.length ?? 0
396
- lines.push(` File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
397
- }
398
-
399
- // Add head/tail preview for content-producing tools
400
- if (content && content.length > 0) {
401
- const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content) : content
402
- // Detect file extension for syntax highlighting
403
- const ext = filePath?.split('.').pop() ?? 'typescript'
404
- lines.push(` \`\`\`${ext}`)
405
- lines.push(` ${preview.split('\n').join('\n ')}`)
406
- lines.push(` \`\`\``)
407
- }
408
- stepNum++
409
- } else if (step.type === 'plan') {
410
- const planSummary = step.entries.map((e) => `${e.content}: ${e.status}`).join(', ')
411
- const truncated = planSummary.length > 80 ? '...' : ''
412
- lines.push(`${stepNum}. [PLAN] ${planSummary.slice(0, 80)}${truncated} [→${stepId}]`)
413
- stepNum++
414
- } else if (step.type === 'message') {
415
- const preview = step.content.slice(0, 100)
416
- const truncated = step.content.length > 100 ? '...' : ''
417
- lines.push(`${stepNum}. [MESSAGE] ${preview}${truncated} [→${stepId}]`)
418
- stepNum++
419
- }
420
- }
421
-
422
- lines.push('')
423
- const outputPreview = result.output.slice(0, 200)
424
- const outputTruncated = result.output.length > 200 ? '...' : ''
425
- lines.push(`**Output:** ${outputPreview}${outputTruncated}`)
426
- lines.push('')
427
-
428
- const metadataStr = Object.entries(result.metadata)
429
- .map(([k, v]) => `${k}=${v}`)
430
- .join(', ')
431
- lines.push(`**Metadata:** ${metadataStr}`)
432
- lines.push(`**Status:** ${result.status}`)
433
- lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)
434
- lines.push('')
435
- lines.push('---')
436
- lines.push('')
437
-
438
- return lines.join('\n')
439
- }
440
-
441
- /** Add step IDs to trajectory for full JSONL output */
442
- const addStepIds = (result: FullResult): FullResult & { trajectory: IndexedStep[] } => {
443
- let stepNum = 1
444
- const indexedTrajectory = result.trajectory.map((step) => ({
445
- ...step,
446
- stepId: `${result.id}-step-${stepNum++}`,
447
- }))
448
- return { ...result, trajectory: indexedTrajectory }
449
- }
450
-
451
- /** Format result based on output format (returns markdown for judge, JSONL for summary) */
452
- const formatResult = (result: FullResult, format: OutputFormat): string => {
453
- if (format === 'summary') {
454
- return formatSummary(result)
455
- }
456
- // Judge format returns markdown
457
- return formatJudgeMarkdown(result)
458
- }
459
-
460
- /** Format result as full JSONL with step IDs (for judge format's paired file) */
461
- const formatFullWithStepIds = (result: FullResult): string => {
462
- return JSON.stringify(addStepIds(result))
463
- }
464
-
465
- /** Write output line (to stdout or file) */
466
- const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
467
- if (outputPath) {
468
- if (append) {
469
- await appendFile(outputPath, `${line}\n`)
470
- } else {
471
- await Bun.write(outputPath, `${line}\n`)
472
- }
473
- } else {
474
- // biome-ignore lint/suspicious/noConsole: CLI stdout output
475
- console.log(line)
476
- }
477
- }
478
-
479
- /** Log progress to stderr (doesn't pollute stdout) */
480
- const logProgress = (message: string, showProgress: boolean): void => {
481
- if (showProgress) {
482
- console.error(message)
483
- }
484
- }
485
-
486
- /** Resolve path relative to process.cwd() */
487
- const resolvePath = (path: string): string => {
488
- if (path.startsWith('/')) return path
489
- return `${process.cwd()}/${path}`
73
+ Documentation: https://github.com/plaited/acp-harness
74
+ `)
490
75
  }
491
76
 
492
- // ============================================================================
493
- // Main
494
- // ============================================================================
495
-
496
77
  const main = async () => {
497
- const promptsPath = positionals[0]
498
- if (!promptsPath) {
499
- console.error('Error: prompts.jsonl path is required')
500
- process.exit(1)
501
- }
502
-
503
- const agentCommand = parseCommand(values.cmd ?? values.command ?? 'claude-code-acp')
504
- const outputPath = values.output
505
- const timeout = Number.parseInt(values.timeout ?? '60000', 10)
506
- const cwd = values.cwd
507
- const format = (values.format ?? 'summary') as OutputFormat
508
- const showProgress = values.progress ?? false
509
- const appendOutput = values.append ?? false
510
-
511
- // Validate format
512
- if (!['summary', 'judge'].includes(format)) {
513
- console.error(`Error: Invalid format "${format}". Must be: summary, judge`)
514
- process.exit(1)
515
- }
516
-
517
- // Judge format requires output path (creates two files)
518
- if (format === 'judge' && !outputPath) {
519
- console.error('Error: --format judge requires --output <path> (creates <path>.md and <path>.full.jsonl)')
520
- process.exit(1)
521
- }
522
-
523
- // Parse MCP server configurations (already SDK-compatible format)
524
- const mcpServers = (values['mcp-server'] ?? []).map(parseMcpServerConfig)
525
-
526
- // Load prompts
527
- const prompts = await loadPrompts(promptsPath)
528
-
529
- // Resolve output path relative to process.cwd()
530
- const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
531
-
532
- // Compute output paths for judge format (creates two files)
533
- const judgeMarkdownPath = format === 'judge' && resolvedOutputPath ? `${resolvedOutputPath}.md` : undefined
534
- const judgeFullPath = format === 'judge' && resolvedOutputPath ? `${resolvedOutputPath}.full.jsonl` : undefined
535
-
536
- // Log progress info
537
- logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, showProgress)
538
- logProgress(`Command: ${agentCommand.join(' ')}`, showProgress)
539
- logProgress(`Format: ${format}`, showProgress)
540
- if (format === 'judge') {
541
- logProgress(`Output: ${judgeMarkdownPath} + ${judgeFullPath}`, showProgress)
542
- } else if (resolvedOutputPath) {
543
- logProgress(`Output: ${resolvedOutputPath}`, showProgress)
544
- }
545
- if (mcpServers.length > 0) {
546
- logProgress(`MCP Servers: ${mcpServers.map((s) => s.name).join(', ')}`, showProgress)
547
- }
548
-
549
- // Create ACP client
550
- const client = createACPClient({
551
- command: agentCommand,
552
- cwd,
553
- timeout,
554
- })
555
-
556
- // Clear output file(s) if not appending
557
- if (resolvedOutputPath && !appendOutput) {
558
- if (format === 'judge') {
559
- await Bun.write(judgeMarkdownPath!, '')
560
- await Bun.write(judgeFullPath!, '')
561
- } else {
562
- await Bun.write(resolvedOutputPath, '')
78
+ switch (command) {
79
+ case 'capture':
80
+ await capture(args)
81
+ break
82
+
83
+ case 'trials':
84
+ await trials(args)
85
+ break
86
+
87
+ case 'summarize':
88
+ await summarize(args)
89
+ break
90
+
91
+ case 'calibrate':
92
+ await calibrate(args)
93
+ break
94
+
95
+ case 'validate-refs':
96
+ await validateRefs(args)
97
+ break
98
+
99
+ case 'balance':
100
+ await balance(args)
101
+ break
102
+
103
+ case 'schemas':
104
+ await schemasCli(args)
105
+ break
106
+
107
+ case 'adapter:scaffold':
108
+ await adapterScaffold(args)
109
+ break
110
+
111
+ case 'adapter:check':
112
+ await adapterCheck(args)
113
+ break
114
+
115
+ case '-h':
116
+ case '--help':
117
+ case undefined:
118
+ printHelp()
119
+ break
120
+
121
+ case '-v':
122
+ case '--version': {
123
+ const { version } = await import('../package.json')
124
+ // biome-ignore lint/suspicious/noConsole: CLI version output
125
+ console.log(version)
126
+ break
563
127
  }
564
- }
565
128
 
566
- // Session params with MCP servers
567
- const sessionParams = {
568
- cwd: cwd ?? process.cwd(),
569
- mcpServers,
129
+ default:
130
+ console.error(`Unknown command: ${command}`)
131
+ console.error("Run 'acp-harness --help' for usage")
132
+ process.exit(1)
570
133
  }
571
-
572
- let isFirstOutput = true
573
-
574
- try {
575
- logProgress('Connecting to agent...', showProgress)
576
- await client.connect()
577
- logProgress('Connected!', showProgress)
578
-
579
- // Create session with MCP servers
580
- const session = await client.createSession(sessionParams)
581
- logProgress(`Session: ${session.id}`, showProgress)
582
-
583
- // Run evaluations sequentially
584
- for (let i = 0; i < prompts.length; i++) {
585
- const promptCase = prompts[i]
586
- if (!promptCase) continue
587
-
588
- logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${promptCase.input.slice(0, 50)}...`, showProgress)
589
-
590
- const startTime = Date.now()
591
- let result: FullResult
592
-
593
- try {
594
- const prompt = createPrompt(promptCase.input)
595
- const { updates } = await client.promptSync(session.id, prompt)
596
-
597
- const endTime = Date.now()
598
- const trajectory = extractTrajectory(updates, startTime)
599
- const output = extractOutput(trajectory)
600
- const hasErrors = hasToolErrors(trajectory)
601
-
602
- result = {
603
- id: promptCase.id,
604
- input: promptCase.input,
605
- output,
606
- ...(promptCase.expected && { expected: promptCase.expected }),
607
- trajectory,
608
- metadata: {
609
- ...promptCase.metadata,
610
- agent: agentCommand.join(' '),
611
- },
612
- timing: {
613
- start: startTime,
614
- end: endTime,
615
- firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined,
616
- },
617
- status: hasErrors ? 'failed' : 'passed',
618
- }
619
- } catch (error) {
620
- const endTime = Date.now()
621
- const message = error instanceof Error ? error.message : String(error)
622
- const isTimeout = message.includes('timeout') || message.includes('timed out')
623
-
624
- result = {
625
- id: promptCase.id,
626
- input: promptCase.input,
627
- output: '',
628
- trajectory: [],
629
- metadata: {
630
- ...promptCase.metadata,
631
- agent: agentCommand.join(' '),
632
- },
633
- timing: {
634
- start: startTime,
635
- end: endTime,
636
- },
637
- status: isTimeout ? 'timeout' : 'error',
638
- errors: [message],
639
- }
640
- }
641
-
642
- // Format and output result
643
- if (format === 'judge') {
644
- // Judge format: write markdown to .md, full JSONL to .full.jsonl
645
- const markdown = formatJudgeMarkdown(result)
646
- const fullJsonl = formatFullWithStepIds(result)
647
- await writeOutput(markdown, judgeMarkdownPath, !isFirstOutput)
648
- await writeOutput(fullJsonl, judgeFullPath, !isFirstOutput)
649
- } else {
650
- // Summary format: write to single file
651
- const formatted = formatResult(result, format)
652
- await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
653
- }
654
- isFirstOutput = false
655
-
656
- const statusIcon = result.status === 'passed' ? '✓' : result.status === 'failed' ? '✗' : '!'
657
- logProgress(` ${statusIcon} ${result.status} (${result.timing.end - result.timing.start}ms)`, showProgress)
658
- }
659
- } finally {
660
- logProgress('Disconnecting...', showProgress)
661
- await client.disconnect()
662
- }
663
-
664
- logProgress('Done!', showProgress)
665
134
  }
666
135
 
667
136
  main().catch((error) => {
668
- console.error('Error:', error)
137
+ console.error('Error:', error instanceof Error ? error.message : error)
669
138
  process.exit(1)
670
139
  })