npm - @plaited/acp-harness - Versions diffs - 0.2.6 → 0.3.1 - Mend

@plaited/acp-harness 0.2.6 → 0.3.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (57) hide show

package/LICENSE +1 -1
package/README.md +120 -16
package/bin/cli.ts +105 -636
package/bin/tests/cli.spec.ts +218 -51
package/package.json +20 -4
package/src/acp-client.ts +5 -4
package/src/acp-transport.ts +14 -7
package/src/adapter-check.ts +542 -0
package/src/adapter-scaffold.ts +934 -0
package/src/balance.ts +232 -0
package/src/calibrate.ts +300 -0
package/src/capture.ts +457 -0
package/src/constants.ts +94 -0
package/src/grader-loader.ts +174 -0
package/src/harness.ts +35 -0
package/src/schemas-cli.ts +239 -0
package/src/schemas.ts +567 -0
package/src/summarize.ts +245 -0
package/src/tests/adapter-check.spec.ts +70 -0
package/src/tests/adapter-scaffold.spec.ts +112 -0
package/src/tests/fixtures/grader-bad-module.ts +5 -0
package/src/tests/fixtures/grader-exec-fail.py +9 -0
package/src/tests/fixtures/grader-exec-invalid.py +6 -0
package/src/tests/fixtures/grader-exec.py +29 -0
package/src/tests/fixtures/grader-module.ts +14 -0
package/src/tests/grader-loader.spec.ts +153 -0
package/src/trials.ts +395 -0
package/src/validate-refs.ts +188 -0
package/.claude/rules/accuracy.md +0 -43
package/.claude/rules/bun-apis.md +0 -80
package/.claude/rules/code-review.md +0 -254
package/.claude/rules/git-workflow.md +0 -37
package/.claude/rules/github.md +0 -154
package/.claude/rules/testing.md +0 -172
package/.claude/skills/acp-harness/SKILL.md +0 -310
package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
package/.claude/skills/acp-harness/references/downstream.md +0 -288
package/.claude/skills/acp-harness/references/output-formats.md +0 -221
package/.claude-plugin/marketplace.json +0 -15
package/.claude-plugin/plugin.json +0 -16
package/.github/CODEOWNERS +0 -6
package/.github/workflows/ci.yml +0 -63
package/.github/workflows/publish.yml +0 -146
package/.mcp.json +0 -20
package/CLAUDE.md +0 -92
package/Dockerfile.test +0 -23
package/biome.json +0 -96
package/bun.lock +0 -513
package/docker-compose.test.yml +0 -21
package/scripts/bun-test-wrapper.sh +0 -46
package/src/acp.constants.ts +0 -56
package/src/acp.schemas.ts +0 -161
package/src/acp.types.ts +0 -28
package/src/tests/fixtures/.claude/settings.local.json +0 -8
package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
package/tsconfig.json +0 -32

package/bin/cli.ts CHANGED Viewed

@@ -1,670 +1,139 @@
 #!/usr/bin/env bun
 /**
- * Execute evaluation prompts against an ACP agent.
+ * ACP Harness CLI - Agent evaluation toolkit.
  *
  * @remarks
- * Connects to an ACP-compatible agent (Claude Code, Droid, etc.) and
- * runs evaluation prompts, capturing full trajectories for analysis.
+ * Router for harness commands. Thin wrapper that delegates to command modules.
  *
- * Usage:
- *   acp-harness <prompts.jsonl> --command <cmd> -o <results.jsonl>
+ * Commands:
+ * - capture: Core trajectory capture
+ * - trials: Multi-run pass@k/pass^k analysis
+ * - summarize: Derive compact views from results
+ * - calibrate: Sample failures for grader review
+ * - validate-refs: Check reference solutions
+ * - balance: Analyze test set coverage
+ * - schemas: Export JSON schemas for non-TS users
+ * - adapter:scaffold: Scaffold new ACP adapter project
+ * - adapter:check: Validate adapter ACP compliance
  */
-import { appendFile } from 'node:fs/promises'
-import { parseArgs } from 'node:util'
-import type { PlanEntry, SessionNotification, ToolCall } from '@agentclientprotocol/sdk'
-import { z } from 'zod'
-import { createACPClient, createPrompt } from '../src/acp.ts'
+import { adapterCheck } from '../src/adapter-check.ts'
+import { adapterScaffold } from '../src/adapter-scaffold.ts'
+import { balance } from '../src/balance.ts'
+import { calibrate } from '../src/calibrate.ts'
+import { capture } from '../src/capture.ts'
+import { schemasCli } from '../src/schemas-cli.ts'
+import { summarize } from '../src/summarize.ts'
+import { trials } from '../src/trials.ts'
+import { validateRefs } from '../src/validate-refs.ts'
-// ============================================================================
-// Schemas (SDK-compatible MCP server format)
-// ============================================================================
+const [command, ...args] = Bun.argv.slice(2)
-const EnvVariableSchema = z.object({
-  name: z.string(),
-  value: z.string(),
-})
-const HttpHeaderSchema = z.object({
-  name: z.string(),
-  value: z.string(),
-})
-const McpServerStdioSchema = z.object({
-  type: z.literal('stdio').optional(),
-  name: z.string(),
-  command: z.string(),
-  args: z.array(z.string()),
-  env: z.array(EnvVariableSchema),
-})
-const McpServerHttpSchema = z.object({
-  type: z.literal('http'),
-  name: z.string(),
-  url: z.string(),
-  headers: z.array(HttpHeaderSchema),
-})
-const McpServerSchema = z.union([McpServerStdioSchema, McpServerHttpSchema])
-const PromptCaseSchema = z.object({
-  id: z.string(),
-  input: z.string(),
-  expected: z.string().optional(),
-  metadata: z.record(z.string(), z.unknown()).optional(),
-  timeout: z.number().optional(),
-})
-const ToolInputSchema = z
-  .object({
-    file_path: z.string().optional(),
-    path: z.string().optional(),
-    content: z.string().optional(),
-    new_string: z.string().optional(),
-  })
-  .passthrough()
-// ============================================================================
-// Types
-// ============================================================================
-type McpServerConfig = z.infer<typeof McpServerSchema>
-type PromptCase = z.infer<typeof PromptCaseSchema>
-/** Trajectory step types */
-type TrajectoryStep =
-  | { type: 'thought'; content: string; timestamp: number }
-  | { type: 'message'; content: string; timestamp: number }
-  | {
-      type: 'tool_call'
-      name: string
-      status: string
-      input?: unknown
-      output?: unknown
-      duration?: number
-      timestamp: number
-    }
-  | { type: 'plan'; entries: PlanEntry[]; timestamp: number }
-/** Full output format */
-type FullResult = {
-  id: string
-  input: string
-  output: string
-  expected?: string
-  trajectory: TrajectoryStep[]
-  metadata: Record<string, unknown>
-  timing: {
-    start: number
-    end: number
-    firstResponse?: number
-  }
-  status: 'passed' | 'failed' | 'error' | 'timeout'
-  errors?: string[]
-}
-/** Summary output format */
-type SummaryResult = {
-  id: string
-  input: string
-  output: string
-  toolCalls: string[]
-  status: 'passed' | 'failed' | 'error' | 'timeout'
-  duration: number
-}
-type OutputFormat = 'summary' | 'judge'
-/** Step with unique ID for judge format correlation */
-type IndexedStep = TrajectoryStep & { stepId: string }
-// ============================================================================
-// Argument Parsing
-// ============================================================================
-const { values, positionals } = parseArgs({
-  args: Bun.argv.slice(2),
-  options: {
-    command: {
-      type: 'string',
-    },
-    cmd: {
-      type: 'string',
-    },
-    output: {
-      type: 'string',
-      short: 'o',
-    },
-    cwd: {
-      type: 'string',
-      short: 'c',
-    },
-    timeout: {
-      type: 'string',
-      short: 't',
-      default: '60000',
-    },
-    format: {
-      type: 'string',
-      short: 'f',
-      default: 'summary',
-    },
-    progress: {
-      type: 'boolean',
-      default: false,
-    },
-    append: {
-      type: 'boolean',
-      default: false,
-    },
-    'mcp-server': {
-      type: 'string',
-      multiple: true,
-    },
-    help: {
-      type: 'boolean',
-      short: 'h',
-    },
-  },
-  allowPositionals: true,
-})
-if (values.help || positionals.length === 0) {
+const printHelp = () => {
   // biome-ignore lint/suspicious/noConsole: CLI help output
   console.log(`
-Usage: acp-harness <prompts.jsonl> [options]
-Arguments:
-  prompts.jsonl     Input file with evaluation prompts
+acp-harness - CLI tool for agent evaluation
-Options:
-  --cmd, --command  ACP agent command (default: "claude-code-acp")
-  -o, --output      Output file (default: stdout)
-  -c, --cwd         Working directory for agent
-  -t, --timeout     Request timeout in ms (default: 60000)
-  -f, --format      Output format: summary, judge (default: summary)
-  --progress        Show progress to stderr
-  --append          Append to output file instead of overwriting
-  --mcp-server      MCP server config JSON (repeatable)
-  -h, --help        Show this help message
+Commands:
+  capture          Capture trajectories from ACP agent
+  trials           Run prompts multiple times for pass@k/pass^k metrics
+  summarize        Derive compact views from results
+  calibrate        Sample failures for grader review
+  validate-refs    Check reference solutions against grader
+  balance          Analyze test set coverage
+  schemas          Export JSON schemas for non-TypeScript users
+  adapter:scaffold Scaffold a new ACP adapter project
+  adapter:check    Validate adapter ACP compliance
-Input Format (JSONL):
-  {"id":"test-001","input":"Create a button","expected":"should contain <button>","metadata":{"category":"ui"}}
-Output Formats:
-  summary - Minimal JSONL: id, input, output, toolCalls, status, duration
-  judge   - Two-tier output:
-            1. Markdown with step IDs and head/tail previews → <output>.md
-            2. Full trajectory JSONL for reference → <output>.full.jsonl
+Run 'acp-harness <command> --help' for command-specific help.
 Examples:
-  # Using the default claude-code-acp adapter
-  acp-harness prompts.jsonl -o results.jsonl
+  # Basic capture
+  acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
-  # Using bunx to run an adapter
-  acp-harness prompts.jsonl --cmd "bunx claude-code-acp" -o results.jsonl
+  # With grader
+  acp-harness capture prompts.jsonl bunx claude-code-acp --grader ./grader.ts -o results.jsonl
-  # Using a local adapter script
-  acp-harness prompts.jsonl --cmd "bun ./my-adapter.ts" -o results.jsonl
+  # Multi-run trials
+  acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 --grader ./grader.ts -o trials.jsonl
-  # With judge format for LLM evaluation
-  acp-harness prompts.jsonl --cmd "bunx claude-code-acp" --format judge -o results
+  # Derive summary view
+  acp-harness summarize results.jsonl -o summary.jsonl
-Note: Requires an ACP-compatible agent. For Claude Code, install the adapter:
-  npm install -g @zed-industries/claude-code-acp
-  ANTHROPIC_API_KEY=sk-... acp-harness prompts.jsonl -o results.jsonl
-`)
-  process.exit(values.help ? 0 : 1)
-}
+  # Export schemas
+  acp-harness schemas --json -o schemas.json
-// ============================================================================
-// Helpers
-// ============================================================================
-/** Parse command string into command array */
-const parseCommand = (cmd: string): string[] => {
-  return cmd.split(/\s+/).filter(Boolean)
-}
-/** Parse MCP server config from JSON string (SDK-compatible format) */
-const parseMcpServerConfig = (json: string): McpServerConfig => {
-  return McpServerSchema.parse(JSON.parse(json))
-}
-/** Load prompts from JSONL file */
-const loadPrompts = async (path: string): Promise<PromptCase[]> => {
-  const content = await Bun.file(path).text()
-  return content
-    .trim()
-    .split('\n')
-    .filter(Boolean)
-    .map((line, index) => {
-      try {
-        return PromptCaseSchema.parse(JSON.parse(line))
-      } catch (error) {
-        throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
-      }
-    })
-}
-/** Extract trajectory from session notifications */
-const extractTrajectory = (notifications: SessionNotification[], startTime: number): TrajectoryStep[] => {
-  const trajectory: TrajectoryStep[] = []
-  const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()
-  for (const notification of notifications) {
-    const timestamp = Date.now() - startTime
-    const update = notification.update
-    if (update.sessionUpdate === 'agent_thought_chunk' && update.content.type === 'text') {
-      trajectory.push({
-        type: 'thought',
-        content: update.content.text,
-        timestamp,
-      })
-    } else if (update.sessionUpdate === 'agent_message_chunk' && update.content.type === 'text') {
-      trajectory.push({
-        type: 'message',
-        content: update.content.text,
-        timestamp,
-      })
-    } else if (update.sessionUpdate === 'tool_call') {
-      const toolCall = update as ToolCall
-      const existing = toolCallMap.get(toolCall.toolCallId)
-      if (existing) {
-        // Update existing tool call with completion info
-        existing.step.status = toolCall.status ?? 'pending'
-        if (toolCall.content) {
-          existing.step.output = toolCall.content
-        }
-        if (toolCall.rawOutput) {
-          existing.step.output = toolCall.rawOutput
-        }
-        existing.step.duration = timestamp - existing.start
-      } else {
-        // New tool call
-        const step: TrajectoryStep & { type: 'tool_call' } = {
-          type: 'tool_call',
-          name: toolCall.title,
-          status: toolCall.status ?? 'pending',
-          input: toolCall.rawInput,
-          timestamp,
-        }
-        toolCallMap.set(toolCall.toolCallId, { start: timestamp, step })
-        trajectory.push(step)
-      }
-    } else if (update.sessionUpdate === 'plan') {
-      trajectory.push({
-        type: 'plan',
-        entries: update.entries,
-        timestamp,
-      })
-    }
-  }
-  return trajectory
-}
-/** Extract final text output from trajectory */
-const extractOutput = (trajectory: TrajectoryStep[]): string => {
-  return trajectory
-    .filter((step): step is TrajectoryStep & { type: 'message' } => step.type === 'message')
-    .map((step) => step.content)
-    .join('\n')
-}
-/** Check if any tool calls failed */
-const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => {
-  return trajectory.some((step) => step.type === 'tool_call' && step.status === 'failed')
-}
-/** Head/tail preview configuration */
-const HEAD_LINES = 8
-const TAIL_LINES = 4
-const MAX_CONTENT_LENGTH = 500
-/** Extract head and tail lines from content */
-const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => {
-  const lines = content.split('\n')
-  if (lines.length <= headLines + tailLines) {
-    return content
-  }
-  const head = lines.slice(0, headLines).join('\n')
-  const tail = lines.slice(-tailLines).join('\n')
-  const omitted = lines.length - headLines - tailLines
-  return `${head}\n\n// ... ${omitted} lines omitted ...\n\n${tail}`
-}
-/** Extract file path from tool input if present */
-const extractFilePath = (input: unknown): string | undefined => {
-  const result = ToolInputSchema.safeParse(input)
-  if (!result.success) return undefined
-  return result.data.file_path ?? result.data.path
-}
-/** Extract content from tool input if present */
-const extractContent = (input: unknown): string | undefined => {
-  const result = ToolInputSchema.safeParse(input)
-  if (!result.success) return undefined
-  return result.data.content ?? result.data.new_string
-}
-/** Format result as summary JSONL */
-const formatSummary = (result: FullResult): string => {
-  const summary: SummaryResult = {
-    id: result.id,
-    input: result.input,
-    output: result.output,
-    toolCalls: result.trajectory.filter((s) => s.type === 'tool_call').map((s) => (s as { name: string }).name),
-    status: result.status,
-    duration: result.timing.end - result.timing.start,
-  }
-  return JSON.stringify(summary)
-}
+  # Scaffold new adapter
+  acp-harness adapter:scaffold my-agent -o ./adapters/my-agent
-/** Format result as judge markdown with step IDs */
-const formatJudgeMarkdown = (result: FullResult): string => {
-  const lines: string[] = [
-    `## Evaluation Record: ${result.id}`,
-    '',
-    `**Input:** ${result.input}`,
-    '',
-    '**Trajectory:**',
-  ]
+  # Validate adapter compliance
+  acp-harness adapter:check bun ./my-adapter/src/index.ts
-  let stepNum = 1
-  for (const step of result.trajectory) {
-    const stepId = `${result.id}-step-${stepNum}`
-    if (step.type === 'thought') {
-      const preview = step.content.slice(0, 100)
-      const truncated = step.content.length > 100 ? '...' : ''
-      lines.push(`${stepNum}. [THOUGHT] ${preview}${truncated} [→${stepId}]`)
-      stepNum++
-    } else if (step.type === 'tool_call') {
-      const duration = step.duration ? ` (${step.duration}ms)` : ''
-      const filePath = extractFilePath(step.input)
-      const content = extractContent(step.input)
-      lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)
-      // Add file path if present
-      if (filePath) {
-        const charCount = content?.length ?? 0
-        lines.push(`   File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
-      }
-      // Add head/tail preview for content-producing tools
-      if (content && content.length > 0) {
-        const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content) : content
-        // Detect file extension for syntax highlighting
-        const ext = filePath?.split('.').pop() ?? 'typescript'
-        lines.push(`   \`\`\`${ext}`)
-        lines.push(`   ${preview.split('\n').join('\n   ')}`)
-        lines.push(`   \`\`\``)
-      }
-      stepNum++
-    } else if (step.type === 'plan') {
-      const planSummary = step.entries.map((e) => `${e.content}: ${e.status}`).join(', ')
-      const truncated = planSummary.length > 80 ? '...' : ''
-      lines.push(`${stepNum}. [PLAN] ${planSummary.slice(0, 80)}${truncated} [→${stepId}]`)
-      stepNum++
-    } else if (step.type === 'message') {
-      const preview = step.content.slice(0, 100)
-      const truncated = step.content.length > 100 ? '...' : ''
-      lines.push(`${stepNum}. [MESSAGE] ${preview}${truncated} [→${stepId}]`)
-      stepNum++
-    }
-  }
-  lines.push('')
-  const outputPreview = result.output.slice(0, 200)
-  const outputTruncated = result.output.length > 200 ? '...' : ''
-  lines.push(`**Output:** ${outputPreview}${outputTruncated}`)
-  lines.push('')
-  const metadataStr = Object.entries(result.metadata)
-    .map(([k, v]) => `${k}=${v}`)
-    .join(', ')
-  lines.push(`**Metadata:** ${metadataStr}`)
-  lines.push(`**Status:** ${result.status}`)
-  lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)
-  lines.push('')
-  lines.push('---')
-  lines.push('')
-  return lines.join('\n')
-}
-/** Add step IDs to trajectory for full JSONL output */
-const addStepIds = (result: FullResult): FullResult & { trajectory: IndexedStep[] } => {
-  let stepNum = 1
-  const indexedTrajectory = result.trajectory.map((step) => ({
-    ...step,
-    stepId: `${result.id}-step-${stepNum++}`,
-  }))
-  return { ...result, trajectory: indexedTrajectory }
-}
-/** Format result based on output format (returns markdown for judge, JSONL for summary) */
-const formatResult = (result: FullResult, format: OutputFormat): string => {
-  if (format === 'summary') {
-    return formatSummary(result)
-  }
-  // Judge format returns markdown
-  return formatJudgeMarkdown(result)
-}
-/** Format result as full JSONL with step IDs (for judge format's paired file) */
-const formatFullWithStepIds = (result: FullResult): string => {
-  return JSON.stringify(addStepIds(result))
-}
-/** Write output line (to stdout or file) */
-const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
-  if (outputPath) {
-    if (append) {
-      await appendFile(outputPath, `${line}\n`)
-    } else {
-      await Bun.write(outputPath, `${line}\n`)
-    }
-  } else {
-    // biome-ignore lint/suspicious/noConsole: CLI stdout output
-    console.log(line)
-  }
-}
-/** Log progress to stderr (doesn't pollute stdout) */
-const logProgress = (message: string, showProgress: boolean): void => {
-  if (showProgress) {
-    console.error(message)
-  }
-}
-/** Resolve path relative to process.cwd() */
-const resolvePath = (path: string): string => {
-  if (path.startsWith('/')) return path
-  return `${process.cwd()}/${path}`
+Documentation: https://github.com/plaited/acp-harness
+`)
 }
-// ============================================================================
-// Main
-// ============================================================================
 const main = async () => {
-  const promptsPath = positionals[0]
-  if (!promptsPath) {
-    console.error('Error: prompts.jsonl path is required')
-    process.exit(1)
-  }
-  const agentCommand = parseCommand(values.cmd ?? values.command ?? 'claude-code-acp')
-  const outputPath = values.output
-  const timeout = Number.parseInt(values.timeout ?? '60000', 10)
-  const cwd = values.cwd
-  const format = (values.format ?? 'summary') as OutputFormat
-  const showProgress = values.progress ?? false
-  const appendOutput = values.append ?? false
-  // Validate format
-  if (!['summary', 'judge'].includes(format)) {
-    console.error(`Error: Invalid format "${format}". Must be: summary, judge`)
-    process.exit(1)
-  }
-  // Judge format requires output path (creates two files)
-  if (format === 'judge' && !outputPath) {
-    console.error('Error: --format judge requires --output <path> (creates <path>.md and <path>.full.jsonl)')
-    process.exit(1)
-  }
-  // Parse MCP server configurations (already SDK-compatible format)
-  const mcpServers = (values['mcp-server'] ?? []).map(parseMcpServerConfig)
-  // Load prompts
-  const prompts = await loadPrompts(promptsPath)
-  // Resolve output path relative to process.cwd()
-  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
-  // Compute output paths for judge format (creates two files)
-  const judgeMarkdownPath = format === 'judge' && resolvedOutputPath ? `${resolvedOutputPath}.md` : undefined
-  const judgeFullPath = format === 'judge' && resolvedOutputPath ? `${resolvedOutputPath}.full.jsonl` : undefined
-  // Log progress info
-  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, showProgress)
-  logProgress(`Command: ${agentCommand.join(' ')}`, showProgress)
-  logProgress(`Format: ${format}`, showProgress)
-  if (format === 'judge') {
-    logProgress(`Output: ${judgeMarkdownPath} + ${judgeFullPath}`, showProgress)
-  } else if (resolvedOutputPath) {
-    logProgress(`Output: ${resolvedOutputPath}`, showProgress)
-  }
-  if (mcpServers.length > 0) {
-    logProgress(`MCP Servers: ${mcpServers.map((s) => s.name).join(', ')}`, showProgress)
-  }
-  // Create ACP client
-  const client = createACPClient({
-    command: agentCommand,
-    cwd,
-    timeout,
-  })
-  // Clear output file(s) if not appending
-  if (resolvedOutputPath && !appendOutput) {
-    if (format === 'judge') {
-      await Bun.write(judgeMarkdownPath!, '')
-      await Bun.write(judgeFullPath!, '')
-    } else {
-      await Bun.write(resolvedOutputPath, '')
+  switch (command) {
+    case 'capture':
+      await capture(args)
+      break
+    case 'trials':
+      await trials(args)
+      break
+    case 'summarize':
+      await summarize(args)
+      break
+    case 'calibrate':
+      await calibrate(args)
+      break
+    case 'validate-refs':
+      await validateRefs(args)
+      break
+    case 'balance':
+      await balance(args)
+      break
+    case 'schemas':
+      await schemasCli(args)
+      break
+    case 'adapter:scaffold':
+      await adapterScaffold(args)
+      break
+    case 'adapter:check':
+      await adapterCheck(args)
+      break
+    case '-h':
+    case '--help':
+    case undefined:
+      printHelp()
+      break
+    case '-v':
+    case '--version': {
+      const { version } = await import('../package.json')
+      // biome-ignore lint/suspicious/noConsole: CLI version output
+      console.log(version)
+      break
     }
-  }
-  // Session params with MCP servers
-  const sessionParams = {
-    cwd: cwd ?? process.cwd(),
-    mcpServers,
+    default:
+      console.error(`Unknown command: ${command}`)
+      console.error("Run 'acp-harness --help' for usage")
+      process.exit(1)
   }
-  let isFirstOutput = true
-  try {
-    logProgress('Connecting to agent...', showProgress)
-    await client.connect()
-    logProgress('Connected!', showProgress)
-    // Create session with MCP servers
-    const session = await client.createSession(sessionParams)
-    logProgress(`Session: ${session.id}`, showProgress)
-    // Run evaluations sequentially
-    for (let i = 0; i < prompts.length; i++) {
-      const promptCase = prompts[i]
-      if (!promptCase) continue
-      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${promptCase.input.slice(0, 50)}...`, showProgress)
-      const startTime = Date.now()
-      let result: FullResult
-      try {
-        const prompt = createPrompt(promptCase.input)
-        const { updates } = await client.promptSync(session.id, prompt)
-        const endTime = Date.now()
-        const trajectory = extractTrajectory(updates, startTime)
-        const output = extractOutput(trajectory)
-        const hasErrors = hasToolErrors(trajectory)
-        result = {
-          id: promptCase.id,
-          input: promptCase.input,
-          output,
-          ...(promptCase.expected && { expected: promptCase.expected }),
-          trajectory,
-          metadata: {
-            ...promptCase.metadata,
-            agent: agentCommand.join(' '),
-          },
-          timing: {
-            start: startTime,
-            end: endTime,
-            firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined,
-          },
-          status: hasErrors ? 'failed' : 'passed',
-        }
-      } catch (error) {
-        const endTime = Date.now()
-        const message = error instanceof Error ? error.message : String(error)
-        const isTimeout = message.includes('timeout') || message.includes('timed out')
-        result = {
-          id: promptCase.id,
-          input: promptCase.input,
-          output: '',
-          trajectory: [],
-          metadata: {
-            ...promptCase.metadata,
-            agent: agentCommand.join(' '),
-          },
-          timing: {
-            start: startTime,
-            end: endTime,
-          },
-          status: isTimeout ? 'timeout' : 'error',
-          errors: [message],
-        }
-      }
-      // Format and output result
-      if (format === 'judge') {
-        // Judge format: write markdown to .md, full JSONL to .full.jsonl
-        const markdown = formatJudgeMarkdown(result)
-        const fullJsonl = formatFullWithStepIds(result)
-        await writeOutput(markdown, judgeMarkdownPath, !isFirstOutput)
-        await writeOutput(fullJsonl, judgeFullPath, !isFirstOutput)
-      } else {
-        // Summary format: write to single file
-        const formatted = formatResult(result, format)
-        await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
-      }
-      isFirstOutput = false
-      const statusIcon = result.status === 'passed' ? '✓' : result.status === 'failed' ? '✗' : '!'
-      logProgress(`  ${statusIcon} ${result.status} (${result.timing.end - result.timing.start}ms)`, showProgress)
-    }
-  } finally {
-    logProgress('Disconnecting...', showProgress)
-    await client.disconnect()
-  }
-  logProgress('Done!', showProgress)
 }
 main().catch((error) => {
-  console.error('Error:', error)
+  console.error('Error:', error instanceof Error ? error.message : error)
   process.exit(1)
 })