npm - @plaited/acp-harness - Versions diffs - 0.2.5 → 0.3.1 - Mend

@plaited/acp-harness 0.2.5 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

package/LICENSE +1 -1
package/README.md +120 -16
package/bin/cli.ts +105 -636
package/bin/tests/cli.spec.ts +218 -51
package/package.json +20 -4
package/src/acp-client.ts +5 -4
package/src/acp-transport.ts +14 -7
package/src/adapter-check.ts +542 -0
package/src/adapter-scaffold.ts +934 -0
package/src/balance.ts +232 -0
package/src/calibrate.ts +300 -0
package/src/capture.ts +457 -0
package/src/constants.ts +94 -0
package/src/grader-loader.ts +174 -0
package/src/harness.ts +35 -0
package/src/schemas-cli.ts +239 -0
package/src/schemas.ts +567 -0
package/src/summarize.ts +245 -0
package/src/tests/adapter-check.spec.ts +70 -0
package/src/tests/adapter-scaffold.spec.ts +112 -0
package/src/tests/fixtures/grader-bad-module.ts +5 -0
package/src/tests/fixtures/grader-exec-fail.py +9 -0
package/src/tests/fixtures/grader-exec-invalid.py +6 -0
package/src/tests/fixtures/grader-exec.py +29 -0
package/src/tests/fixtures/grader-module.ts +14 -0
package/src/tests/grader-loader.spec.ts +153 -0
package/src/trials.ts +395 -0
package/src/validate-refs.ts +188 -0
package/.claude/rules/accuracy.md +0 -43
package/.claude/rules/bun-apis.md +0 -80
package/.claude/rules/code-review.md +0 -254
package/.claude/rules/git-workflow.md +0 -37
package/.claude/rules/github.md +0 -154
package/.claude/rules/testing.md +0 -172
package/.claude/skills/acp-harness/SKILL.md +0 -310
package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
package/.claude/skills/acp-harness/references/downstream.md +0 -288
package/.claude/skills/acp-harness/references/output-formats.md +0 -221
package/.claude-plugin/marketplace.json +0 -15
package/.claude-plugin/plugin.json +0 -16
package/.github/CODEOWNERS +0 -6
package/.github/workflows/ci.yml +0 -63
package/.github/workflows/publish.yml +0 -146
package/.mcp.json +0 -20
package/CLAUDE.md +0 -92
package/Dockerfile.test +0 -23
package/biome.json +0 -96
package/bun.lock +0 -513
package/docker-compose.test.yml +0 -21
package/scripts/bun-test-wrapper.sh +0 -46
package/src/acp.constants.ts +0 -56
package/src/acp.schemas.ts +0 -161
package/src/acp.types.ts +0 -28
package/src/tests/fixtures/.claude/settings.local.json +0 -8
package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
package/tsconfig.json +0 -32

package/src/capture.ts ADDED Viewed

@@ -0,0 +1,457 @@
+/**
+ * Core trajectory capture command.
+ *
+ * @remarks
+ * Executes prompts against an ACP agent and captures full trajectories.
+ * This is the foundational command - all other views derive from its output.
+ *
+ * Output format is always full trajectory JSONL (`CaptureResultSchema`).
+ * Use `summarize` command to derive compact views.
+ *
+ * @packageDocumentation
+ */
+import { appendFile } from 'node:fs/promises'
+import { resolve } from 'node:path'
+import { parseArgs } from 'node:util'
+import type { SessionNotification, ToolCall } from '@agentclientprotocol/sdk'
+import { createACPClient } from './acp-client.ts'
+import { createPrompt } from './acp-helpers.ts'
+import { DEFAULT_HARNESS_TIMEOUT, HEAD_LINES, TAIL_LINES } from './constants.ts'
+import { loadGrader } from './grader-loader.ts'
+import type { CaptureResult, Grader, PromptCase, TrajectoryStep } from './schemas.ts'
+import { McpServerSchema, PromptCaseSchema, ToolInputSchema } from './schemas.ts'
+// ============================================================================
+// Types
+// ============================================================================
+/**
+ * Configuration accepted by `runCapture`.
+ *
+ * @remarks
+ * Only `promptsPath` and `agentCommand` are required. `runCapture` fills in
+ * defaults for the optional fields when it destructures this object.
+ */
+export type CaptureConfig = {
+  /** Path to the prompts JSONL file; each non-empty line is parsed as one prompt case */
+  promptsPath: string
+  /** ACP agent command and its arguments (e.g., ['bunx', 'claude-code-acp']) */
+  agentCommand: string[]
+  /** Output file path; when undefined each result line is printed to stdout */
+  outputPath?: string
+  /** Working directory for the agent; the session falls back to `process.cwd()` */
+  cwd?: string
+  /** Timeout per prompt in milliseconds (defaults to `DEFAULT_HARNESS_TIMEOUT`) */
+  timeout?: number
+  /** Show progress messages on stderr (defaults to false) */
+  progress?: boolean
+  /** Append to output file instead of overwriting (defaults to false) */
+  append?: boolean
+  /** Raw MCP server configurations; each entry is validated with `McpServerSchema` */
+  mcpServers?: unknown[]
+  /** Optional grader function; when set, its result is stored as the result's `score` */
+  grader?: Grader
+}
+// ============================================================================
+// Helpers
+// ============================================================================
+/**
+ * Load prompt cases from a JSONL file.
+ *
+ * @remarks
+ * Blank and whitespace-only lines are skipped. Each remaining line must be
+ * valid JSON that satisfies `PromptCaseSchema`.
+ *
+ * @param path - Path to the prompts JSONL file
+ * @returns Parsed prompt cases in file order
+ * @throws Error naming the 1-based line number in the file of the first line
+ *   that is not valid JSON or fails schema validation
+ */
+export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
+  const content = await Bun.file(path).text()
+  const prompts: PromptCase[] = []
+  // Iterate over the untouched line list so the index in an error message
+  // matches the real line in the file, even when blank lines precede it.
+  const lines = content.split(/\r?\n/)
+  for (const [index, rawLine] of lines.entries()) {
+    const line = rawLine.trim()
+    if (!line) continue
+    try {
+      prompts.push(PromptCaseSchema.parse(JSON.parse(line)))
+    } catch (error) {
+      throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
+    }
+  }
+  return prompts
+}
+/**
+ * Convert ACP session notifications into ordered trajectory steps.
+ *
+ * @remarks
+ * - Text `agent_thought_chunk` / `agent_message_chunk` updates become
+ *   `thought` / `message` steps.
+ * - A `tool_call` update creates a `tool_call` step. Later `tool_call` or
+ *   `tool_call_update` notifications carrying the same `toolCallId` update
+ *   that step in place (status, output, duration) instead of adding a new one.
+ * - `plan` updates become `plan` steps.
+ * - Every other update is ignored.
+ *
+ * Timestamps are read from `Date.now()` while iterating, so they are offsets
+ * from `startTime` at extraction time, not at the moment each notification
+ * arrived.
+ *
+ * @param notifications - Session notifications collected for one prompt
+ * @param startTime - Epoch milliseconds at which the prompt was started
+ * @returns Trajectory steps in notification order
+ */
+export const extractTrajectory = (notifications: SessionNotification[], startTime: number): TrajectoryStep[] => {
+  const trajectory: TrajectoryStep[] = []
+  const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()
+  for (const notification of notifications) {
+    const timestamp = Date.now() - startTime
+    const update = notification.update
+    if (update.sessionUpdate === 'agent_thought_chunk' && update.content.type === 'text') {
+      trajectory.push({
+        type: 'thought',
+        content: update.content.text,
+        timestamp,
+      })
+    } else if (update.sessionUpdate === 'agent_message_chunk' && update.content.type === 'text') {
+      trajectory.push({
+        type: 'message',
+        content: update.content.text,
+        timestamp,
+      })
+    } else if (update.sessionUpdate === 'tool_call' || update.sessionUpdate === 'tool_call_update') {
+      // Progress and completion arrive as `tool_call_update`, where every field
+      // except the id is optional, so treat the payload as a partial tool call.
+      const toolCall = update as Partial<ToolCall> & Pick<ToolCall, 'toolCallId'>
+      const existing = toolCallMap.get(toolCall.toolCallId)
+      if (existing) {
+        // Only overwrite the status when the update carries one; an update that
+        // just streams content must not reset the step back to 'pending'.
+        if (toolCall.status != null) {
+          existing.step.status = toolCall.status
+        }
+        if (toolCall.content) {
+          existing.step.output = toolCall.content
+        }
+        // rawOutput takes precedence over content when both are present
+        if (toolCall.rawOutput) {
+          existing.step.output = toolCall.rawOutput
+        }
+        existing.step.duration = timestamp - existing.start
+      } else if (update.sessionUpdate === 'tool_call') {
+        // New tool call; an update for an id never announced is ignored
+        const created = update as ToolCall
+        const step: TrajectoryStep & { type: 'tool_call' } = {
+          type: 'tool_call',
+          name: created.title,
+          status: created.status ?? 'pending',
+          input: created.rawInput,
+          timestamp,
+        }
+        toolCallMap.set(created.toolCallId, { start: timestamp, step })
+        trajectory.push(step)
+      }
+    } else if (update.sessionUpdate === 'plan') {
+      trajectory.push({
+        type: 'plan',
+        entries: update.entries,
+        timestamp,
+      })
+    }
+  }
+  return trajectory
+}
+/** Collect the content of every `message` step and join the pieces with newlines */
+export const extractOutput = (trajectory: TrajectoryStep[]): string => {
+  const messages: string[] = []
+  for (const step of trajectory) {
+    if (step.type === 'message') {
+      messages.push(step.content)
+    }
+  }
+  return messages.join('\n')
+}
+/** Report whether at least one `tool_call` step ended with status `failed` */
+export const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => {
+  for (const step of trajectory) {
+    if (step.type !== 'tool_call') continue
+    if (step.status === 'failed') return true
+  }
+  return false
+}
+/**
+ * Build a head/tail preview of multi-line content.
+ *
+ * @remarks
+ * Content with at most `headLines + tailLines` lines is returned unchanged.
+ * Longer content keeps the first `headLines` and the last `tailLines` lines,
+ * separated by a marker stating how many lines were omitted.
+ *
+ * @param content - Text to preview
+ * @param headLines - Lines to keep from the start (defaults to `HEAD_LINES`)
+ * @param tailLines - Lines to keep from the end (defaults to `TAIL_LINES`)
+ * @returns The original content, or head + omission marker + tail
+ */
+export const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => {
+  const lines = content.split('\n')
+  if (lines.length <= headLines + tailLines) {
+    return content
+  }
+  const head = lines.slice(0, headLines).join('\n')
+  // `slice(-0)` equals `slice(0)` and would return every line, so compute the
+  // start index explicitly to get an empty tail when `tailLines` is 0.
+  const tail = lines.slice(lines.length - tailLines).join('\n')
+  const omitted = lines.length - headLines - tailLines
+  return `${head}\n\n// ... ${omitted} lines omitted ...\n\n${tail}`
+}
+/** Return `file_path` (falling back to `path`) when the input satisfies `ToolInputSchema` */
+export const extractFilePath = (input: unknown): string | undefined => {
+  const parsed = ToolInputSchema.safeParse(input)
+  if (parsed.success) {
+    const { file_path: filePath, path } = parsed.data
+    return filePath ?? path
+  }
+  return undefined
+}
+/** Return `content` (falling back to `new_string`) when the input satisfies `ToolInputSchema` */
+export const extractContent = (input: unknown): string | undefined => {
+  const parsed = ToolInputSchema.safeParse(input)
+  if (parsed.success) {
+    const { content, new_string: newString } = parsed.data
+    return content ?? newString
+  }
+  return undefined
+}
+/**
+ * Emit one output line.
+ *
+ * Without `outputPath` the line goes to stdout. With it, the line plus a
+ * trailing newline is appended to the file when `append` is truthy, and
+ * otherwise replaces the file's contents.
+ */
+const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
+  if (!outputPath) {
+    // biome-ignore lint/suspicious/noConsole: CLI stdout output
+    console.log(line)
+    return
+  }
+  const record = `${line}\n`
+  if (append) {
+    await appendFile(outputPath, record)
+    return
+  }
+  await Bun.write(outputPath, record)
+}
+/** Print a progress message on stderr, keeping stdout free for results; no-op unless `showProgress` is set */
+const logProgress = (message: string, showProgress: boolean): void => {
+  if (!showProgress) return
+  console.error(message)
+}
+/**
+ * Resolve a path against `process.cwd()`.
+ *
+ * @remarks
+ * Uses `node:path` `resolve`, so absolute paths are recognised on every
+ * platform (a leading-slash check misses Windows drive paths) and the result
+ * is normalised.
+ *
+ * @param path - Absolute path, or path relative to the current working directory
+ * @returns The absolute path
+ */
+const resolvePath = (path: string): string => {
+  return resolve(process.cwd(), path)
+}
+// ============================================================================
+// Capture Implementation
+// ============================================================================
+/**
+ * Execute capture with configuration object.
+ *
+ * @remarks
+ * Prompts run sequentially inside a single agent session. Each result is
+ * written as one JSON line as soon as it is available. A prompt whose
+ * execution or grading throws is recorded as a result with `toolErrors: true`
+ * and the message in `errors`, and the run continues with the next prompt.
+ * The client is disconnected even when the run fails.
+ *
+ * When `outputPath` is set the file is truncated once up front unless
+ * `append` is true; every result is then appended, so existing content is
+ * preserved in append mode.
+ *
+ * @param config - Capture configuration
+ * @returns Array of capture results
+ * @throws When an MCP server config or prompt line is invalid, or when
+ *   connecting to the agent or creating the session fails
+ */
+export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
+  const {
+    promptsPath,
+    agentCommand,
+    outputPath,
+    cwd,
+    timeout = DEFAULT_HARNESS_TIMEOUT,
+    progress = false,
+    append = false,
+    mcpServers = [],
+    grader,
+  } = config
+  // Parse MCP server configurations
+  const parsedMcpServers = mcpServers.map((s) => McpServerSchema.parse(s))
+  // Load prompts
+  const prompts = await loadPrompts(promptsPath)
+  // Resolve output path
+  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
+  // Log progress info
+  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
+  logProgress(`Command: ${agentCommand.join(' ')}`, progress)
+  if (resolvedOutputPath) {
+    logProgress(`Output: ${resolvedOutputPath}`, progress)
+  }
+  if (parsedMcpServers.length > 0) {
+    logProgress(`MCP Servers: ${parsedMcpServers.map((s) => s.name).join(', ')}`, progress)
+  }
+  // Create ACP client
+  const client = createACPClient({
+    command: agentCommand,
+    cwd,
+    timeout,
+  })
+  // Clear output file if not appending
+  if (resolvedOutputPath && !append) {
+    await Bun.write(resolvedOutputPath, '')
+  }
+  // Session params with MCP servers
+  const sessionParams = {
+    cwd: cwd ?? process.cwd(),
+    mcpServers: parsedMcpServers,
+  }
+  const results: CaptureResult[] = []
+  try {
+    logProgress('Connecting to agent...', progress)
+    await client.connect()
+    logProgress('Connected!', progress)
+    // Create session with MCP servers
+    const session = await client.createSession(sessionParams)
+    logProgress(`Session: ${session.id}`, progress)
+    // Run evaluations sequentially
+    for (let i = 0; i < prompts.length; i++) {
+      const promptCase = prompts[i]
+      if (!promptCase) continue
+      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${promptCase.input.slice(0, 50)}...`, progress)
+      const startTime = Date.now()
+      let result: CaptureResult
+      try {
+        const prompt = createPrompt(promptCase.input)
+        const { updates } = await client.promptSync(session.id, prompt)
+        const endTime = Date.now()
+        const trajectory = extractTrajectory(updates, startTime)
+        const output = extractOutput(trajectory)
+        const toolErrors = hasToolErrors(trajectory)
+        result = {
+          id: promptCase.id,
+          input: promptCase.input,
+          output,
+          ...(promptCase.expected && { expected: promptCase.expected }),
+          trajectory,
+          metadata: {
+            ...promptCase.metadata,
+            agent: agentCommand.join(' '),
+          },
+          timing: {
+            start: startTime,
+            end: endTime,
+            firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined,
+          },
+          toolErrors,
+        }
+        // Apply grader if provided
+        if (grader) {
+          result.score = await grader({
+            input: promptCase.input,
+            output,
+            expected: promptCase.expected,
+            trajectory,
+          })
+        }
+      } catch (error) {
+        const endTime = Date.now()
+        const message = error instanceof Error ? error.message : String(error)
+        result = {
+          id: promptCase.id,
+          input: promptCase.input,
+          output: '',
+          trajectory: [],
+          metadata: {
+            ...promptCase.metadata,
+            agent: agentCommand.join(' '),
+          },
+          timing: {
+            start: startTime,
+            end: endTime,
+          },
+          toolErrors: true,
+          errors: [message],
+        }
+      }
+      results.push(result)
+      // Write result immediately. Always append: the file was already truncated
+      // above when not in append mode, and overwriting on the first result
+      // would destroy existing content when `append` is true.
+      const formatted = JSON.stringify(result)
+      await writeOutput(formatted, resolvedOutputPath, true)
+      const statusIcon = result.toolErrors ? '!' : '✓'
+      logProgress(`  ${statusIcon} (${result.timing.end - result.timing.start}ms)`, progress)
+    }
+  } finally {
+    logProgress('Disconnecting...', progress)
+    await client.disconnect()
+  }
+  logProgress('Done!', progress)
+  return results
+}
+// ============================================================================
+// CLI Entry Point
+// ============================================================================
+/**
+ * Capture command CLI handler.
+ *
+ * @remarks
+ * Parses the arguments, prints help for `--help`, and otherwise delegates to
+ * `runCapture`. Invalid invocations (missing prompts path or agent command,
+ * a grader that fails to load, a non-positive or non-numeric `--timeout`, or
+ * malformed `--mcp-server` JSON) print an error to stderr and exit with code 1.
+ *
+ * @param args - Command line arguments (after 'capture')
+ */
+export const capture = async (args: string[]): Promise<void> => {
+  const { values, positionals } = parseArgs({
+    args,
+    options: {
+      output: { type: 'string', short: 'o' },
+      cwd: { type: 'string', short: 'c' },
+      timeout: { type: 'string', short: 't', default: String(DEFAULT_HARNESS_TIMEOUT) },
+      progress: { type: 'boolean', default: false },
+      append: { type: 'boolean', default: false },
+      'mcp-server': { type: 'string', multiple: true },
+      grader: { type: 'string', short: 'g' },
+      help: { type: 'boolean', short: 'h' },
+    },
+    allowPositionals: true,
+  })
+  if (values.help) {
+    // biome-ignore lint/suspicious/noConsole: CLI help output
+    console.log(`
+Usage: acp-harness capture <prompts.jsonl> <command> [args...] [options]
+Arguments:
+  prompts.jsonl     Input file with evaluation prompts
+  command [args]    ACP agent command to execute
+Options:
+  -o, --output      Output file (default: stdout)
+  -c, --cwd         Working directory for agent
+  -t, --timeout     Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
+  --progress        Show progress to stderr
+  --append          Append to output file instead of overwriting
+  --mcp-server      MCP server config JSON (repeatable)
+  -g, --grader      Path to grader (.ts/.js module or executable script)
+  -h, --help        Show this help message
+Output Format:
+  Full trajectory JSONL with toolErrors indicator.
+  Use 'acp-harness summarize' to derive compact views.
+Graders:
+  TS/JS modules must export a 'grade' function.
+  Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
+Examples:
+  # Basic capture
+  acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
+  # With TypeScript grader
+  acp-harness capture prompts.jsonl bunx claude-code-acp --grader ./grader.ts -o results.jsonl
+  # With Python grader
+  acp-harness capture prompts.jsonl bunx claude-code-acp --grader ./grader.py -o results.jsonl
+`)
+    return
+  }
+  const promptsPath = positionals[0]
+  if (!promptsPath) {
+    console.error('Error: prompts.jsonl path is required')
+    process.exit(1)
+  }
+  const agentCommand = positionals.slice(1)
+  if (agentCommand.length === 0) {
+    console.error('Error: ACP agent command is required')
+    console.error('Example: acp-harness capture prompts.jsonl bunx claude-code-acp')
+    process.exit(1)
+  }
+  // Validate the timeout up front so a typo does not reach the client as NaN
+  const timeout = Number.parseInt(values.timeout ?? String(DEFAULT_HARNESS_TIMEOUT), 10)
+  if (!Number.isFinite(timeout) || timeout <= 0) {
+    console.error(`Error: --timeout must be a positive integer in ms, got '${values.timeout}'`)
+    process.exit(1)
+  }
+  // Load grader if specified
+  let grader: Grader | undefined
+  if (values.grader) {
+    try {
+      grader = await loadGrader(values.grader)
+    } catch (error) {
+      console.error(`Error: ${error instanceof Error ? error.message : error}`)
+      process.exit(1)
+    }
+  }
+  // Parse MCP server configurations, reporting malformed JSON as a CLI error
+  const mcpServers: unknown[] = []
+  for (const json of values['mcp-server'] ?? []) {
+    try {
+      mcpServers.push(JSON.parse(json))
+    } catch (error) {
+      console.error(`Error: invalid --mcp-server JSON: ${error instanceof Error ? error.message : error}`)
+      process.exit(1)
+    }
+  }
+  await runCapture({
+    promptsPath,
+    agentCommand,
+    outputPath: values.output,
+    cwd: values.cwd,
+    timeout,
+    progress: values.progress ?? false,
+    append: values.append ?? false,
+    mcpServers,
+    grader,
+  })
+}

package/src/constants.ts ADDED Viewed

@@ -0,0 +1,94 @@
+/**
+ * Constants for ACP client and harness operations.
+ *
+ * @remarks
+ * Contains all constant values used across the implementation:
+ * - ACP protocol method names and version
+ * - JSON-RPC error codes
+ * - Harness defaults (timeouts, preview limits)
+ *
+ * @packageDocumentation
+ */
+// ============================================================================
+// ACP Protocol Methods
+// ============================================================================
+/**
+ * Method-name strings for the ACP protocol, keyed by a readable constant.
+ *
+ * @remarks
+ * Declared `as const`, so every value keeps its string-literal type and the
+ * object is read-only at the type level.
+ */
+export const ACP_METHODS = {
+  // Lifecycle: initialize / shutdown
+  INITIALIZE: 'initialize',
+  SHUTDOWN: 'shutdown',
+  // Sessions: everything under the `session/` prefix
+  CREATE_SESSION: 'session/new',
+  LOAD_SESSION: 'session/load',
+  PROMPT: 'session/prompt',
+  CANCEL: 'session/cancel',
+  UPDATE: 'session/update',
+  REQUEST_PERMISSION: 'session/request_permission',
+  SET_MODEL: 'session/set_model',
+  // Protocol-level: request cancellation, not tied to a session
+  CANCEL_REQUEST: '$/cancel_request',
+} as const
+// ============================================================================
+// ACP Protocol Version
+// ============================================================================
+/** Current protocol version; a numeric literal (`1`) because the SDK uses a number type for it */
+export const ACP_PROTOCOL_VERSION = 1 as const
+// ============================================================================
+// JSON-RPC Error Codes
+// ============================================================================
+/**
+ * JSON-RPC error codes, keyed by name.
+ *
+ * @remarks
+ * Declared `as const` so each code keeps its numeric-literal type.
+ * NOTE(review): `REQUEST_CANCELLED` (-32800) lies outside the -327xx/-326xx
+ * group of the other entries; presumably a protocol-specific extension for
+ * cancelled requests; confirm against the protocol docs.
+ */
+export const JSON_RPC_ERRORS = {
+  PARSE_ERROR: -32700,
+  INVALID_REQUEST: -32600,
+  METHOD_NOT_FOUND: -32601,
+  INVALID_PARAMS: -32602,
+  INTERNAL_ERROR: -32603,
+  REQUEST_CANCELLED: -32800,
+} as const
+// ============================================================================
+// ACP Client Defaults
+// ============================================================================
+/** Default name the ACP client identifies itself with */
+export const DEFAULT_ACP_CLIENT_NAME = 'plaited-acp-client'
+/** Default timeout for ACP operations in milliseconds (30 seconds) */
+export const DEFAULT_ACP_TIMEOUT = 30000
+/** Default polling interval for streaming updates in milliseconds */
+export const DEFAULT_POLLING_INTERVAL = 50
+// ============================================================================
+// Harness Preview Configuration
+// ============================================================================
+/** Number of lines kept from the start of content when building a head/tail preview */
+export const HEAD_LINES = 8
+/** Number of lines kept from the end of content when building a head/tail preview */
+export const TAIL_LINES = 4
+/** Maximum content length before applying head/tail preview (unit presumably characters; confirm at call sites) */
+export const MAX_CONTENT_LENGTH = 500
+// ============================================================================
+// Harness Defaults
+// ============================================================================
+/** Default timeout for prompt evaluation in milliseconds (60 seconds) */
+export const DEFAULT_HARNESS_TIMEOUT = 60000
+/** Default number of trials (k) for pass@k analysis */
+export const DEFAULT_TRIAL_COUNT = 5
+/** Default number of samples drawn for calibration */
+export const DEFAULT_CALIBRATION_SAMPLE_SIZE = 10