npm - @plaited/acp-harness - Versions diffs - 0.2.6 → 0.3.2 - Mend

@plaited/acp-harness 0.2.6 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

package/LICENSE +1 -1
package/README.md +175 -34
package/bin/cli.ts +105 -636
package/bin/tests/cli.spec.ts +218 -51
package/package.json +21 -5
package/src/acp-client.ts +5 -4
package/src/acp-transport.ts +14 -7
package/src/adapter-check.ts +542 -0
package/src/adapter-scaffold.ts +934 -0
package/src/balance.ts +257 -0
package/src/calibrate.ts +319 -0
package/src/capture.ts +457 -0
package/src/constants.ts +94 -0
package/src/grader-loader.ts +174 -0
package/src/harness.ts +35 -0
package/src/schemas-cli.ts +239 -0
package/src/schemas.ts +567 -0
package/src/summarize.ts +259 -0
package/src/tests/adapter-check.spec.ts +70 -0
package/src/tests/adapter-scaffold.spec.ts +112 -0
package/src/tests/balance-helpers.spec.ts +279 -0
package/src/tests/calibrate-helpers.spec.ts +226 -0
package/src/tests/capture-helpers.spec.ts +553 -0
package/src/tests/fixtures/grader-bad-module.ts +5 -0
package/src/tests/fixtures/grader-exec-fail.py +9 -0
package/src/tests/fixtures/grader-exec-invalid.py +6 -0
package/src/tests/fixtures/grader-exec.py +29 -0
package/src/tests/fixtures/grader-module.ts +14 -0
package/src/tests/grader-loader.spec.ts +153 -0
package/src/tests/summarize-helpers.spec.ts +339 -0
package/src/tests/trials-calculations.spec.ts +209 -0
package/src/trials.ts +407 -0
package/src/validate-refs.ts +188 -0
package/.claude/rules/accuracy.md +0 -43
package/.claude/rules/bun-apis.md +0 -80
package/.claude/rules/code-review.md +0 -254
package/.claude/rules/git-workflow.md +0 -37
package/.claude/rules/github.md +0 -154
package/.claude/rules/testing.md +0 -172
package/.claude/skills/acp-harness/SKILL.md +0 -310
package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
package/.claude/skills/acp-harness/references/downstream.md +0 -288
package/.claude/skills/acp-harness/references/output-formats.md +0 -221
package/.claude-plugin/marketplace.json +0 -15
package/.claude-plugin/plugin.json +0 -16
package/.github/CODEOWNERS +0 -6
package/.github/workflows/ci.yml +0 -63
package/.github/workflows/publish.yml +0 -146
package/.mcp.json +0 -20
package/CLAUDE.md +0 -92
package/Dockerfile.test +0 -23
package/biome.json +0 -96
package/bun.lock +0 -513
package/docker-compose.test.yml +0 -21
package/scripts/bun-test-wrapper.sh +0 -46
package/src/acp.constants.ts +0 -56
package/src/acp.schemas.ts +0 -161
package/src/acp.types.ts +0 -28
package/src/tests/fixtures/.claude/settings.local.json +0 -8
package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
package/tsconfig.json +0 -32

package/src/trials.ts ADDED Viewed

@@ -0,0 +1,407 @@
+/**
+ * Multi-run trials command for pass@k/pass^k analysis.
+ *
+ * @remarks
+ * Runs each prompt k times to measure non-determinism.
+ * Without a grader, captures raw trials. With a grader, computes:
+ * - passRate: Simple pass rate (passes / k)
+ * - passAtK: Probability of at least one pass in k samples
+ * - passExpK: Probability of all k samples passing
+ *
+ * @packageDocumentation
+ */
+import { appendFile } from 'node:fs/promises'
+import { parseArgs } from 'node:util'
+import { createACPClient } from './acp-client.ts'
+import { createPrompt } from './acp-helpers.ts'
+import { extractOutput, extractTrajectory, loadPrompts } from './capture.ts'
+import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from './constants.ts'
+import { loadGrader } from './grader-loader.ts'
+import type { Grader, TrialEntry, TrialResult } from './schemas.ts'
+import { McpServerSchema } from './schemas.ts'
+// ============================================================================
+// Pass@k/Pass^k Calculation
+// ============================================================================
+/**
+ * Estimate pass@k: the chance that at least one of k samples passes.
+ *
+ * @remarks
+ * Computes the plug-in estimate `1 - (1 - passRate)^k`, where
+ * `passRate = passes / k` is the pass rate observed over the k trials that
+ * were run for a prompt.
+ *
+ * This is not the combinatorial estimator `1 - C(n-c, k) / C(n, k)`
+ * (n = total samples, c = correct samples): with n = k that expression
+ * evaluates to 1 for any c >= 1, because C(n-c, k) is then 0.
+ *
+ * Boundary cases are answered directly: `passes >= k` yields 1 and
+ * `passes === 0` yields 0.
+ *
+ * @param passes - Number of passing trials
+ * @param k - Total number of trials
+ * @returns Estimated probability of at least one pass
+ *
+ * @public
+ */
+export const calculatePassAtK = (passes: number, k: number): number => {
+  // Every trial passed (or more passes than trials were reported)
+  if (passes >= k) {
+    return 1
+  }
+  // Nothing passed
+  if (passes === 0) {
+    return 0
+  }
+  const failRate = 1 - passes / k
+  return 1 - failRate ** k
+}
+/**
+ * Estimate pass^k: the chance that all k samples pass.
+ *
+ * @remarks
+ * Raises the observed pass rate (`passes / k`) to the k-th power.
+ * `passes === k` yields 1 and `passes === 0` yields 0 without doing the
+ * exponentiation.
+ *
+ * @param passes - Number of passing trials
+ * @param k - Total number of trials
+ * @returns Estimated probability of all k samples passing
+ *
+ * @public
+ */
+export const calculatePassExpK = (passes: number, k: number): number => {
+  // Perfect run
+  if (passes === k) {
+    return 1
+  }
+  // No passing trial at all
+  if (passes === 0) {
+    return 0
+  }
+  return (passes / k) ** k
+}
+// ============================================================================
+// Types
+// ============================================================================
+/**
+ * Configuration for trials command.
+ *
+ * @remarks
+ * Consumed by `runTrials`; the CLI handler `trials` builds one of these from
+ * its parsed flags.
+ */
+export type TrialsConfig = {
+  /** Path to prompts.jsonl file (handed to `loadPrompts`) */
+  promptsPath: string
+  /** ACP agent command (executable followed by its arguments) */
+  agentCommand: string[]
+  /** Number of trials per prompt */
+  k: number
+  /** Output file path; a path not starting with `/` is resolved against `process.cwd()`. Results go to stdout when omitted */
+  outputPath?: string
+  /** Working directory for agent; session params fall back to `process.cwd()` when omitted */
+  cwd?: string
+  /** Timeout per prompt in milliseconds (defaults to `DEFAULT_HARNESS_TIMEOUT`) */
+  timeout?: number
+  /** Show progress to stderr (defaults to false) */
+  progress?: boolean
+  /** Append to output file; when false (the default) the file is emptied before the run starts */
+  append?: boolean
+  /** MCP server configurations; each entry is validated with `McpServerSchema.parse` */
+  mcpServers?: unknown[]
+  /** Optional grader function; when present, pass metrics are computed for every prompt */
+  grader?: Grader
+}
+// ============================================================================
+// Helpers
+// ============================================================================
+/**
+ * Turn a possibly-relative path into one anchored at `process.cwd()`.
+ * A path that already starts with `/` is returned untouched.
+ */
+const resolvePath = (path: string): string => (path.startsWith('/') ? path : `${process.cwd()}/${path}`)
+/**
+ * Emit one output line, newline-terminated.
+ *
+ * Without an output path the line is printed to stdout. With one, the line
+ * is appended to the file when `append` is truthy, otherwise it is written
+ * with `Bun.write`.
+ */
+const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
+  if (!outputPath) {
+    // biome-ignore lint/suspicious/noConsole: CLI stdout output
+    console.log(line)
+    return
+  }
+  const payload = `${line}\n`
+  if (append) {
+    await appendFile(outputPath, payload)
+    return
+  }
+  await Bun.write(outputPath, payload)
+}
+/** Print a progress message on stderr, but only when progress reporting is switched on */
+const logProgress = (message: string, showProgress: boolean): void => {
+  if (!showProgress) return
+  console.error(message)
+}
+// ============================================================================
+// Trials Implementation
+// ============================================================================
+/**
+ * Execute trials with configuration object.
+ *
+ * @remarks
+ * Connects one ACP client, then for every prompt creates a fresh session per
+ * trial and runs the prompt `k` times. A trial whose prompt or grader call
+ * throws is recorded as a failed entry (`pass: false`, reasoning prefixed
+ * with `Error:`) instead of aborting the run. Each prompt's result is written
+ * as one JSON line as soon as its trials finish, and the client is
+ * disconnected in a `finally` block.
+ *
+ * When an output path is given and `append` is false, the file is emptied
+ * once before the run. Every result is then appended, so with
+ * `append: true` existing file content is preserved.
+ *
+ * @param config - Trials configuration
+ * @returns Array of trial results
+ */
+export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
+  const {
+    promptsPath,
+    agentCommand,
+    k,
+    outputPath,
+    cwd,
+    timeout = DEFAULT_HARNESS_TIMEOUT,
+    progress = false,
+    append = false,
+    mcpServers = [],
+    grader,
+  } = config
+  // Parse MCP server configurations
+  const parsedMcpServers = mcpServers.map((s) => McpServerSchema.parse(s))
+  // Load prompts
+  const prompts = await loadPrompts(promptsPath)
+  // Resolve output path
+  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
+  // Log progress info
+  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
+  logProgress(`Running ${k} trials per prompt`, progress)
+  logProgress(`Command: ${agentCommand.join(' ')}`, progress)
+  if (grader) {
+    logProgress('Grader: enabled (will compute pass@k metrics)', progress)
+  }
+  // Create ACP client
+  const client = createACPClient({
+    command: agentCommand,
+    cwd,
+    timeout,
+  })
+  // Clear output file if not appending
+  if (resolvedOutputPath && !append) {
+    await Bun.write(resolvedOutputPath, '')
+  }
+  // Session params
+  const sessionParams = {
+    cwd: cwd ?? process.cwd(),
+    mcpServers: parsedMcpServers,
+  }
+  const results: TrialResult[] = []
+  try {
+    logProgress('Connecting to agent...', progress)
+    await client.connect()
+    logProgress('Connected!', progress)
+    // Run evaluations
+    for (let i = 0; i < prompts.length; i++) {
+      const promptCase = prompts[i]
+      if (!promptCase) continue
+      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
+      const trialEntries: TrialEntry[] = []
+      for (let trialNum = 1; trialNum <= k; trialNum++) {
+        // Create fresh session for each trial
+        const session = await client.createSession(sessionParams)
+        const startTime = Date.now()
+        try {
+          const prompt = createPrompt(promptCase.input)
+          const { updates } = await client.promptSync(session.id, prompt)
+          const endTime = Date.now()
+          const trajectory = extractTrajectory(updates, startTime)
+          const output = extractOutput(trajectory)
+          const entry: TrialEntry = {
+            trialNum,
+            output,
+            trajectory,
+            duration: endTime - startTime,
+          }
+          // Apply grader if provided
+          if (grader) {
+            const graderResult = await grader({
+              input: promptCase.input,
+              output,
+              expected: promptCase.expected,
+              trajectory,
+            })
+            entry.pass = graderResult.pass
+            entry.score = graderResult.score
+            entry.reasoning = graderResult.reasoning
+          }
+          trialEntries.push(entry)
+          logProgress(
+            `    Trial ${trialNum}/${k}: ${entry.pass !== undefined ? (entry.pass ? '✓' : '✗') : '?'}`,
+            progress,
+          )
+        } catch (error) {
+          // A failing trial is recorded as a non-passing entry so the remaining trials still run
+          const endTime = Date.now()
+          const message = error instanceof Error ? error.message : String(error)
+          trialEntries.push({
+            trialNum,
+            output: '',
+            trajectory: [],
+            duration: endTime - startTime,
+            pass: false,
+            reasoning: `Error: ${message}`,
+          })
+          logProgress(`    Trial ${trialNum}/${k}: ! (error)`, progress)
+        }
+      }
+      // Build result
+      const result: TrialResult = {
+        id: promptCase.id,
+        input: promptCase.input,
+        ...(promptCase.expected && { expected: promptCase.expected }),
+        k,
+        trials: trialEntries,
+      }
+      // Calculate metrics if grader was used
+      if (grader) {
+        const passes = trialEntries.filter((t) => t.pass).length
+        result.passRate = passes / k
+        result.passAtK = calculatePassAtK(passes, k)
+        result.passExpK = calculatePassExpK(passes, k)
+      }
+      results.push(result)
+      // Write result immediately. The file was already emptied above when not
+      // appending, so every write appends; overwriting on the first result
+      // would discard existing content when `append` is requested.
+      const formatted = JSON.stringify(result)
+      await writeOutput(formatted, resolvedOutputPath, true)
+      if (grader) {
+        logProgress(
+          `  → passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
+          progress,
+        )
+      }
+    }
+  } finally {
+    logProgress('Disconnecting...', progress)
+    await client.disconnect()
+  }
+  logProgress('Done!', progress)
+  return results
+}
+// ============================================================================
+// CLI Entry Point
+// ============================================================================
+/**
+ * Trials command CLI handler.
+ *
+ * @remarks
+ * Parses flags, prints the usage text for `--help`, and otherwise forwards
+ * the parsed options to `runTrials`. Usage errors (a missing positional, a
+ * grader that fails to load, a non-numeric or non-positive `-k`, a
+ * non-numeric or negative `--timeout`, or malformed `--mcp-server` JSON) are
+ * reported on stderr and end the process with exit code 1.
+ *
+ * @param args - Command line arguments (after 'trials')
+ */
+export const trials = async (args: string[]): Promise<void> => {
+  const { values, positionals } = parseArgs({
+    args,
+    options: {
+      output: { type: 'string', short: 'o' },
+      k: { type: 'string', short: 'k', default: String(DEFAULT_TRIAL_COUNT) },
+      cwd: { type: 'string', short: 'c' },
+      timeout: { type: 'string', short: 't', default: String(DEFAULT_HARNESS_TIMEOUT) },
+      progress: { type: 'boolean', default: false },
+      append: { type: 'boolean', default: false },
+      'mcp-server': { type: 'string', multiple: true },
+      grader: { type: 'string', short: 'g' },
+      help: { type: 'boolean', short: 'h' },
+    },
+    allowPositionals: true,
+  })
+  if (values.help) {
+    // biome-ignore lint/suspicious/noConsole: CLI help output
+    console.log(`
+Usage: acp-harness trials <prompts.jsonl> <command> [args...] [options]
+Arguments:
+  prompts.jsonl     Input file with evaluation prompts
+  command [args]    ACP agent command to execute
+Options:
+  -o, --output      Output file (default: stdout)
+  -k                Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
+  -c, --cwd         Working directory for agent
+  -t, --timeout     Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
+  --progress        Show progress to stderr
+  --append          Append to output file
+  --mcp-server      MCP server config JSON (repeatable)
+  -g, --grader      Path to grader (.ts/.js module or executable script)
+  -h, --help        Show this help message
+Output Format:
+  Without grader: Raw trials with trajectories
+  With grader: Trials plus pass@k metrics (passRate, passAtK, passExpK)
+Graders:
+  TS/JS modules must export a 'grade' function.
+  Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
+Examples:
+  # Capture only
+  acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 -o trials.jsonl
+  # With TypeScript grader
+  acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 --grader ./grader.ts -o trials.jsonl
+  # With Python grader
+  acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 --grader ./grader.py -o trials.jsonl
+`)
+    return
+  }
+  const promptsPath = positionals[0]
+  if (!promptsPath) {
+    console.error('Error: prompts.jsonl path is required')
+    process.exit(1)
+  }
+  const agentCommand = positionals.slice(1)
+  if (agentCommand.length === 0) {
+    console.error('Error: ACP agent command is required')
+    process.exit(1)
+  }
+  // Validate numeric options up front: an unparsable -k would otherwise run
+  // zero trials and produce NaN metrics.
+  const k = Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10)
+  if (Number.isNaN(k) || k < 1) {
+    console.error(`Error: -k must be a positive integer (received: ${values.k})`)
+    process.exit(1)
+  }
+  const timeout = Number.parseInt(values.timeout ?? String(DEFAULT_HARNESS_TIMEOUT), 10)
+  if (Number.isNaN(timeout) || timeout < 0) {
+    console.error(`Error: --timeout must be a non-negative number of milliseconds (received: ${values.timeout})`)
+    process.exit(1)
+  }
+  // Load grader if specified
+  let grader: Grader | undefined
+  if (values.grader) {
+    try {
+      grader = await loadGrader(values.grader)
+    } catch (error) {
+      console.error(`Error: ${error instanceof Error ? error.message : error}`)
+      process.exit(1)
+    }
+  }
+  // Parse MCP server configurations; malformed JSON is a usage error
+  let mcpServers: unknown[]
+  try {
+    mcpServers = (values['mcp-server'] ?? []).map((json): unknown => JSON.parse(json))
+  } catch (error) {
+    console.error(`Error: invalid --mcp-server JSON: ${error instanceof Error ? error.message : error}`)
+    process.exit(1)
+  }
+  await runTrials({
+    promptsPath,
+    agentCommand,
+    k,
+    outputPath: values.output,
+    cwd: values.cwd,
+    timeout,
+    progress: values.progress ?? false,
+    append: values.append ?? false,
+    mcpServers,
+    grader,
+  })
+}

package/src/validate-refs.ts ADDED Viewed

@@ -0,0 +1,188 @@
+/**
+ * Validate-refs command - check reference solutions against grader.
+ *
+ * @remarks
+ * Validates that reference solutions in prompts.jsonl pass the grader.
+ * Helps identify prompts with broken or incorrect reference solutions.
+ *
+ * @packageDocumentation
+ */
+import { parseArgs } from 'node:util'
+import { loadPrompts } from './capture.ts'
+import { loadGrader } from './grader-loader.ts'
+import type { Grader, ValidationResult } from './schemas.ts'
+// ============================================================================
+// Types
+// ============================================================================
+/**
+ * Configuration for validate-refs command.
+ *
+ * @remarks
+ * Consumed by `runValidateRefs`; the CLI handler `validateRefs` builds one of
+ * these from its parsed flags.
+ */
+export type ValidateRefsConfig = {
+  /** Path to prompts.jsonl file (handed to `loadPrompts`) */
+  promptsPath: string
+  /** Output file path; a path not starting with `/` is resolved against `process.cwd()`. Results go to stdout when omitted */
+  outputPath?: string
+  /** Grader function applied to each prompt's reference solution */
+  grader: Grader
+}
+// ============================================================================
+// Helpers
+// ============================================================================
+/** Anchor a path at `process.cwd()` unless it already begins with `/` */
+const resolvePath = (path: string): string => {
+  const alreadyRooted = path.startsWith('/')
+  return alreadyRooted ? path : `${process.cwd()}/${path}`
+}
+// ============================================================================
+// Validate-Refs Implementation
+// ============================================================================
+/**
+ * Execute validate-refs with configuration object.
+ *
+ * @remarks
+ * Loads the prompts, keeps those that carry a `reference`, and feeds each
+ * reference to the grader as if it were the agent output (with an empty
+ * trajectory). Results are emitted as newline-joined JSON, either to the
+ * output file or to stdout. Progress, the summary and the list of failing
+ * references go to stderr. When no prompt has a reference, nothing is
+ * written and an empty array is returned.
+ *
+ * @param config - Validate-refs configuration
+ * @returns Array of validation results
+ */
+export const runValidateRefs = async (config: ValidateRefsConfig): Promise<ValidationResult[]> => {
+  const { promptsPath, outputPath, grader } = config
+  const allPrompts = await loadPrompts(promptsPath)
+  // Only prompts that ship a reference solution can be validated
+  const candidates = allPrompts.filter((entry) => entry.reference !== undefined)
+  if (candidates.length === 0) {
+    console.error('No prompts with reference solutions found')
+    return []
+  }
+  console.error(`Validating ${candidates.length} reference solutions...`)
+  const results: ValidationResult[] = []
+  for (const candidate of candidates) {
+    const reference = candidate.reference as string
+    const graderResult = await grader({
+      input: candidate.input,
+      output: reference,
+      expected: candidate.expected,
+      // A reference solution has no recorded trajectory
+      trajectory: [],
+    })
+    results.push({ id: candidate.id, reference, passes: graderResult.pass, graderResult })
+    console.error(`  ${graderResult.pass ? '✓' : '✗'} ${candidate.id}`)
+  }
+  // One JSON document per line
+  const serialized = results.map((entry) => JSON.stringify(entry)).join('\n')
+  if (outputPath) {
+    await Bun.write(resolvePath(outputPath), serialized)
+  } else {
+    // biome-ignore lint/suspicious/noConsole: CLI stdout output
+    console.log(serialized)
+  }
+  // Summary on stderr
+  const failures = results.filter((entry) => !entry.passes)
+  const passedCount = results.length - failures.length
+  console.error(`\nResults: ${passedCount} passed, ${failures.length} failed`)
+  if (failures.length > 0) {
+    console.error('\nFailing references:')
+    for (const failure of failures) {
+      console.error(`  - ${failure.id}: ${failure.graderResult.reasoning ?? 'No reasoning'}`)
+    }
+  }
+  return results
+}
+// ============================================================================
+// CLI Entry Point
+// ============================================================================
+/**
+ * Validate-refs command CLI handler.
+ *
+ * @remarks
+ * Prints the usage text for `--help`. Otherwise requires a prompts path and
+ * a `--grader`, loads the grader and delegates to `runValidateRefs`. A
+ * missing argument or a grader that fails to load is reported on stderr and
+ * ends the process with exit code 1.
+ *
+ * @param args - Command line arguments (after 'validate-refs')
+ */
+export const validateRefs = async (args: string[]): Promise<void> => {
+  const parsed = parseArgs({
+    args,
+    options: {
+      output: { type: 'string', short: 'o' },
+      grader: { type: 'string', short: 'g' },
+      help: { type: 'boolean', short: 'h' },
+    },
+    allowPositionals: true,
+  })
+  const { help, output, grader: graderPath } = parsed.values
+  if (help) {
+    // biome-ignore lint/suspicious/noConsole: CLI help output
+    console.log(`
+Usage: acp-harness validate-refs <prompts.jsonl> --grader <grader.ts> [options]
+Arguments:
+  prompts.jsonl     Input file with prompts (must have 'reference' field)
+Options:
+  -o, --output      Output file (default: stdout)
+  -g, --grader      Path to grader (.ts/.js module or executable script, required)
+  -h, --help        Show this help message
+Output:
+  JSONL with validation results for each reference solution.
+Prompt Format:
+  {
+    "id": "test-001",
+    "input": "What is 2+2?",
+    "expected": "4",
+    "reference": "The answer is 4."
+  }
+Examples:
+  acp-harness validate-refs prompts.jsonl --grader ./grader.ts -o validation.jsonl
+`)
+    return
+  }
+  const promptsPath = parsed.positionals[0]
+  if (!promptsPath) {
+    console.error('Error: prompts.jsonl path is required')
+    process.exit(1)
+  }
+  if (!graderPath) {
+    console.error('Error: --grader is required for validate-refs')
+    process.exit(1)
+  }
+  // Resolve the grader before doing any work; a load failure is fatal
+  let grader: Grader
+  try {
+    grader = await loadGrader(graderPath)
+  } catch (error) {
+    const detail = error instanceof Error ? error.message : error
+    console.error(`Error: ${detail}`)
+    process.exit(1)
+  }
+  await runValidateRefs({ promptsPath, outputPath: output, grader })
+}

package/.claude/rules/accuracy.md DELETED Viewed

@@ -1,43 +0,0 @@
-# Accuracy and Confidence Standards
-**Confidence Threshold**: 95% - Report uncertainty rather than guess
-## Verification Protocol
-1. **Verification First**: Before stating any specific implementation detail (function signature, file path, API schema), use the `typescript-lsp` skill to verify types and signatures, then read the relevant file in real-time to verify accuracy.
-2. **Handling Uncertainty**: If you cannot verify information or find contradictions between instructions and live code, you must NOT provide speculative answers.
-   - **Action**: Clearly state you cannot answer with high confidence and explain the discrepancy.
-   - Example: "I cannot confirm [detail] because my instructions indicate [X], but the current file shows [Y]. My knowledge may be outdated."
-3. **Dynamic Exploration**:
-   - **PREFER typescript-lsp over Grep/Glob** for `.ts`, `.tsx`, `.js`, `.jsx` files
-   - Use `lsp-find` to search for symbols, types, and patterns across the workspace
-   - Use `lsp-references` to find all usages of a symbol
-   - Use `lsp-hover` to verify type signatures
-   - Only fall back to Grep/Glob for non-TypeScript files or when LSP is unavailable
-   - Use Read for other file types. Always prioritize live code over instructions.
-4. **Tool-Assisted Verification**: Use these skills to enhance verification accuracy:
-   - **`typescript-lsp` skill**: Use `lsp-hover` to verify type signatures, `lsp-references` to find all usages before modifying, `lsp-symbols` for file structure, and `lsp-find` to search for patterns across the workspace.
-   - **WebFetch**: Retrieve current documentation from authoritative sources (MDN Web Docs, WHATWG specs) when using web platform APIs.
-   - These skills complement (but do not replace) reading live code - always verify outputs against actual implementation.
-## Certainty Requirements
-You may only propose a specific change if you are **at least 95% certain** it is correct, based on direct comparison with current code.
-**When uncertain:**
-- Report the discrepancy clearly
-- State why you cannot confidently recommend a fix
-- Present the issue to the user for manual resolution
-- DO NOT invent solutions or infer changes
-## For Agent-Specific Applications
-Agents should apply these standards to their specific domain:
-- **Documentation agents**: Only update TSDoc if parameter names/types match current code
-- **Architecture agents**: Verify referenced patterns exist in current codebase
-- **Code review agents**: Read files before commenting on implementation details
-- **Pattern agents**: Confirm examples reflect actual usage in codebase