npm - @plaited/acp-harness - Versions diffs - 0.3.2 → 0.4.0 - Mend

@plaited/acp-harness 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

package/README.md +53 -31
package/bin/cli.ts +15 -0
package/package.json +5 -7
package/src/acp-client.ts +7 -4
package/src/adapter-check.ts +0 -1
package/src/adapter-scaffold.ts +16 -15
package/src/calibrate.ts +28 -8
package/src/capture.ts +114 -33
package/src/grader-loader.ts +3 -3
package/src/harness.ts +4 -0
package/src/headless-cli.ts +433 -0
package/src/headless-history-builder.ts +141 -0
package/src/headless-output-parser.ts +251 -0
package/src/headless-session-manager.ts +389 -0
package/src/headless.schemas.ts +241 -0
package/src/headless.ts +71 -0
package/src/headless.types.ts +19 -0
package/src/integration_tests/acp-claude.spec.ts +170 -0
package/src/integration_tests/acp-gemini.spec.ts +174 -0
package/src/schemas.ts +88 -36
package/src/summarize.ts +4 -8
package/src/tests/acp-client.spec.ts +1 -1
package/src/tests/capture-cli.spec.ts +188 -0
package/src/tests/capture-helpers.spec.ts +229 -67
package/src/tests/constants.spec.ts +121 -0
package/src/tests/fixtures/grader-exec.py +3 -3
package/src/tests/fixtures/grader-module.ts +2 -2
package/src/tests/grader-loader.spec.ts +5 -5
package/src/tests/headless.spec.ts +460 -0
package/src/tests/schemas-cli.spec.ts +142 -0
package/src/tests/schemas.spec.ts +657 -0
package/src/tests/summarize-helpers.spec.ts +3 -3
package/src/tests/trials-cli.spec.ts +145 -0
package/src/trials.ts +6 -19
package/src/validate-refs.ts +1 -1
package/src/tests/acp-integration.docker.ts +0 -214

package/src/tests/trials-cli.spec.ts ADDED

@@ -0,0 +1,145 @@
+import { describe, expect, test } from 'bun:test'
+import type { TrialsConfig } from '../trials.ts'
+// ============================================================================
+// TrialsConfig type
+// ============================================================================
+describe('TrialsConfig configuration', () => {
+  test('TrialsConfig type accepts valid configuration', () => {
+    const config: TrialsConfig = {
+      promptsPath: '/tmp/prompts.jsonl',
+      agentCommand: ['bunx', 'test-agent'],
+      k: 5,
+      outputPath: '/tmp/output.jsonl',
+      cwd: '/tmp',
+      timeout: 30000,
+      progress: true,
+      append: false,
+    }
+    expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
+    expect(config.agentCommand).toEqual(['bunx', 'test-agent'])
+    expect(config.k).toBe(5)
+  })
+  test('TrialsConfig allows minimal configuration', () => {
+    const config: TrialsConfig = {
+      promptsPath: '/tmp/prompts.jsonl',
+      agentCommand: ['echo', 'test'],
+      k: 3,
+    }
+    expect(config.outputPath).toBeUndefined()
+    expect(config.cwd).toBeUndefined()
+    expect(config.timeout).toBeUndefined()
+    expect(config.progress).toBeUndefined()
+    expect(config.append).toBeUndefined()
+    expect(config.grader).toBeUndefined()
+  })
+})
+// ============================================================================
+// CLI Help Output
+// ============================================================================
+describe('trials CLI', () => {
+  test('displays help with --help flag', async () => {
+    const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials', '--help'], {
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    const stdout = await new Response(proc.stdout).text()
+    await proc.exited
+    expect(stdout).toContain('Usage: acp-harness trials')
+    expect(stdout).toContain('prompts.jsonl')
+    expect(stdout).toContain('-o, --output')
+    expect(stdout).toContain('-k')
+    expect(stdout).toContain('-c, --cwd')
+    expect(stdout).toContain('-t, --timeout')
+    expect(stdout).toContain('--progress')
+    expect(stdout).toContain('-g, --grader')
+    expect(stdout).toContain('pass@k')
+  })
+  test('shows error for missing prompts file argument', async () => {
+    const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials'], {
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('prompts.jsonl path is required')
+  })
+  test('shows error for missing agent command', async () => {
+    const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl'], {
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('ACP agent command is required')
+  })
+})
+// ============================================================================
+// Schemas CLI
+// ============================================================================
+describe('schemas CLI', () => {
+  test('displays help with --help flag', async () => {
+    const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', '--help'], {
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    const stdout = await new Response(proc.stdout).text()
+    await proc.exited
+    expect(stdout).toContain('Usage: acp-harness schemas')
+    expect(stdout).toContain('-o, --output')
+    expect(stdout).toContain('-j, --json')
+    expect(stdout).toContain('-s, --split')
+    expect(stdout).toContain('-l, --list')
+    expect(stdout).toContain('Available Schemas')
+  })
+  test('lists schemas with --list flag', async () => {
+    const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', '--list'], {
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    const stdout = await new Response(proc.stdout).text()
+    await proc.exited
+    expect(stdout).toContain('Available schemas')
+    expect(stdout).toContain('PromptCase')
+    expect(stdout).toContain('CaptureResult')
+    expect(stdout).toContain('GraderResult')
+  })
+  test('exports schema as JSON', async () => {
+    const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', 'PromptCase', '--json'], {
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    const stdout = await new Response(proc.stdout).text()
+    await proc.exited
+    const schema = JSON.parse(stdout)
+    expect(schema.$schema).toBe('https://json-schema.org/draft/2020-12/schema')
+    expect(schema.title).toBe('PromptCase')
+    expect(schema.type).toBe('object')
+  })
+})

package/src/trials.ts CHANGED

@@ -19,7 +19,6 @@ import { extractOutput, extractTrajectory, loadPrompts } from './capture.ts'
 import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from './constants.ts'
 import { loadGrader } from './grader-loader.ts'
 import type { Grader, TrialEntry, TrialResult } from './schemas.ts'
-import { McpServerSchema } from './schemas.ts'
 // ============================================================================
 // Pass@k/Pass^k Calculation
@@ -92,8 +91,6 @@ export type TrialsConfig = {
   progress?: boolean
   /** Append to output file */
   append?: boolean
-  /** MCP server configurations */
-  mcpServers?: unknown[]
   /** Optional grader function */
   grader?: Grader
 }
@@ -149,13 +146,9 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
     timeout = DEFAULT_HARNESS_TIMEOUT,
     progress = false,
     append = false,
-    mcpServers = [],
     grader,
   } = config
-  // Parse MCP server configurations
-  const parsedMcpServers = mcpServers.map((s) => McpServerSchema.parse(s))
   // Load prompts
   const prompts = await loadPrompts(promptsPath)
@@ -182,10 +175,9 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
     await Bun.write(resolvedOutputPath, '')
   }
-  // Session params
+  // Session params - agents auto-discover MCP configs from cwd
   const sessionParams = {
     cwd: cwd ?? process.cwd(),
-    mcpServers: parsedMcpServers,
   }
   const results: TrialResult[] = []
@@ -211,7 +203,8 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
         const startTime = Date.now()
         try {
-          const prompt = createPrompt(promptCase.input)
+          const inputText = Array.isArray(promptCase.input) ? promptCase.input.join('\n') : promptCase.input
+          const prompt = createPrompt(inputText)
           const { updates } = await client.promptSync(session.id, prompt)
           const endTime = Date.now()
@@ -230,7 +223,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
             const graderResult = await grader({
               input: promptCase.input,
               output,
-              expected: promptCase.expected,
+              hint: promptCase.hint,
               trajectory,
             })
             entry.pass = graderResult.pass
@@ -263,7 +256,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
       const result: TrialResult = {
         id: promptCase.id,
         input: promptCase.input,
-        ...(promptCase.expected && { expected: promptCase.expected }),
+        ...(promptCase.hint && { hint: promptCase.hint }),
         k,
         trials: trialEntries,
       }
@@ -318,7 +311,6 @@ export const trials = async (args: string[]): Promise<void> => {
       timeout: { type: 'string', short: 't', default: String(DEFAULT_HARNESS_TIMEOUT) },
       progress: { type: 'boolean', default: false },
       append: { type: 'boolean', default: false },
-      'mcp-server': { type: 'string', multiple: true },
       grader: { type: 'string', short: 'g' },
       help: { type: 'boolean', short: 'h' },
     },
@@ -337,11 +329,10 @@ Arguments:
 Options:
   -o, --output      Output file (default: stdout)
   -k                Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
-  -c, --cwd         Working directory for agent
+  -c, --cwd         Working directory for agent (agents auto-discover MCP configs from here)
   -t, --timeout     Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
   --progress        Show progress to stderr
   --append          Append to output file
-  --mcp-server      MCP server config JSON (repeatable)
   -g, --grader      Path to grader (.ts/.js module or executable script)
   -h, --help        Show this help message
@@ -389,9 +380,6 @@ Examples:
     }
   }
-  // Parse MCP server configurations
-  const mcpServers = (values['mcp-server'] ?? []).map((json) => JSON.parse(json))
   await runTrials({
     promptsPath,
     agentCommand,
@@ -401,7 +389,6 @@ Examples:
     timeout: Number.parseInt(values.timeout ?? String(DEFAULT_HARNESS_TIMEOUT), 10),
     progress: values.progress ?? false,
     append: values.append ?? false,
-    mcpServers,
     grader,
   })
 }

package/src/validate-refs.ts CHANGED

@@ -69,7 +69,7 @@ export const runValidateRefs = async (config: ValidateRefsConfig): Promise<Valid
     const graderResult = await grader({
       input: prompt.input,
       output: prompt.reference as string,
-      expected: prompt.expected,
+      hint: prompt.hint,
       trajectory: [], // No trajectory for reference validation
     })

package/src/tests/acp-integration.docker.ts DELETED

@@ -1,214 +0,0 @@
-/**
- * ACP Client Integration Tests
- *
- * @remarks
- * These tests verify the ACP client works against real Claude Code
- * via the `claude-code-acp` adapter.
- *
- * **Run in Docker only** for consistent environment:
- * ```bash
- * ANTHROPIC_API_KEY=sk-... bun run test:acp
- * ```
- *
- * Prerequisites:
- * 1. Docker installed
- * 2. API key: `ANTHROPIC_API_KEY` environment variable
- *
- * These tests make real API calls and consume credits.
- */
-import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test'
-import { type ACPClient, createACPClient } from '../acp-client.ts'
-import { createPrompt, summarizeResponse } from '../acp-helpers.ts'
-// Long timeout for real agent interactions (2 minutes)
-setDefaultTimeout(120000)
-// Fixtures directory with .claude/skills and .mcp.json
-const FIXTURES_DIR = `${import.meta.dir}/fixtures`
-// Use haiku for all tests to reduce costs
-const TEST_MODEL = 'claude-haiku-4-5-20251001'
-describe('ACP Client Integration', () => {
-  let client: ACPClient
-  beforeAll(async () => {
-    // cc-acp adapter expects ANTHROPIC_API_KEY
-    client = createACPClient({
-      command: ['bunx', 'claude-code-acp'],
-      timeout: 120000, // 2 min timeout for initialization
-      env: {
-        ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY ?? '',
-      },
-    })
-    await client.connect()
-  })
-  afterAll(async () => {
-    await client?.disconnect()
-  })
-  test('connects and initializes', () => {
-    expect(client.isConnected()).toBe(true)
-    const initResult = client.getInitializeResult()
-    expect(initResult).toBeDefined()
-    expect(initResult?.protocolVersion).toBeDefined()
-  })
-  test('reports agent capabilities', () => {
-    const capabilities = client.getCapabilities()
-    expect(capabilities).toBeDefined()
-  })
-  test('creates session', async () => {
-    const session = await client.createSession({
-      cwd: FIXTURES_DIR,
-      mcpServers: [],
-    })
-    expect(session).toBeDefined()
-    expect(session.id).toBeDefined()
-    expect(typeof session.id).toBe('string')
-  })
-  test('sends prompt and receives response', async () => {
-    const session = await client.createSession({
-      cwd: FIXTURES_DIR,
-      mcpServers: [],
-    })
-    // Use haiku for faster/cheaper test runs
-    await client.setModel(session.id, TEST_MODEL)
-    // Simple prompt that doesn't require tools
-    const { result, updates } = await client.promptSync(
-      session.id,
-      createPrompt('What is 2 + 2? Reply with just the number.'),
-    )
-    expect(result).toBeDefined()
-    expect(updates).toBeInstanceOf(Array)
-    // Summarize and verify response structure
-    const summary = summarizeResponse(updates)
-    expect(summary.text).toBeDefined()
-    expect(summary.text.length).toBeGreaterThan(0)
-  })
-  test('streaming prompt yields updates', async () => {
-    const session = await client.createSession({
-      cwd: FIXTURES_DIR,
-      mcpServers: [],
-    })
-    // Use haiku for faster/cheaper test runs
-    await client.setModel(session.id, TEST_MODEL)
-    const events: string[] = []
-    for await (const event of client.prompt(session.id, createPrompt('Say "hello" and nothing else.'))) {
-      events.push(event.type)
-      if (event.type === 'complete') {
-        expect(event.result).toBeDefined()
-      }
-    }
-    expect(events).toContain('complete')
-  })
-  test('handles tool usage prompt', async () => {
-    const session = await client.createSession({
-      cwd: FIXTURES_DIR,
-      mcpServers: [],
-    })
-    // Use haiku for faster/cheaper test runs
-    await client.setModel(session.id, TEST_MODEL)
-    // Prompt that should trigger tool usage - reading a specific file
-    const { updates } = await client.promptSync(
-      session.id,
-      createPrompt('Use the Read tool to read calculator-mcp.ts and tell me what tools the MCP server provides.'),
-    )
-    const summary = summarizeResponse(updates)
-    // Verify response mentions calculator tools
-    expect(summary.text.length).toBeGreaterThan(0)
-    // Response should mention the calculator tools (add, subtract, etc.)
-    expect(summary.text.toLowerCase()).toMatch(/add|subtract|multiply|divide|calculator/)
-  })
-  test('uses skill from cwd', async () => {
-    const session = await client.createSession({
-      cwd: FIXTURES_DIR,
-      mcpServers: [],
-    })
-    // Use haiku for faster/cheaper test runs
-    await client.setModel(session.id, TEST_MODEL)
-    // Ask Claude to use the greeting skill
-    const { updates } = await client.promptSync(session.id, createPrompt('Please greet me using the greeting skill.'))
-    const summary = summarizeResponse(updates)
-    // The greeting skill instructs Claude to include specific phrases
-    expect(summary.text.length).toBeGreaterThan(0)
-    expect(summary.text.toLowerCase()).toMatch(/hello|greet|welcome/)
-  })
-  test('uses MCP server tools', async () => {
-    // Path to calculator MCP server fixture (must be absolute per ACP spec)
-    const calculatorPath = `${FIXTURES_DIR}/calculator-mcp.ts`
-    const bunPath = Bun.which('bun') ?? 'bun'
-    // Retry helper for flaky MCP server startup
-    const maxRetries = 3
-    let lastError: Error | undefined
-    for (let attempt = 1; attempt <= maxRetries; attempt++) {
-      const session = await client.createSession({
-        cwd: FIXTURES_DIR,
-        mcpServers: [
-          {
-            name: 'calculator',
-            command: bunPath,
-            args: [calculatorPath],
-            env: [],
-          },
-        ],
-      })
-      // Set model to haiku for faster/cheaper test runs
-      await client.setModel(session.id, TEST_MODEL)
-      // Ask Claude to use the calculator MCP server
-      const { updates } = await client.promptSync(
-        session.id,
-        createPrompt('Use the calculator MCP server add tool to compute 15 + 27. Reply with just the number.'),
-      )
-      const summary = summarizeResponse(updates)
-      // Check if we got 42 in the response
-      if (summary.text.match(/42/)) {
-        expect(summary.text.length).toBeGreaterThan(0)
-        expect(summary.text).toMatch(/42/)
-        return // Success!
-      }
-      // MCP server might not have been ready, retry
-      lastError = new Error(`Attempt ${attempt}: Response did not contain 42. Got: ${summary.text.slice(0, 100)}...`)
-      if (attempt < maxRetries) {
-        console.log(`MCP test attempt ${attempt} failed, retrying...`)
-      }
-    }
-    // All retries exhausted
-    throw lastError ?? new Error('MCP test failed after all retries')
-  })
-})