@plaited/acp-harness 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,21 +11,25 @@ CLI tool for capturing agent trajectories from ACP-compatible agents. Execute pr
11
11
  Use these tools directly via the CLI without installation:
12
12
 
13
13
  ```bash
14
- # Run without installing
15
- bunx @plaited/acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
14
+ # Using built-in headless adapter (recommended - no extra install needed)
15
+ export ANTHROPIC_API_KEY=sk-...
16
+ bunx @plaited/acp-harness capture prompts.jsonl \
17
+ bunx @plaited/acp-harness headless --schema ./schemas/claude-headless.json \
18
+ -o results.jsonl
16
19
 
17
- # Or install globally
18
- bun add -g @plaited/acp-harness
19
- acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
20
+ # Or with an external ACP adapter
21
+ bunx @plaited/acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
20
22
  ```
21
23
 
22
- **Prerequisite:** Install an ACP adapter and set your API key:
24
+ **Prerequisite:** Set your API key. The `headless` command works with any CLI agent that supports JSON output - no adapter installation required:
23
25
 
24
26
  ```bash
25
- npm install -g @anthropic-ai/claude-code-acp
26
- export ANTHROPIC_API_KEY=sk-...
27
+ export ANTHROPIC_API_KEY=sk-... # For Claude
28
+ export GEMINI_API_KEY=... # For Gemini
27
29
  ```
28
30
 
31
+ Pre-built schemas are available in `.claude/skills/acp-adapters/schemas/` for Claude and Gemini.
32
+
29
33
  ### Commands
30
34
 
31
35
  | Command | Description |
@@ -37,17 +41,21 @@ export ANTHROPIC_API_KEY=sk-...
37
41
  | `validate-refs <prompts>` | Check reference solutions |
38
42
  | `balance <prompts>` | Analyze test set coverage |
39
43
  | `schemas [name]` | Export JSON schemas |
40
- | `adapter:scaffold [name]` | Scaffold new ACP adapter project |
44
+ | `headless --schema <path>` | Schema-driven adapter for any CLI agent |
41
45
  | `adapter:check <cmd>` | Validate adapter ACP compliance |
42
46
 
43
47
  ### Examples
44
48
 
45
49
  ```bash
46
- # Capture trajectories
47
- bunx @plaited/acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
50
+ # Capture trajectories using headless adapter (recommended)
51
+ bunx @plaited/acp-harness capture prompts.jsonl \
52
+ bunx @plaited/acp-harness headless --schema ./schemas/claude-headless.json \
53
+ -o results.jsonl
48
54
 
49
55
  # Run trials for pass@k analysis
50
- bunx @plaited/acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 --grader ./grader.ts
56
+ bunx @plaited/acp-harness trials prompts.jsonl \
57
+ bunx @plaited/acp-harness headless --schema ./schemas/claude-headless.json \
58
+ -k 5 --grader ./grader.ts
51
59
 
52
60
  # Summarize results
53
61
  bunx @plaited/acp-harness summarize results.jsonl -o summary.jsonl
@@ -55,11 +63,9 @@ bunx @plaited/acp-harness summarize results.jsonl -o summary.jsonl
55
63
  # Export schemas
56
64
  bunx @plaited/acp-harness schemas CaptureResult --json
57
65
 
58
- # Scaffold a new adapter
59
- bunx @plaited/acp-harness adapter:scaffold my-agent -o ./my-agent-acp
60
-
61
66
  # Validate adapter compliance
62
- bunx @plaited/acp-harness adapter:check bun ./my-adapter/src/main.ts
67
+ bunx @plaited/acp-harness adapter:check \
68
+ bunx @plaited/acp-harness headless --schema ./schemas/claude-headless.json
63
69
  ```
64
70
 
65
71
  ## Skills for AI Agents
@@ -110,10 +116,12 @@ Discover, create, and validate ACP adapters for agent integration.
110
116
 
111
117
  | Command | Description |
112
118
  |---------|-------------|
119
+ | `headless` | Schema-driven adapter for any CLI agent |
113
120
  | `adapter:scaffold` | Generate new adapter project with handlers |
114
121
  | `adapter:check` | Validate ACP protocol compliance |
115
122
 
116
123
  **Use cases:**
124
+ - Wrapping headless CLI agents with schema-driven adapter
117
125
  - Finding existing adapters for your agent
118
126
  - Building custom ACP adapters from scratch
119
127
  - Validating adapter implementations
@@ -121,10 +129,18 @@ Discover, create, and validate ACP adapters for agent integration.
121
129
  ## Input Format
122
130
 
123
131
  ```jsonl
124
- {"id":"test-001","input":"Create a primary button","expected":"should contain <button>","metadata":{"category":"ui"}}
125
- {"id":"test-002","input":"Fix the TypeScript error","metadata":{"category":"bugfix"}}
132
+ {"id":"test-001","input":"Create a primary button","hint":"should contain <button>","metadata":{"category":"ui"}}
133
+ {"id":"test-002","input":["Create a component","Now add tests"],"metadata":{"category":"multi-turn"}}
126
134
  ```
127
135
 
136
+ | Field | Required | Description |
137
+ |-------|----------|-------------|
138
+ | `id` | Yes | Unique identifier |
139
+ | `input` | Yes | Single prompt (string) or conversation turns (string[]) |
140
+ | `hint` | No | Grader context - what to look for |
141
+ | `reference` | No | Reference solution (for validate-refs) |
142
+ | `metadata` | No | Tags, category, difficulty for filtering |
143
+
128
144
  ## Output Format
129
145
 
130
146
  The harness outputs full trajectory JSONL (`CaptureResult` schema):
@@ -134,12 +150,12 @@ The harness outputs full trajectory JSONL (`CaptureResult` schema):
134
150
  "id": "test-001",
135
151
  "input": "Create a primary button",
136
152
  "output": "Here's a button component...",
137
- "expected": "should contain <button>",
153
+ "hint": "should contain <button>",
138
154
  "trajectory": [...],
139
- "metadata": {"category": "ui", "agent": "bunx claude-code-acp"},
140
- "timing": {"start": 1234567890, "end": 1234567900},
155
+ "metadata": {"category": "ui", "agent": "bunx claude-code-acp", "trajectoryRichness": "full", "turnCount": 1},
156
+ "timing": {"start": 1234567890, "end": 1234567900, "sessionCreation": 234, "total": 10},
141
157
  "toolErrors": false,
142
- "score": {"pass": true, "score": 1.0, "reasoning": "Contains expected"}
158
+ "score": {"pass": true, "score": 1.0, "reasoning": "Contains hint"}
143
159
  }
144
160
  ```
145
161
 
@@ -147,6 +163,9 @@ Key fields:
147
163
  - `toolErrors`: Boolean indicating if any tool calls failed
148
164
  - `score`: Grader result (only if `--grader` provided)
149
165
  - `trajectory`: Full execution trace (thoughts, messages, tool calls, plans)
166
+ - `metadata.trajectoryRichness`: `"full"` | `"messages-only"` | `"minimal"`
167
+ - `timing.sessionCreation`: Time to initialize session (ms)
168
+ - `timing.total`: End-to-end duration (ms)
150
169
 
151
170
  ## Graders
152
171
 
@@ -159,12 +178,12 @@ Export a `grade` function:
159
178
  ```typescript
160
179
  import type { Grader } from '@plaited/acp-harness/schemas'
161
180
 
162
- export const grade: Grader = async ({ input, output, expected, trajectory }) => {
163
- const pass = output.toLowerCase().includes(expected?.toLowerCase() ?? '')
181
+ export const grade: Grader = async ({ input, output, hint, trajectory }) => {
182
+ const pass = output.toLowerCase().includes(hint?.toLowerCase() ?? '')
164
183
  return {
165
184
  pass,
166
185
  score: pass ? 1.0 : 0.0,
167
- reasoning: pass ? 'Contains expected answer' : 'Missing expected answer'
186
+ reasoning: pass ? 'Contains hint content' : 'Missing hint content'
168
187
  }
169
188
  }
170
189
  ```
@@ -184,13 +203,13 @@ import sys
184
203
 
185
204
  data = json.load(sys.stdin)
186
205
  output = data["output"].lower()
187
- expected = (data.get("expected") or "").lower()
206
+ hint = (data.get("hint") or "").lower()
188
207
 
189
- pass_result = expected in output if expected else True
208
+ pass_result = hint in output if hint else True
190
209
  print(json.dumps({
191
210
  "pass": pass_result,
192
211
  "score": 1.0 if pass_result else 0.0,
193
- "reasoning": "Contains expected" if pass_result else "Missing expected"
212
+ "reasoning": "Contains hint" if pass_result else "Missing hint"
194
213
  }))
195
214
  ```
196
215
 
@@ -200,7 +219,7 @@ acp-harness capture prompts.jsonl bunx claude-code-acp --grader ./grader.py
200
219
  ```
201
220
 
202
221
  **Protocol:**
203
- - Input (stdin): `{"input": "...", "output": "...", "expected": "...", "trajectory": [...]}`
222
+ - Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
204
223
  - Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "..."}`
205
224
 
206
225
  ## Downstream Integration
@@ -222,13 +241,16 @@ cat results.jsonl | your-scoring-script.ts
222
241
  bun install # Install dependencies
223
242
  bun run check # Type check + lint + format
224
243
  bun test # Run unit tests
244
+
245
+ # Run integration tests in Docker (requires API keys)
246
+ ANTHROPIC_API_KEY=sk-... docker compose -f docker-compose.test.yml run --rm acp-test
225
247
  ```
226
248
 
227
249
  ## Requirements
228
250
 
229
251
  - **Runtime:** Bun >= 1.2.9
230
- - **ACP Adapter:** `@anthropic-ai/claude-code-acp` or compatible
231
- - **API Key:** `ANTHROPIC_API_KEY` environment variable
252
+ - **ACP Adapter:** Built-in `headless` command (recommended) or external adapter
253
+ - **API Key:** `ANTHROPIC_API_KEY` for Claude, `GEMINI_API_KEY` for Gemini
232
254
 
233
255
  ## License
234
256
 
package/bin/cli.ts CHANGED
@@ -14,6 +14,7 @@
14
14
  * - validate-refs: Check reference solutions
15
15
  * - balance: Analyze test set coverage
16
16
  * - schemas: Export JSON schemas for non-TS users
17
+ * - headless: Schema-driven adapter for any headless CLI agent
17
18
  * - adapter:scaffold: Scaffold new ACP adapter project
18
19
  * - adapter:check: Validate adapter ACP compliance
19
20
  */
@@ -23,6 +24,7 @@ import { adapterScaffold } from '../src/adapter-scaffold.ts'
23
24
  import { balance } from '../src/balance.ts'
24
25
  import { calibrate } from '../src/calibrate.ts'
25
26
  import { capture } from '../src/capture.ts'
27
+ import { headless } from '../src/headless.ts'
26
28
  import { schemasCli } from '../src/schemas-cli.ts'
27
29
  import { summarize } from '../src/summarize.ts'
28
30
  import { trials } from '../src/trials.ts'
@@ -43,6 +45,7 @@ Commands:
43
45
  validate-refs Check reference solutions against grader
44
46
  balance Analyze test set coverage
45
47
  schemas Export JSON schemas for non-TypeScript users
48
+ headless Schema-driven adapter for any headless CLI agent
46
49
  adapter:scaffold Scaffold a new ACP adapter project
47
50
  adapter:check Validate adapter ACP compliance
48
51
 
@@ -70,6 +73,14 @@ Examples:
70
73
  # Validate adapter compliance
71
74
  acp-harness adapter:check bun ./my-adapter/src/main.ts
72
75
 
76
+ # Run headless adapter with schema
77
+ acp-harness headless --schema ./claude-headless.json
78
+
79
+ # Capture with headless adapter
80
+ acp-harness capture prompts.jsonl \\
81
+ acp-harness headless --schema ./claude-headless.json \\
82
+ -o results.jsonl
83
+
73
84
  Documentation: https://github.com/plaited/acp-harness
74
85
  `)
75
86
  }
@@ -104,6 +115,10 @@ const main = async () => {
104
115
  await schemasCli(args)
105
116
  break
106
117
 
118
+ case 'headless':
119
+ await headless(args)
120
+ break
121
+
107
122
  case 'adapter:scaffold':
108
123
  await adapterScaffold(args)
109
124
  break
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/acp-harness",
3
- "version": "0.3.2",
3
+ "version": "0.4.0",
4
4
  "description": "CLI tool for capturing agent trajectories from ACP-compatible agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -28,10 +28,8 @@
28
28
  "./bin/**",
29
29
  "!./src/**/tests/*",
30
30
  "!./src/**/*.spec.ts",
31
- "!./src/**/*.docker.ts",
32
31
  "!./bin/**/tests/*",
33
- "!./bin/**/*.spec.ts",
34
- "!./bin/**/*.docker.ts"
32
+ "!./bin/**/*.spec.ts"
35
33
  ],
36
34
  "publishConfig": {
37
35
  "access": "public"
@@ -43,8 +41,8 @@
43
41
  "check:types": "tsc --noEmit",
44
42
  "check:write": "biome check --write && format-package --write",
45
43
  "prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
46
- "test": "bun test ./src/ ./bin/ ./.claude",
47
- "test:docker": "docker compose -f docker-compose.test.yml run --rm acp-test"
44
+ "test": "bun test ./**/tests/*.spec.ts",
45
+ "test:integration": "bun test ./**/integration_tests/*.spec.ts"
48
46
  },
49
47
  "lint-staged": {
50
48
  "*.{js,cjs,jsx,tsx,ts}": [
@@ -56,7 +54,7 @@
56
54
  },
57
55
  "dependencies": {
58
56
  "zod": "^4.3.5",
59
- "@plaited/development-skills": "0.6.2"
57
+ "@plaited/development-skills": "0.6.3"
60
58
  },
61
59
  "peerDependencies": {
62
60
  "typescript-language-server": "^5.1.3",
package/src/acp-client.ts CHANGED
@@ -22,7 +22,6 @@ import type {
22
22
  Implementation,
23
23
  InitializeRequest,
24
24
  InitializeResponse,
25
- McpServer,
26
25
  PromptRequest,
27
26
  PromptResponse,
28
27
  RequestPermissionRequest,
@@ -277,18 +276,22 @@ export const createACPClient = (config: ACPClientConfig) => {
277
276
  /**
278
277
  * Creates a new conversation session.
279
278
  *
280
- * @param params - Session parameters with working directory and optional MCP servers
279
+ * @remarks
280
+ * MCP servers are auto-discovered by the agent from configuration files
281
+ * in the working directory (e.g., `.mcp.json`, `.gemini/settings.json`).
282
+ *
283
+ * @param params - Session parameters with working directory
281
284
  * @returns The created session
282
285
  * @throws {ACPClientError} If not connected
283
286
  */
284
- const createSession = async (params: { cwd: string; mcpServers?: McpServer[] }): Promise<Session> => {
287
+ const createSession = async (params: { cwd: string }): Promise<Session> => {
285
288
  if (!transport?.isConnected()) {
286
289
  throw new ACPClientError('Not connected')
287
290
  }
288
291
 
289
292
  const response = await transport.request<{ sessionId: string }>(ACP_METHODS.CREATE_SESSION, {
290
293
  cwd: params.cwd,
291
- mcpServers: params.mcpServers ?? [],
294
+ mcpServers: [], // Required field - empty array lets agents auto-discover from cwd
292
295
  })
293
296
  return { id: response.sessionId }
294
297
  }
@@ -185,7 +185,6 @@ const checkSessionNew = async (
185
185
  try {
186
186
  const response = await transport.request<{ sessionId: string }>(ACP_METHODS.CREATE_SESSION, {
187
187
  cwd: process.cwd(),
188
- mcpServers: [],
189
188
  })
190
189
 
191
190
  if (!response || !response.sessionId) {
@@ -296,7 +296,6 @@ import { sessionManager } from '../session-manager.ts'
296
296
 
297
297
  type SessionNewParams = {
298
298
  cwd: string
299
- mcpServers?: unknown[]
300
299
  }
301
300
 
302
301
  type SessionNewResult = {
@@ -304,12 +303,11 @@ type SessionNewResult = {
304
303
  }
305
304
 
306
305
  export const handleSessionNew = async (params: unknown): Promise<SessionNewResult> => {
307
- const { cwd, mcpServers = [] } = params as SessionNewParams
306
+ const { cwd } = params as SessionNewParams
308
307
 
309
- const sessionId = sessionManager.createSession({
310
- cwd,
311
- mcpServers,
312
- })
308
+ // MCP servers are discovered from cwd configuration files
309
+ // (e.g., .mcp.json, .gemini/settings.json)
310
+ const sessionId = sessionManager.createSession({ cwd })
313
311
 
314
312
  return { sessionId }
315
313
  }
@@ -438,19 +436,17 @@ import { randomUUID } from 'node:crypto'
438
436
  type Session = {
439
437
  id: string
440
438
  cwd: string
441
- mcpServers: unknown[]
442
439
  createdAt: Date
443
440
  }
444
441
 
445
442
  class SessionManager {
446
443
  #sessions = new Map<string, Session>()
447
444
 
448
- createSession(params: { cwd: string; mcpServers: unknown[] }): string {
445
+ createSession(params: { cwd: string }): string {
449
446
  const id = \`sess_\${randomUUID().slice(0, 8)}\`
450
447
  this.#sessions.set(id, {
451
448
  id,
452
449
  cwd: params.cwd,
453
- mcpServers: params.mcpServers,
454
450
  createdAt: new Date(),
455
451
  })
456
452
  return id
@@ -550,13 +546,15 @@ from typing import Any, Dict, Optional
550
546
  sessions: Dict[str, Dict[str, Any]] = {}
551
547
 
552
548
 
553
- def create_session(cwd: str, mcp_servers: list) -> str:
554
- """Create a new session."""
549
+ def create_session(cwd: str) -> str:
550
+ """Create a new session.
551
+
552
+ MCP servers are discovered from cwd configuration files.
553
+ """
555
554
  session_id = f"sess_{uuid.uuid4().hex[:8]}"
556
555
  sessions[session_id] = {
557
556
  "id": session_id,
558
557
  "cwd": cwd,
559
- "mcp_servers": mcp_servers,
560
558
  }
561
559
  return session_id
562
560
 
@@ -597,10 +595,13 @@ def handle_initialize(params: Dict[str, Any]) -> Dict[str, Any]:
597
595
 
598
596
 
599
597
  def handle_session_new(params: Dict[str, Any]) -> Dict[str, Any]:
600
- """Handle session/new request."""
598
+ """Handle session/new request.
599
+
600
+ MCP servers are discovered from cwd configuration files
601
+ (e.g., .mcp.json, .gemini/settings.json).
602
+ """
601
603
  cwd = params.get("cwd", ".")
602
- mcp_servers = params.get("mcpServers", [])
603
- session_id = create_session(cwd, mcp_servers)
604
+ session_id = create_session(cwd)
604
605
  return {"sessionId": session_id}
605
606
 
606
607
 
package/src/calibrate.ts CHANGED
@@ -57,17 +57,37 @@ const loadResults = async (path: string): Promise<CaptureResult[]> => {
57
57
  }
58
58
 
59
59
  /**
60
- * Random sample from array.
60
+ * Randomly sample n elements from an array using Fisher-Yates shuffle.
61
61
  *
62
62
  * @param arr - Array to sample from
63
63
  * @param n - Number of samples to take
64
- * @returns Array of sampled elements
64
+ * @returns Array of sampled elements in random order
65
+ *
66
+ * @remarks
67
+ * Uses Fisher-Yates (Knuth) shuffle for uniform distribution.
68
+ * Creates a copy to avoid mutating the input array.
69
+ * O(n) time complexity with O(n) space for the copy.
70
+ * Not cryptographically secure (uses Math.random).
65
71
  *
66
72
  * @public
67
73
  */
68
74
  export const sampleArray = <T>(arr: T[], n: number): T[] => {
69
- const shuffled = [...arr].sort(() => 0.5 - Math.random())
70
- return shuffled.slice(0, n)
75
+ if (n <= 0) return []
76
+ if (n >= arr.length) return [...arr]
77
+
78
+ const copy = [...arr]
79
+
80
+ // Fisher-Yates shuffle working backwards through array
81
+ // Only shuffle enough elements to get n samples
82
+ const limit = copy.length - n
83
+ for (let i = copy.length - 1; i >= limit && i > 0; i--) {
84
+ // Random index from 0 to i (inclusive)
85
+ const j = Math.floor(Math.random() * (i + 1))
86
+ // Swap elements
87
+ ;[copy[i], copy[j]] = [copy[j]!, copy[i]!]
88
+ }
89
+
90
+ return copy.slice(-n)
71
91
  }
72
92
 
73
93
  /**
@@ -129,8 +149,8 @@ const formatCalibrationMarkdown = (samples: CalibrationSample[]): string => {
129
149
  lines.push(`**Input:** ${sample.input}`)
130
150
  lines.push('')
131
151
 
132
- if (sample.expected) {
133
- lines.push(`**Expected:** ${sample.expected}`)
152
+ if (sample.hint) {
153
+ lines.push(`**Hint:** ${sample.hint}`)
134
154
  lines.push('')
135
155
  }
136
156
 
@@ -212,7 +232,7 @@ export const runCalibrate = async (config: CalibrateConfig): Promise<Calibration
212
232
  id: result.id,
213
233
  input: result.input,
214
234
  output: result.output,
215
- expected: result.expected,
235
+ hint: result.hint,
216
236
  originalScore: result.score as GraderResult,
217
237
  trajectorySnippet: getTrajectorySnippet(result.trajectory),
218
238
  }
@@ -222,7 +242,7 @@ export const runCalibrate = async (config: CalibrateConfig): Promise<Calibration
222
242
  calibrationSample.rescoredResult = await grader({
223
243
  input: result.input,
224
244
  output: result.output,
225
- expected: result.expected,
245
+ hint: result.hint,
226
246
  trajectory: result.trajectory,
227
247
  })
228
248
  }