@plaited/acp-harness 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/.claude/rules/accuracy.md +43 -0
  2. package/.claude/rules/bun-apis.md +80 -0
  3. package/.claude/rules/code-review.md +254 -0
  4. package/.claude/rules/git-workflow.md +37 -0
  5. package/.claude/rules/github.md +154 -0
  6. package/.claude/rules/testing.md +172 -0
  7. package/.claude/skills/acp-harness/SKILL.md +310 -0
  8. package/.claude/skills/acp-harness/assets/Dockerfile.acp +25 -0
  9. package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +19 -0
  10. package/.claude/skills/acp-harness/references/downstream.md +288 -0
  11. package/.claude/skills/acp-harness/references/output-formats.md +221 -0
  12. package/.claude-plugin/marketplace.json +15 -0
  13. package/.claude-plugin/plugin.json +16 -0
  14. package/.github/CODEOWNERS +6 -0
  15. package/.github/workflows/ci.yml +63 -0
  16. package/.github/workflows/publish.yml +146 -0
  17. package/.mcp.json +20 -0
  18. package/CLAUDE.md +92 -0
  19. package/Dockerfile.test +23 -0
  20. package/LICENSE +15 -0
  21. package/README.md +94 -0
  22. package/bin/cli.ts +670 -0
  23. package/bin/tests/cli.spec.ts +362 -0
  24. package/biome.json +96 -0
  25. package/bun.lock +513 -0
  26. package/docker-compose.test.yml +21 -0
  27. package/package.json +57 -0
  28. package/scripts/bun-test-wrapper.sh +46 -0
  29. package/src/acp-client.ts +503 -0
  30. package/src/acp-helpers.ts +121 -0
  31. package/src/acp-transport.ts +455 -0
  32. package/src/acp-utils.ts +341 -0
  33. package/src/acp.constants.ts +56 -0
  34. package/src/acp.schemas.ts +161 -0
  35. package/src/acp.ts +27 -0
  36. package/src/acp.types.ts +28 -0
  37. package/src/tests/acp-client.spec.ts +205 -0
  38. package/src/tests/acp-helpers.spec.ts +105 -0
  39. package/src/tests/acp-integration.docker.ts +214 -0
  40. package/src/tests/acp-transport.spec.ts +153 -0
  41. package/src/tests/acp-utils.spec.ts +394 -0
  42. package/src/tests/fixtures/.claude/settings.local.json +8 -0
  43. package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +17 -0
  44. package/src/tests/fixtures/calculator-mcp.ts +215 -0
  45. package/tsconfig.json +32 -0
@@ -0,0 +1,172 @@
1
+ # Testing
2
+
3
+ This project uses Bun's built-in test runner for unit and integration tests.
4
+
5
+ ## Test Types
6
+
7
+ ### Unit/Integration Tests (`*.spec.ts`)
8
+
9
+ - Standard Bun tests using `*.spec.ts` extension
10
+ - Run with `bun test` command
11
+ - Used for testing business logic, utilities, and non-visual functionality
12
+
13
+ ### Docker Integration Tests (`*.docker.ts`)
14
+
15
+ - Tests that require external services or API keys run in Docker containers
16
+ - Use `*.docker.ts` extension
17
+ - Run with `bun run test:docker`
18
+
19
+ ## Running Tests
20
+
21
+ ```bash
22
+ # Run all unit tests
23
+ bun test
24
+
25
+ # Run a specific spec test file
26
+ bun test path/to/file.spec.ts
27
+
28
+ # Run tests matching a pattern
29
+ bun test pattern
30
+
31
+ # Run Docker integration tests (requires ANTHROPIC_API_KEY)
32
+ ANTHROPIC_API_KEY=sk-... bun run test:docker
33
+ ```
34
+
35
+ ## Test Style Conventions
36
+
37
+ ### Use `test` Instead of `it`
38
+
39
+ Use `test` instead of `it` in test files for consistency:
40
+
41
+ ```typescript
42
+ // ✅ Good
43
+ test('should create ACP client correctly', () => {
44
+ // ...
45
+ })
46
+
47
+ // ❌ Avoid
48
+ it('should create ACP client correctly', () => {
49
+ // ...
50
+ })
51
+ ```
52
+
53
+ ## Skill Script Tests
54
+
55
+ Claude Code skills in `.claude/skills/` may include executable scripts. Tests for these scripts follow a specific structure:
56
+
57
+ ### Directory Structure
58
+
59
+ ```
60
+ .claude/skills/<skill-name>/
61
+ ├── SKILL.md
62
+ ├── scripts/
63
+ │ ├── script-name.ts # Executable script
64
+ │ └── tests/
65
+ │ └── script-name.spec.ts # Tests for the script
66
+ ```
67
+
68
+ ### Running Skill Script Tests
69
+
70
+ ```bash
71
+ # From skill directory
72
+ bun test scripts/tests/
73
+ ```
74
+
75
+ ### Test Pattern
76
+
77
+ Scripts that output JSON can be tested using Bun's shell API:
78
+
79
+ ```typescript
80
+ import { describe, test, expect } from 'bun:test'
81
+ import { join } from 'node:path'
82
+ import { $ } from 'bun'
83
+
84
+ const scriptsDir = join(import.meta.dir, '..')
85
+
86
+ describe('script-name', () => {
87
+ test('outputs expected JSON', async () => {
88
+ const result = await $`bun ${scriptsDir}/script-name.ts arg1 arg2`.json()
89
+ expect(result.filePath).toEndWith('expected.ts')
90
+ })
91
+
92
+ test('exits with error on invalid input', async () => {
93
+ const proc = Bun.spawn(['bun', `${scriptsDir}/script-name.ts`], {
94
+ stderr: 'pipe',
95
+ })
96
+ const exitCode = await proc.exited
97
+ expect(exitCode).toBe(1)
98
+ })
99
+ })
100
+ ```
101
+
102
+ ## Docker Integration Tests
103
+
104
+ Tests that require the Anthropic API run in Docker containers for consistent, isolated execution.
105
+
106
+ ### ACP Integration Tests
107
+
108
+ The ACP client integration tests require the Anthropic API and run in a Docker container:
109
+
110
+ ```bash
111
+ # Run locally with Docker (requires ANTHROPIC_API_KEY)
112
+ ANTHROPIC_API_KEY=sk-... docker compose -f docker-compose.test.yml run --rm acp-test
113
+
114
+ # Or using the package.json script (still requires Docker)
115
+ ANTHROPIC_API_KEY=sk-... bun run test:docker
116
+ ```
117
+
118
+ ### File Naming
119
+
120
+ - **`*.docker.ts`**: Tests that run in Docker containers
121
+ - These are excluded from `bun test` and run separately in CI
122
+
123
+ ### CI Workflow
124
+
125
+ Docker tests use path filtering to reduce API costs:
126
+
127
+ ```yaml
128
+ # .github/workflows/ci.yml
129
+ jobs:
130
+ changes:
131
+ # Detects which paths changed
132
+ steps:
133
+ - uses: dorny/paths-filter@v3
134
+ with:
135
+ filters: |
136
+ acp:
137
+ - 'src/**'
138
+
139
+ test-acp-integration:
140
+ needs: changes
141
+ if: ${{ needs.changes.outputs.acp == 'true' }}
142
+ # Only runs when src/ files change
143
+ ```
144
+
145
+ ## Anti-Patterns
146
+
147
+ ### No Conditionals Around Assertions
148
+
149
+ Never wrap assertions in conditionals. Tests should fail explicitly, not silently skip assertions.
150
+
151
+ ```typescript
152
+ // ❌ WRONG: Conditional assertion
153
+ if (result) {
154
+ expect(result.value).toBe(expected)
155
+ }
156
+
157
+ // ❌ WRONG: Optional chaining with assertion
158
+ result?.value && expect(result.value).toBe(expected)
159
+
160
+ // ✅ CORRECT: Assert the condition, then assert the value
161
+ expect(result).toBeDefined()
162
+ expect(result.value).toBe(expected)
163
+
164
+ // ✅ CORRECT: Assert non-null first, then use a non-null assertion (`!`)
165
+ expect(result).not.toBeNull()
166
+ expect(result!.value).toBe(expected)
167
+ ```
168
+
169
+ If a value might not exist, the test should either:
170
+ 1. Assert that it exists first, then check its value
171
+ 2. Assert that it doesn't exist (if that's the expected behavior)
172
+ 3. Restructure the test to ensure the value is always present
@@ -0,0 +1,310 @@
1
+ ---
2
+ name: acp-harness
3
+ description: CLI tool for capturing agent trajectories. Execute prompts against ACP-compatible agents, capture full trajectories (tools, thoughts, plans), and output structured JSONL for downstream scoring.
4
+ compatibility: Bun >= 1.2.9
5
+ ---
6
+
7
+ # ACP Harness
8
+
9
+ ## Purpose
10
+
11
+ CLI tool for capturing trajectories from ACP-compatible agents, optimized for TypeScript/JavaScript projects using Bun.
12
+
13
+ **The harness captures. You score.**
14
+
15
+ | Harness Provides | You Provide |
16
+ |------------------|-------------|
17
+ | Prompt execution against ACP agents | Scoring logic (Braintrust, custom scripts) |
18
+ | Full trajectory capture (thoughts, tools, plans) | Pass/fail determination |
19
+ | Structured JSONL output | LLM-as-judge prompts |
20
+ | Reproducible execution environment | CI integration, golden file comparison |
21
+
22
+ **Use this when:**
23
+ - Capturing trajectories for downstream evaluation
24
+ - Generating training data (SFT/DPO) with full context
25
+ - Building regression test fixtures for agent behavior
26
+ - Comparing agent responses across configurations
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ # Run without installing (recommended for CI)
32
+ bunx @plaited/acp-harness prompts.jsonl -o results.jsonl
33
+
34
+ # Or install globally for repeated use
35
+ bun add -g @plaited/acp-harness
36
+ acp-harness prompts.jsonl -o results.jsonl
37
+
38
+ # Or add as project dependency
39
+ bun add @plaited/acp-harness
40
+ ```
41
+
42
+ **Note:** Examples below use `acp-harness` (the command available after global install). Replace with `bunx @plaited/acp-harness` if not installed globally.
43
+
44
+ ## Capture Workflow
45
+
46
+ ```mermaid
47
+ flowchart LR
48
+ Prompts["prompts.jsonl"] --> Harness["acp-harness"]
49
+ Agent["ACP Agent"] --> Harness
50
+ Harness -->|"JSONL"| Output["trajectories"]
51
+ Output --> Scoring["Your scoring logic"]
52
+ Scoring --> Decision["Informed choices"]
53
+ ```
54
+
55
+ The harness is a **capture layer** - it executes prompts and records trajectories. Scoring happens in your codebase.
56
+
57
+ | Use Case | Harness Captures | You Build |
58
+ |----------|------------------|-----------|
59
+ | **Agent comparison** | Same prompts → multiple agents → trajectories | Scoring pipeline (Braintrust, custom) |
60
+ | **Tool comparison** | Trajectory with tool/skill attribution | Diff analysis, preference data |
61
+ | **Training data** | Structured I/O with tool calls, plans, thoughts | SFT/DPO formatting |
62
+ | **Regression testing** | Deterministic prompt → trajectory capture | Golden file comparison, CI assertions |
63
+
64
+ ### Example: Comparing Built-in vs Skill
65
+
66
+ ```bash
67
+ # Run same prompt with built-in tool
68
+ acp-harness prompts.jsonl \
69
+ --cmd "bunx claude-code-acp" \
70
+ -o results-builtin.jsonl
71
+
72
+ # Run same prompt with custom skill installed
73
+ acp-harness prompts.jsonl \
74
+ --cmd "bunx claude-code-acp" \
75
+ --cwd /project/with/typescript-lsp-skill \
76
+ -o results-skill.jsonl
77
+
78
+ # Compare trajectories - which used better tools? faster? more accurate?
79
+ diff <(jq '.toolCalls' results-builtin.jsonl) <(jq '.toolCalls' results-skill.jsonl)
80
+ ```
81
+
82
+ ## Execution Environment
83
+
84
+ **Recommendation:** Run the harness in Docker containers for consistent, isolated execution.
85
+
86
+ ```bash
87
+ # Build and run with Docker Compose
88
+ docker compose -f docker-compose.acp.yml run --rm acp-harness
89
+
90
+ # Or build directly
91
+ docker build -f Dockerfile.acp -t acp-harness .
92
+ docker run --rm -e ANTHROPIC_API_KEY acp-harness bunx @plaited/acp-harness prompts.jsonl
93
+ ```
94
+
95
+ Docker provides:
96
+ - Consistent environment across local and CI
97
+ - Filesystem isolation without app-level sandboxing
98
+ - Reproducible results for training data generation
99
+
100
+ See [assets/](assets/) for example container configurations:
101
+ - `Dockerfile.acp` - Base container with Bun and git
102
+ - `docker-compose.acp.yml` - Compose file with volume mounts for results
103
+
104
+ ## Non-Goals
105
+
106
+ This harness is optimized for TypeScript/JavaScript projects using Bun. It is **not** designed for:
107
+
108
+ - **Python projects** - Use [SWE-bench](https://github.com/SWE-bench/SWE-bench) or the [Braintrust Python SDK](https://www.braintrust.dev/)
109
+ - **Academic model benchmarking** - Use [EleutherAI lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)
110
+ - **IDE integrations** - Use Copilot Evaluation Harness
111
+ - **SaaS observability** - Use the Braintrust or Langfuse platforms directly
112
+
113
+ ## Quick Reference
114
+
115
+ | Resource | Description |
116
+ |----------|-------------|
117
+ | `bunx @plaited/acp-harness` | Execute prompts against agent, capture trajectories |
118
+ | [output-formats.md](references/output-formats.md) | JSONL schemas, format options |
119
+ | [downstream.md](references/downstream.md) | Integration patterns (Braintrust, jq, custom scorers) |
120
+
121
+ ## Output Pipeline
122
+
123
+ ```mermaid
124
+ flowchart LR
125
+ Prompts["prompts.jsonl"] --> Harness["acp-harness"]
126
+ Agent["ACP Agent"] --> Harness
127
+ Harness --> Summary["summary.jsonl"]
128
+ Harness --> Full["results.md + results.full.jsonl"]
129
+ Summary --> Your["Your scoring code"]
130
+ Full --> Your
131
+ ```
132
+
133
+ 1. **Prepare** - Create `prompts.jsonl` with test cases
134
+ 2. **Execute** - Run harness against target agent
135
+ 3. **Capture** - Trajectories streamed to output files
136
+ 4. **Score** - Pipe output to your scoring logic (Braintrust, jq, LLM-as-judge)
137
+
138
+ ## Harness Script
139
+
140
+ ### Basic Usage
141
+
142
+ ```bash
143
+ acp-harness <prompts.jsonl> [--cmd <cmd>] [options]
144
+ ```
145
+
146
+ ### Arguments
147
+
148
+ | Flag | Description | Default |
149
+ |------|-------------|---------|
150
+ | `prompts.jsonl` | Input file with prompts to execute | Required |
151
+ | `--cmd, --command` | ACP agent command (e.g., `bunx claude-code-acp`, `bun ./adapter.ts`) | `"claude-code-acp"` |
152
+ | `-o, --output` | Output file/path | stdout |
153
+ | `-c, --cwd` | Working directory for agent | current |
154
+ | `-t, --timeout` | Request timeout in ms | `60000` |
155
+ | `-f, --format` | Output format: `summary`, `judge` | `summary` |
156
+ | `--progress` | Show progress to stderr | false |
157
+ | `--append` | Append to output file | false |
158
+ | `--mcp-server` | MCP server config JSON (repeatable) | none |
159
+
160
+ ### Examples
161
+
162
+ ```bash
163
+ # Using the default claude-code-acp adapter
164
+ acp-harness prompts.jsonl -o results.jsonl
165
+
166
+ # Using bunx to run an adapter
167
+ acp-harness prompts.jsonl --cmd "bunx claude-code-acp" -o results.jsonl
168
+
169
+ # Using a local adapter script (great for custom adapters in same repo)
170
+ acp-harness prompts.jsonl --cmd "bun ./my-adapter.ts" -o results.jsonl
171
+
172
+ # Judge format - creates two files for downstream scoring
173
+ acp-harness prompts.jsonl --format judge -o results
174
+ # Creates: results.md (summary with step IDs) + results.full.jsonl (complete trajectory)
175
+
176
+ # With MCP server (stdio transport)
177
+ acp-harness prompts.jsonl \
178
+ --mcp-server '{"type":"stdio","name":"fs","command":["mcp-filesystem","/data"]}'
179
+
180
+ # With MCP server (HTTP transport)
181
+ acp-harness prompts.jsonl \
182
+ --mcp-server '{"type":"http","name":"api","url":"http://localhost:3000"}'
183
+
184
+ # Stream with progress
185
+ acp-harness prompts.jsonl --progress -o results.jsonl
186
+ ```
187
+
188
+ ## Input Format
189
+
190
+ Each line in `prompts.jsonl`:
191
+
192
+ ```jsonl
193
+ {"id":"test-001","input":"Create a primary button","expected":"should contain <button>","metadata":{"category":"ui"}}
194
+ {"id":"test-002","input":"Write a function for form validation","metadata":{"category":"logic"}}
195
+ ```
196
+
197
+ | Field | Required | Description |
198
+ |-------|----------|-------------|
199
+ | `id` | Yes | Unique identifier |
200
+ | `input` | Yes | Prompt text for the agent |
201
+ | `expected` | No | Expected output (for downstream scoring) |
202
+ | `metadata` | No | Tags, category, difficulty for filtering |
203
+ | `timeout` | No | Override default timeout for this prompt |
204
+
205
+ ## Output Formats
206
+
207
+ ### Summary Format (default)
208
+
209
+ Minimal JSONL for quick metrics and analysis:
210
+
211
+ ```jsonl
212
+ {"id":"test-001","input":"Create a button","output":"I created...","toolCalls":["Write"],"status":"passed","duration":1234}
213
+ ```
214
+
215
+ ### Judge Format (two-tier)
216
+
217
+ Creates two files optimized for downstream LLM-as-judge scoring:
218
+
219
+ **`<output>.md`** - Markdown summary with step IDs and code previews:
220
+
221
+ ```markdown
222
+ ## Capture Record: test-001
223
+
224
+ **Input:** Create a primary button
225
+
226
+ **Trajectory:**
227
+ 1. [THOUGHT] I'll create a styled button... [->test-001-step-1]
228
+ 2. [TOOL:Write] -> completed (234ms) [->test-001-step-2]
229
+ File: src/button.tsx (847 chars)
230
+ ```tsx
231
+ import { css } from 'some-css-lib'
232
+
233
+ type ButtonProps = {
234
+ label: string
235
+
236
+ // ... 30 lines omitted ...
237
+
238
+ export const Button = ({ label }: ButtonProps) => (
239
+ <button className={styles.btn}>{label}</button>
240
+ )
241
+ ```
242
+ 3. [MESSAGE] I created the button... [->test-001-step-3]
243
+
244
+ **Output:** I created the button with primary styling.
245
+ **Metadata:** category=ui, agent=claude-code-acp
246
+ **Status:** passed
247
+ **Duration:** 1234ms
248
+
249
+ ---
250
+ ```
251
+
252
+ **`<output>.full.jsonl`** - Complete trajectory with step IDs for correlation:
253
+
254
+ ```jsonl
255
+ {"id":"test-001","input":"...","output":"...","trajectory":[{"type":"thought","content":"...","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{...},"output":{...},"duration":234,"stepId":"test-001-step-2"}],...}
256
+ ```
257
+
258
+ **Usage patterns by judge context window:**
259
+
260
+ | Judge Model | Strategy |
261
+ |-------------|----------|
262
+ | Gemini (1M+ tokens) | Feed `results.full.jsonl` directly |
263
+ | Claude/GPT-4 (128-200k) | Use `results.full.jsonl` for most runs |
264
+ | Smaller models | Use `results.md`, retrieve specific steps by ID as needed |
265
+
266
+ ## Downstream Integration
267
+
268
+ The harness outputs standard JSONL that pipes to any tool:
269
+
270
+ ```bash
271
+ # Filter with jq
272
+ cat results.jsonl | jq 'select(.metadata.category == "ui")'
273
+
274
+ # Count tool usage
275
+ cat results.jsonl | jq -s 'map(.toolCalls | length) | add'
276
+
277
+ # Feed full trajectory to Gemini (large context)
278
+ cat results.full.jsonl | your-gemini-judge.ts
279
+ ```
280
+
281
+ See [downstream.md](references/downstream.md) for integration patterns with Braintrust, Gemini, and custom scorers.
282
+
283
+ ## Capture Targets
284
+
285
+ | Target | How to Capture |
286
+ |--------|----------------|
287
+ | **Agent capability** | Direct prompts, capture trajectory for analysis |
288
+ | **Skills** | Set `--cwd` to project with skill, capture skill-specific behavior |
289
+ | **MCP Servers** | Use `--mcp-server` flag, capture tool usage in trajectory |
290
+
291
+ ### Capturing Skill Behavior
292
+
293
+ ```bash
294
+ bunx @plaited/acp-harness skill-prompts.jsonl \
295
+ --cwd /project/with/skill \
296
+ -o results.jsonl
297
+ ```
298
+
299
+ ### Capturing MCP Server Usage
300
+
301
+ ```bash
302
+ bunx @plaited/acp-harness mcp-prompts.jsonl \
303
+ --mcp-server '{"type":"stdio","name":"fs","command":["mcp-filesystem"]}' \
304
+ -o results.jsonl
305
+ ```
306
+
307
+ ## Related
308
+
309
+ - **[@agentclientprotocol/sdk](https://www.npmjs.com/package/@agentclientprotocol/sdk)** - ACP SDK for programmatic access
310
+ - **[@zed-industries/claude-code-acp](https://www.npmjs.com/package/@zed-industries/claude-code-acp)** - Claude Code ACP adapter
@@ -0,0 +1,25 @@
1
+ # ACP Harness Docker Configuration
2
+ #
3
+ # Example Dockerfile for running ACP evaluations in an isolated container.
4
+ # Copy this to your project and customize as needed.
5
+ #
6
+ # Usage:
7
+ # docker build -f Dockerfile.acp -t acp-harness .
8
+ # docker run --rm -e ANTHROPIC_API_KEY acp-harness bunx @plaited/acp-harness prompts.jsonl
9
+
10
+ FROM oven/bun:1.2.9
11
+
12
+ # Install git (required for some agent operations)
13
+ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
14
+
15
+ WORKDIR /app
16
+
17
+ # Copy package files first for better layer caching
18
+ COPY package.json bun.lock* ./
19
+ RUN bun install --frozen-lockfile
20
+
21
+ # Copy source files
22
+ COPY . .
23
+
24
+ # Default command - override with your harness invocation
25
+ CMD ["bun", "test"]
@@ -0,0 +1,19 @@
1
+ # ACP Harness Docker Compose Configuration
2
+ #
3
+ # Example docker-compose for running ACP evaluations.
4
+ # Copy this to your project and customize as needed.
5
+ #
6
+ # Usage:
7
+ # ANTHROPIC_API_KEY=sk-... docker compose -f docker-compose.acp.yml run --rm acp-harness
8
+
9
+ services:
10
+ acp-harness:
11
+ build:
12
+ context: .
13
+ dockerfile: Dockerfile.acp
14
+ environment:
15
+ - ANTHROPIC_API_KEY
16
+ volumes:
17
+ # Mount output directory to persist results
18
+ - ./results:/app/results
19
+ command: ["bunx", "@plaited/acp-harness", "prompts.jsonl", "-o", "results/output.jsonl"]