@plaited/acp-harness 0.2.5
- package/.claude/rules/accuracy.md +43 -0
- package/.claude/rules/bun-apis.md +80 -0
- package/.claude/rules/code-review.md +254 -0
- package/.claude/rules/git-workflow.md +37 -0
- package/.claude/rules/github.md +154 -0
- package/.claude/rules/testing.md +172 -0
- package/.claude/skills/acp-harness/SKILL.md +310 -0
- package/.claude/skills/acp-harness/assets/Dockerfile.acp +25 -0
- package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +19 -0
- package/.claude/skills/acp-harness/references/downstream.md +288 -0
- package/.claude/skills/acp-harness/references/output-formats.md +221 -0
- package/.claude-plugin/marketplace.json +15 -0
- package/.claude-plugin/plugin.json +16 -0
- package/.github/CODEOWNERS +6 -0
- package/.github/workflows/ci.yml +63 -0
- package/.github/workflows/publish.yml +146 -0
- package/.mcp.json +20 -0
- package/CLAUDE.md +92 -0
- package/Dockerfile.test +23 -0
- package/LICENSE +15 -0
- package/README.md +94 -0
- package/bin/cli.ts +670 -0
- package/bin/tests/cli.spec.ts +362 -0
- package/biome.json +96 -0
- package/bun.lock +513 -0
- package/docker-compose.test.yml +21 -0
- package/package.json +57 -0
- package/scripts/bun-test-wrapper.sh +46 -0
- package/src/acp-client.ts +503 -0
- package/src/acp-helpers.ts +121 -0
- package/src/acp-transport.ts +455 -0
- package/src/acp-utils.ts +341 -0
- package/src/acp.constants.ts +56 -0
- package/src/acp.schemas.ts +161 -0
- package/src/acp.ts +27 -0
- package/src/acp.types.ts +28 -0
- package/src/tests/acp-client.spec.ts +205 -0
- package/src/tests/acp-helpers.spec.ts +105 -0
- package/src/tests/acp-integration.docker.ts +214 -0
- package/src/tests/acp-transport.spec.ts +153 -0
- package/src/tests/acp-utils.spec.ts +394 -0
- package/src/tests/fixtures/.claude/settings.local.json +8 -0
- package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +17 -0
- package/src/tests/fixtures/calculator-mcp.ts +215 -0
- package/tsconfig.json +32 -0
package/.claude/rules/testing.md
@@ -0,0 +1,172 @@

# Testing

This project uses Bun's built-in test runner for unit and integration tests.

## Test Types

### Unit/Integration Tests (`*.spec.ts`)

- Standard Bun tests using `*.spec.ts` extension
- Run with `bun test` command
- Used for testing business logic, utilities, and non-visual functionality

### Docker Integration Tests (`*.docker.ts`)

- Tests that require external services or API keys run in Docker containers
- Use `*.docker.ts` extension
- Run with `bun run test:docker`

## Running Tests

```bash
# Run all unit tests
bun test

# Run a specific spec test file
bun test path/to/file.spec.ts

# Run tests matching a pattern
bun test pattern

# Run Docker integration tests (requires ANTHROPIC_API_KEY)
ANTHROPIC_API_KEY=sk-... bun run test:docker
```

## Test Style Conventions

### Use `test` Instead of `it`

Use `test` instead of `it` in test files for consistency:

```typescript
// ✅ Good
test('should create ACP client correctly', () => {
  // ...
})

// ❌ Avoid
it('should create ACP client correctly', () => {
  // ...
})
```

## Skill Script Tests

Claude Code skills in `.claude/skills/` may include executable scripts. Tests for these scripts follow a specific structure:

### Directory Structure

```
.claude/skills/<skill-name>/
├── SKILL.md
├── scripts/
│   ├── script-name.ts           # Executable script
│   └── tests/
│       └── script-name.spec.ts  # Tests for the script
```

### Running Skill Script Tests

```bash
# From skill directory
bun test scripts/tests/
```

### Test Pattern

Scripts that output JSON can be tested using Bun's shell API:

```typescript
import { describe, test, expect } from 'bun:test'
import { join } from 'node:path'
import { $ } from 'bun'

const scriptsDir = join(import.meta.dir, '..')

describe('script-name', () => {
  test('outputs expected JSON', async () => {
    const result = await $`bun ${scriptsDir}/script-name.ts arg1 arg2`.json()
    expect(result.filePath).toEndWith('expected.ts')
  })

  test('exits with error on invalid input', async () => {
    const proc = Bun.spawn(['bun', `${scriptsDir}/script-name.ts`], {
      stderr: 'pipe',
    })
    const exitCode = await proc.exited
    expect(exitCode).toBe(1)
  })
})
```

## Docker Integration Tests

Tests that require the Anthropic API run in Docker containers for consistent, isolated execution.

### ACP Integration Tests

The ACP client integration tests require the Anthropic API and run in a Docker container:

```bash
# Run locally with Docker (requires ANTHROPIC_API_KEY)
ANTHROPIC_API_KEY=sk-... docker compose -f docker-compose.test.yml run --rm acp-test

# Or using the npm script (still requires Docker)
ANTHROPIC_API_KEY=sk-... bun run test:docker
```

### File Naming

- **`*.docker.ts`**: Tests that run in Docker containers
- These are excluded from `bun test` and run separately in CI

### CI Workflow

Docker tests use path filtering to reduce API costs:

```yaml
# .github/workflows/ci.yml
jobs:
  changes:
    # Detects which paths changed
    steps:
      - uses: dorny/paths-filter@v3
        with:
          filters: |
            acp:
              - 'src/**'

  test-acp-integration:
    needs: changes
    if: ${{ needs.changes.outputs.acp == 'true' }}
    # Only runs when src/ files change
```

## Anti-Patterns

### No Conditionals Around Assertions

Never wrap assertions in conditionals. Tests should fail explicitly, not silently skip assertions.

```typescript
// ❌ WRONG: Conditional assertion
if (result) {
  expect(result.value).toBe(expected)
}

// ❌ WRONG: Optional chaining with assertion
result?.value && expect(result.value).toBe(expected)

// ✅ CORRECT: Assert the condition, then assert the value
expect(result).toBeDefined()
expect(result.value).toBe(expected)

// ✅ CORRECT: Use type narrowing assertion
expect(result).not.toBeNull()
expect(result!.value).toBe(expected)
```

If a value might not exist, the test should either:

1. Assert that it exists first, then check its value
2. Assert that it doesn't exist (if that's the expected behavior)
3. Restructure the test to ensure the value is always present
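A minimal sketch of the third option, assuming a hypothetical `loadFixture` helper (not part of this package) that throws instead of returning `undefined` when data is missing, so every assertion runs unconditionally:

```typescript
import { test, expect } from 'bun:test'

// Hypothetical helper for illustration: it throws on missing data rather than
// returning undefined, so the test body never needs a conditional.
const loadFixture = (name: string): { value: string } => {
  const fixtures: Record<string, { value: string }> = { greeting: { value: 'hello' } }
  const fixture = fixtures[name]
  if (!fixture) throw new Error(`Missing fixture: ${name}`)
  return fixture
}

test('reads the greeting fixture', () => {
  const result = loadFixture('greeting')
  // No conditional needed: loadFixture either returns data or throws.
  expect(result.value).toBe('hello')
})
```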
package/.claude/skills/acp-harness/SKILL.md
@@ -0,0 +1,310 @@

---
name: acp-harness
description: CLI tool for capturing agent trajectories. Execute prompts against ACP-compatible agents, capture full trajectories (tools, thoughts, plans), and output structured JSONL for downstream scoring.
compatibility: Bun >= 1.2.9
---

# ACP Harness

## Purpose

CLI tool for capturing trajectories from ACP-compatible agents, optimized for TypeScript/JavaScript projects using Bun.

**The harness captures. You score.**

| Harness Provides | You Provide |
|------------------|-------------|
| Prompt execution against ACP agents | Scoring logic (Braintrust, custom scripts) |
| Full trajectory capture (thoughts, tools, plans) | Pass/fail determination |
| Structured JSONL output | LLM-as-judge prompts |
| Reproducible execution environment | CI integration, golden file comparison |

**Use this when:**
- Capturing trajectories for downstream evaluation
- Generating training data (SFT/DPO) with full context
- Building regression test fixtures for agent behavior
- Comparing agent responses across configurations

## Installation

```bash
# Run without installing (recommended for CI)
bunx @plaited/acp-harness prompts.jsonl -o results.jsonl

# Or install globally for repeated use
bun add -g @plaited/acp-harness
acp-harness prompts.jsonl -o results.jsonl

# Or add as a project dependency
bun add @plaited/acp-harness
```

**Note:** The examples below use `acp-harness` (the command available after a global install). Replace it with `bunx @plaited/acp-harness` if not installed globally.

## Capture Workflow

```mermaid
flowchart LR
    Prompts["prompts.jsonl"] --> Harness["acp-harness"]
    Agent["ACP Agent"] --> Harness
    Harness -->|"JSONL"| Output["trajectories"]
    Output --> Scoring["Your scoring logic"]
    Scoring --> Decision["Informed choices"]
```

The harness is a **capture layer**: it executes prompts and records trajectories. Scoring happens in your codebase.

| Use Case | Harness Captures | You Build |
|----------|------------------|-----------|
| **Agent comparison** | Same prompts → multiple agents → trajectories | Scoring pipeline (Braintrust, custom) |
| **Tool comparison** | Trajectory with tool/skill attribution | Diff analysis, preference data |
| **Training data** | Structured I/O with tool calls, plans, thoughts | SFT/DPO formatting |
| **Regression testing** | Deterministic prompt → trajectory capture | Golden file comparison, CI assertions |

### Example: Comparing Built-in vs Skill

```bash
# Run the same prompts with the built-in tool
acp-harness prompts.jsonl \
  --cmd "bunx claude-code-acp" \
  -o results-builtin.jsonl

# Run the same prompts with a custom skill installed
acp-harness prompts.jsonl \
  --cmd "bunx claude-code-acp" \
  --cwd /project/with/typescript-lsp-skill \
  -o results-skill.jsonl

# Compare trajectories - which used better tools? faster? more accurate?
diff <(jq '.toolCalls' results-builtin.jsonl) <(jq '.toolCalls' results-skill.jsonl)
```

## Execution Environment

**Recommendation:** Run the harness in Docker containers for consistent, isolated execution.

```bash
# Build and run with Docker Compose
docker compose -f docker-compose.acp.yml run --rm acp-harness

# Or build directly
docker build -f Dockerfile.acp -t acp-harness .
docker run --rm -e ANTHROPIC_API_KEY acp-harness
```

Docker provides:
- A consistent environment across local and CI runs
- Filesystem isolation without app-level sandboxing
- Reproducible results for training data generation

See [assets/](assets/) for example container configurations:
- `Dockerfile.acp` - Base container with Bun and git
- `docker-compose.acp.yml` - Compose file with volume mounts for results

## Non-Goals

This harness is optimized for TypeScript/JavaScript projects using Bun. It is **not** designed for:

- **Python projects** - Use [SWE-bench](https://github.com/SWE-bench/SWE-bench) or the [Braintrust Python SDK](https://www.braintrust.dev/)
- **Academic model benchmarking** - Use [EleutherAI lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)
- **IDE integrations** - Use the Copilot Evaluation Harness
- **SaaS observability** - Use Braintrust or Langfuse directly

## Quick Reference

| Resource | Description |
|----------|-------------|
| `bunx @plaited/acp-harness` | Execute prompts against an agent, capture trajectories |
| [output-formats.md](references/output-formats.md) | JSONL schemas, format options |
| [downstream.md](references/downstream.md) | Integration patterns (Braintrust, jq, custom scorers) |

## Output Pipeline

```mermaid
flowchart LR
    Prompts["prompts.jsonl"] --> Harness["acp-harness"]
    Agent["ACP Agent"] --> Harness
    Harness --> Summary["summary.jsonl"]
    Harness --> Full["results.md + results.full.jsonl"]
    Summary --> Your["Your scoring code"]
    Full --> Your
```

1. **Prepare** - Create `prompts.jsonl` with test cases
2. **Execute** - Run the harness against the target agent
3. **Capture** - Trajectories are streamed to the output files
4. **Score** - Pipe the output to your scoring logic (Braintrust, jq, LLM-as-judge)

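A minimal end-to-end sketch of this pipeline, assuming a globally installed `acp-harness` and a trivial keyword check standing in for real scoring logic:

```typescript
// Sketch only: the keyword check below is a stand-in for your scoring logic.
import { $ } from 'bun'

// 1. Prepare - one prompt record in the input format described below
await Bun.write(
  'prompts.jsonl',
  JSON.stringify({ id: 'demo-001', input: 'Create a primary button', expected: 'should contain <button>' }) + '\n',
)

// 2-3. Execute and capture
await $`acp-harness prompts.jsonl -o results.jsonl`

// 4. Score - read the summary records and apply your own pass/fail rule
const lines = (await Bun.file('results.jsonl').text()).split('\n').filter(Boolean)
for (const line of lines) {
  const record = JSON.parse(line)
  console.log(record.id, record.output?.includes('<button>') ? 'pass' : 'fail')
}
```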
## Harness Script

### Basic Usage

```bash
acp-harness <prompts.jsonl> --cmd <cmd> [options]
```

### Arguments

| Flag | Description | Default |
|------|-------------|---------|
| `prompts.jsonl` | Input file with prompts to execute | Required |
| `--cmd, --command` | ACP agent command (e.g., `bunx claude-code-acp`, `bun ./adapter.ts`) | `"claude-code-acp"` |
| `-o, --output` | Output file/path | stdout |
| `-c, --cwd` | Working directory for agent | current |
| `-t, --timeout` | Request timeout in ms | `60000` |
| `-f, --format` | Output format: `summary`, `judge` | `summary` |
| `--progress` | Show progress to stderr | false |
| `--append` | Append to output file | false |
| `--mcp-server` | MCP server config JSON (repeatable) | none |

### Examples

```bash
# Using the default claude-code-acp adapter
acp-harness prompts.jsonl -o results.jsonl

# Using bunx to run an adapter
acp-harness prompts.jsonl --cmd "bunx claude-code-acp" -o results.jsonl

# Using a local adapter script (great for custom adapters in same repo)
acp-harness prompts.jsonl --cmd "bun ./my-adapter.ts" -o results.jsonl

# Judge format - creates two files for downstream scoring
acp-harness prompts.jsonl --format judge -o results
# Creates: results.md (summary with step IDs) + results.full.jsonl (complete trajectory)

# With MCP server (stdio transport)
acp-harness prompts.jsonl \
  --mcp-server '{"type":"stdio","name":"fs","command":["mcp-filesystem","/data"]}'

# With MCP server (HTTP transport)
acp-harness prompts.jsonl \
  --mcp-server '{"type":"http","name":"api","url":"http://localhost:3000"}'

# Stream with progress
acp-harness prompts.jsonl --progress -o results.jsonl
```

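For reference, an illustrative TypeScript sketch of the `--mcp-server` JSON shapes shown above. The union type is inferred from these two examples only and is not part of the package's public API:

```typescript
// Inferred from the stdio and http examples above; no extra fields are assumed.
type McpServerConfig =
  | { type: 'stdio'; name: string; command: string[] }
  | { type: 'http'; name: string; url: string }

const fsServer: McpServerConfig = {
  type: 'stdio',
  name: 'fs',
  command: ['mcp-filesystem', '/data'],
}

// Pass as a JSON string to the repeatable --mcp-server flag
console.log(`--mcp-server '${JSON.stringify(fsServer)}'`)
```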
## Input Format

Each line in `prompts.jsonl`:

```jsonl
{"id":"test-001","input":"Create a primary button","expected":"should contain <button>","metadata":{"category":"ui"}}
{"id":"test-002","input":"Write a function for form validation","metadata":{"category":"logic"}}
```

| Field | Required | Description |
|-------|----------|-------------|
| `id` | Yes | Unique identifier |
| `input` | Yes | Prompt text for the agent |
| `expected` | No | Expected output (for downstream scoring) |
| `metadata` | No | Tags, category, difficulty for filtering |
| `timeout` | No | Override default timeout for this prompt |

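A minimal TypeScript sketch of this record shape and a loader for `prompts.jsonl`. The `PromptRecord` name is illustrative, not part of the package's API:

```typescript
// Field names mirror the table above; metadata values are left loose on purpose.
type PromptRecord = {
  id: string
  input: string
  expected?: string
  metadata?: Record<string, unknown>
  timeout?: number
}

const text = await Bun.file('prompts.jsonl').text()
const prompts: PromptRecord[] = text
  .split('\n')
  .filter(Boolean)
  .map((line) => JSON.parse(line))

console.log(`Loaded ${prompts.length} prompts`)
```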
## Output Formats

### Summary Format (default)

Minimal JSONL for quick metrics and analysis:

```jsonl
{"id":"test-001","input":"Create a button","output":"I created...","toolCalls":["Write"],"status":"passed","duration":1234}
```

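A small sketch of the kind of quick metric this format supports, using only the `status` and `duration` fields shown above (real scoring belongs in your downstream pipeline):

```typescript
// Assumes results.jsonl was produced with the default summary format.
const records = (await Bun.file('results.jsonl').text())
  .split('\n')
  .filter(Boolean)
  .map((line) => JSON.parse(line))

const passed = records.filter((r) => r.status === 'passed').length
const avgMs = records.reduce((sum, r) => sum + r.duration, 0) / records.length
console.log(`${passed}/${records.length} passed, avg ${Math.round(avgMs)}ms`)
```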
### Judge Format (two-tier)

Creates two files optimized for downstream LLM-as-judge scoring:

**`<output>.md`** - Markdown summary with step IDs and code previews:

````markdown
## Capture Record: test-001

**Input:** Create a primary button

**Trajectory:**
1. [THOUGHT] I'll create a styled button... [->test-001-step-1]
2. [TOOL:Write] -> completed (234ms) [->test-001-step-2]
   File: src/button.tsx (847 chars)
   ```tsx
   import { css } from 'some-css-lib'

   type ButtonProps = {
     label: string

   // ... 30 lines omitted ...

   export const Button = ({ label }: ButtonProps) => (
     <button className={styles.btn}>{label}</button>
   )
   ```
3. [MESSAGE] I created the button... [->test-001-step-3]

**Output:** I created the button with primary styling.
**Metadata:** category=ui, agent=claude-code-acp
**Status:** passed
**Duration:** 1234ms

---
````

**`<output>.full.jsonl`** - Complete trajectory with step IDs for correlation:

```jsonl
{"id":"test-001","input":"...","output":"...","trajectory":[{"type":"thought","content":"...","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{...},"output":{...},"duration":234,"stepId":"test-001-step-2"}],...}
```

**Usage patterns by judge context window:**

| Judge Model | Strategy |
|-------------|----------|
| Gemini (1M+ tokens) | Feed `results.full.jsonl` directly |
| Claude/GPT-4 (128-200k) | Use `results.full.jsonl` for most runs |
| Smaller models | Use `results.md`, retrieve specific steps by ID as needed |

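For the smaller-model strategy, a minimal sketch of pulling a single step out of `results.full.jsonl` by its step ID, assuming only the `trajectory` and `stepId` fields shown above:

```typescript
// Retrieve one trajectory step so a small-context judge only sees what it needs.
const stepId = 'test-001-step-2'
const lines = (await Bun.file('results.full.jsonl').text()).split('\n').filter(Boolean)

for (const line of lines) {
  const record = JSON.parse(line)
  const step = record.trajectory?.find((s: { stepId?: string }) => s.stepId === stepId)
  if (step) {
    console.log(JSON.stringify(step, null, 2))
    break
  }
}
```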
## Downstream Integration

The harness outputs standard JSONL that pipes to any tool:

```bash
# Filter with jq
cat results.jsonl | jq 'select(.metadata.category == "ui")'

# Count tool usage
cat results.jsonl | jq -s 'map(.toolCalls | length) | add'

# Feed full trajectory to Gemini (large context)
cat results.full.jsonl | your-gemini-judge.ts
```

See [downstream.md](references/downstream.md) for integration patterns with Braintrust, Gemini, and custom scorers.

## Capture Targets

| Target | How to Capture |
|--------|----------------|
| **Agent capability** | Direct prompts, capture trajectory for analysis |
| **Skills** | Set `--cwd` to project with skill, capture skill-specific behavior |
| **MCP Servers** | Use `--mcp-server` flag, capture tool usage in trajectory |

### Capturing Skill Behavior

```bash
bunx @plaited/acp-harness skill-prompts.jsonl \
  --cwd /project/with/skill \
  -o results.jsonl
```

### Capturing MCP Server Usage

```bash
bunx @plaited/acp-harness mcp-prompts.jsonl \
  --mcp-server '{"type":"stdio","name":"fs","command":["mcp-filesystem"]}' \
  -o results.jsonl
```

## Related

- **[@agentclientprotocol/sdk](https://www.npmjs.com/package/@agentclientprotocol/sdk)** - ACP SDK for programmatic access
- **[@zed-industries/claude-code-acp](https://www.npmjs.com/package/@zed-industries/claude-code-acp)** - Claude Code ACP adapter

package/.claude/skills/acp-harness/assets/Dockerfile.acp
@@ -0,0 +1,25 @@

# ACP Harness Docker Configuration
#
# Example Dockerfile for running ACP evaluations in an isolated container.
# Copy this to your project and customize as needed.
#
# Usage:
#   docker build -f Dockerfile.acp -t acp-harness .
#   docker run --rm -e ANTHROPIC_API_KEY acp-harness bunx @plaited/acp-harness prompts.jsonl

FROM oven/bun:1.2.9

# Install git (required for some agent operations)
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy package files first for better layer caching
COPY package.json bun.lock* ./
RUN bun install --frozen-lockfile

# Copy source files
COPY . .

# Default command - override with your harness invocation
CMD ["bun", "test"]

package/.claude/skills/acp-harness/assets/docker-compose.acp.yml
@@ -0,0 +1,19 @@

# ACP Harness Docker Compose Configuration
#
# Example docker-compose for running ACP evaluations.
# Copy this to your project and customize as needed.
#
# Usage:
#   ANTHROPIC_API_KEY=sk-... docker compose -f docker-compose.acp.yml run --rm acp-harness

services:
  acp-harness:
    build:
      context: .
      dockerfile: Dockerfile.acp
    environment:
      - ANTHROPIC_API_KEY
    volumes:
      # Mount output directory to persist results
      - ./results:/app/results
    command: ["bunx", "@plaited/acp-harness", "prompts.jsonl", "-o", "results/output.jsonl"]