@plaited/acp-harness 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/rules/accuracy.md +43 -0
- package/.claude/rules/bun-apis.md +80 -0
- package/.claude/rules/code-review.md +254 -0
- package/.claude/rules/git-workflow.md +37 -0
- package/.claude/rules/github.md +154 -0
- package/.claude/rules/testing.md +172 -0
- package/.claude/skills/acp-harness/SKILL.md +310 -0
- package/.claude/skills/acp-harness/assets/Dockerfile.acp +25 -0
- package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +19 -0
- package/.claude/skills/acp-harness/references/downstream.md +288 -0
- package/.claude/skills/acp-harness/references/output-formats.md +221 -0
- package/.claude-plugin/marketplace.json +15 -0
- package/.claude-plugin/plugin.json +16 -0
- package/.github/CODEOWNERS +6 -0
- package/.github/workflows/ci.yml +63 -0
- package/.github/workflows/publish.yml +146 -0
- package/.mcp.json +20 -0
- package/CLAUDE.md +92 -0
- package/Dockerfile.test +23 -0
- package/LICENSE +15 -0
- package/README.md +94 -0
- package/bin/cli.ts +670 -0
- package/bin/tests/cli.spec.ts +362 -0
- package/biome.json +96 -0
- package/bun.lock +513 -0
- package/docker-compose.test.yml +21 -0
- package/package.json +57 -0
- package/scripts/bun-test-wrapper.sh +46 -0
- package/src/acp-client.ts +503 -0
- package/src/acp-helpers.ts +121 -0
- package/src/acp-transport.ts +455 -0
- package/src/acp-utils.ts +341 -0
- package/src/acp.constants.ts +56 -0
- package/src/acp.schemas.ts +161 -0
- package/src/acp.ts +27 -0
- package/src/acp.types.ts +28 -0
- package/src/tests/acp-client.spec.ts +205 -0
- package/src/tests/acp-helpers.spec.ts +105 -0
- package/src/tests/acp-integration.docker.ts +214 -0
- package/src/tests/acp-transport.spec.ts +153 -0
- package/src/tests/acp-utils.spec.ts +394 -0
- package/src/tests/fixtures/.claude/settings.local.json +8 -0
- package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +17 -0
- package/src/tests/fixtures/calculator-mcp.ts +215 -0
- package/tsconfig.json +32 -0
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
# Downstream Integration
|
|
2
|
+
|
|
3
|
+
Patterns for piping harness output to analysis tools.
|
|
4
|
+
|
|
5
|
+
## Loading Results
|
|
6
|
+
|
|
7
|
+
Both output formats use JSONL (newline-delimited JSON):
|
|
8
|
+
|
|
9
|
+
```typescript
|
|
10
|
+
// TypeScript pattern (validated in tests)
|
|
11
|
+
const parseResults = (jsonl: string) =>
|
|
12
|
+
jsonl.trim().split('\n').map((line) => JSON.parse(line))
|
|
13
|
+
|
|
14
|
+
// Load from file
|
|
15
|
+
const results = parseResults(await Bun.file('results.jsonl').text())
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## jq Analysis
|
|
19
|
+
|
|
20
|
+
Summary JSONL is designed for quick analysis with `jq`:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Calculate average duration
|
|
24
|
+
cat results.jsonl | jq -s 'map(.duration) | add / length'
|
|
25
|
+
|
|
26
|
+
# Count tool usage
|
|
27
|
+
cat results.jsonl | jq -s 'map(.toolCalls) | flatten | group_by(.) | map({tool: .[0], count: length})'
|
|
28
|
+
|
|
29
|
+
# Filter by status
|
|
30
|
+
cat results.jsonl | jq 'select(.status == "failed")'
|
|
31
|
+
|
|
32
|
+
# Pass rate
|
|
33
|
+
cat results.jsonl | jq -s 'length as $t | map(select(.status == "passed")) | length as $p | "\($p)/\($t) passed"'
|
|
34
|
+
|
|
35
|
+
# Group by category
|
|
36
|
+
cat results.jsonl | jq -s 'group_by(.metadata.category) | map({category: .[0].metadata.category, count: length})'
|
|
37
|
+
|
|
38
|
+
# Find slowest runs
|
|
39
|
+
cat results.jsonl | jq -s 'sort_by(-.duration) | .[0:5] | map({id, duration})'
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## TypeScript Analysis Patterns
|
|
43
|
+
|
|
44
|
+
These patterns are validated by tests in `bin/tests/cli.spec.ts`:
|
|
45
|
+
|
|
46
|
+
### Filter by Status
|
|
47
|
+
|
|
48
|
+
```typescript
|
|
49
|
+
const failed = results.filter((r) => r.status === 'failed')
|
|
50
|
+
const passed = results.filter((r) => r.status === 'passed')
|
|
51
|
+
const passRate = passed.length / results.length
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Filter by Tool Usage
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
// Find runs that used Write tool
|
|
58
|
+
const withWrite = results.filter((r) => r.toolCalls.includes('Write'))
|
|
59
|
+
|
|
60
|
+
// Find runs that used multiple tools
|
|
61
|
+
const multiTool = results.filter((r) => r.toolCalls.length > 1)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Filter by Duration
|
|
65
|
+
|
|
66
|
+
```typescript
|
|
67
|
+
// Slow runs (> 2 seconds)
|
|
68
|
+
const slow = results.filter((r) => r.duration > 2000)
|
|
69
|
+
|
|
70
|
+
// Find top 5 slowest
|
|
71
|
+
const slowest = [...results].sort((a, b) => b.duration - a.duration).slice(0, 5)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Filter by Metadata
|
|
75
|
+
|
|
76
|
+
```typescript
|
|
77
|
+
// Filter by category
|
|
78
|
+
const uiResults = results.filter((r) => r.metadata.category === 'ui')
|
|
79
|
+
|
|
80
|
+
// Group and count by category
|
|
81
|
+
const grouped = results.reduce<Record<string, number>>((acc, r) => {
|
|
82
|
+
const cat = r.metadata.category as string
|
|
83
|
+
acc[cat] = (acc[cat] ?? 0) + 1
|
|
84
|
+
return acc
|
|
85
|
+
}, {})
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Count Tool Usage
|
|
89
|
+
|
|
90
|
+
```typescript
|
|
91
|
+
const allTools = results.flatMap((r) => r.toolCalls)
|
|
92
|
+
const toolCounts = allTools.reduce<Record<string, number>>((acc, tool) => {
|
|
93
|
+
acc[tool] = (acc[tool] ?? 0) + 1
|
|
94
|
+
return acc
|
|
95
|
+
}, {})
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Deduplicate by ID
|
|
99
|
+
|
|
100
|
+
```typescript
|
|
101
|
+
// Keep latest occurrence when merging multiple runs
|
|
102
|
+
const byId = new Map<string, unknown>()
|
|
103
|
+
for (const result of results) {
|
|
104
|
+
byId.set(result.id, result)
|
|
105
|
+
}
|
|
106
|
+
const deduped = Array.from(byId.values())
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Step-Level Retrieval
|
|
110
|
+
|
|
111
|
+
For judge format, correlate markdown step IDs with full JSONL:
|
|
112
|
+
|
|
113
|
+
```typescript
|
|
114
|
+
// Load both files
|
|
115
|
+
const markdown = await Bun.file('results.md').text()
|
|
116
|
+
const fullResults = parseResults(await Bun.file('results.full.jsonl').text())
|
|
117
|
+
|
|
118
|
+
// Build step index
|
|
119
|
+
const stepIndex = new Map<string, unknown>()
|
|
120
|
+
for (const result of fullResults) {
|
|
121
|
+
for (const step of result.trajectory) {
|
|
122
|
+
stepIndex.set(step.stepId, step)
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// Retrieve full step by ID (from markdown [→stepId])
|
|
127
|
+
const stepId = 'test-001-step-2'
|
|
128
|
+
const fullStep = stepIndex.get(stepId) as { name: string; input: unknown }
|
|
129
|
+
console.log('Tool name:', fullStep.name)
|
|
130
|
+
console.log('Full input:', fullStep.input)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Extract Tool Calls from Trajectory
|
|
134
|
+
|
|
135
|
+
```typescript
|
|
136
|
+
const toolCalls = result.trajectory.filter((s) => s.type === 'tool_call')
|
|
137
|
+
const toolNames = toolCalls.map((t) => t.name)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Timing Information
|
|
141
|
+
|
|
142
|
+
```typescript
|
|
143
|
+
const result = results[0]
|
|
144
|
+
const duration = result.timing.end - result.timing.start
|
|
145
|
+
const timeToFirstResponse = result.timing.firstResponse // ms after start
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## LLM-as-Judge
|
|
149
|
+
|
|
150
|
+
### Large Context Models (Gemini 1M+)
|
|
151
|
+
|
|
152
|
+
Feed full trajectory directly:
|
|
153
|
+
|
|
154
|
+
```typescript
|
|
155
|
+
import { GoogleGenerativeAI } from '@google/generative-ai'
|
|
156
|
+
|
|
157
|
+
const genAI = new GoogleGenerativeAI(process.env.GOOGLE_API_KEY!)
|
|
158
|
+
const model = genAI.getGenerativeModel({ model: 'gemini-2.5-pro' })
|
|
159
|
+
|
|
160
|
+
const results = parseResults(await Bun.file('results.full.jsonl').text())
|
|
161
|
+
|
|
162
|
+
const prompt = `
|
|
163
|
+
Evaluate these agent trajectories for code quality and reasoning.
|
|
164
|
+
|
|
165
|
+
${JSON.stringify(results, null, 2)}
|
|
166
|
+
|
|
167
|
+
For each evaluation, score 1-3:
|
|
168
|
+
- 1: Major issues (wrong tools, broken logic, incorrect output)
|
|
169
|
+
- 2: Minor issues (inefficient but correct)
|
|
170
|
+
- 3: Excellent (efficient trajectory, correct output)
|
|
171
|
+
|
|
172
|
+
Respond as JSON array: [{"id": "...", "score": N, "reasoning": "..."}]
|
|
173
|
+
`
|
|
174
|
+
|
|
175
|
+
const response = await model.generateContent(prompt)
|
|
176
|
+
console.log(response.response.text())
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Medium Context Models (Claude 200k)
|
|
180
|
+
|
|
181
|
+
Use the markdown summary (`results.md`) for most runs:
|
|
182
|
+
|
|
183
|
+
```typescript
|
|
184
|
+
import Anthropic from '@anthropic-ai/sdk'
|
|
185
|
+
|
|
186
|
+
const client = new Anthropic()
|
|
187
|
+
const markdown = await Bun.file('results.md').text()
|
|
188
|
+
|
|
189
|
+
const response = await client.messages.create({
|
|
190
|
+
model: 'claude-sonnet-4-20250514',
|
|
191
|
+
max_tokens: 4096,
|
|
192
|
+
messages: [{
|
|
193
|
+
role: 'user',
|
|
194
|
+
content: `Evaluate these agent trajectories:\n\n${markdown}\n\nScore each 1-3 and explain.`
|
|
195
|
+
}]
|
|
196
|
+
})
|
|
197
|
+
|
|
198
|
+
console.log(response.content[0].text)
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## Braintrust Integration
|
|
202
|
+
|
|
203
|
+
Upload results programmatically:
|
|
204
|
+
|
|
205
|
+
```typescript
|
|
206
|
+
import { initLogger } from 'braintrust'
|
|
207
|
+
|
|
208
|
+
const logger = initLogger({
|
|
209
|
+
projectName: 'agent-eval',
|
|
210
|
+
apiKey: process.env.BRAINTRUST_API_KEY,
|
|
211
|
+
})
|
|
212
|
+
|
|
213
|
+
const results = parseResults(await Bun.file('results.jsonl').text())
|
|
214
|
+
|
|
215
|
+
for (const result of results) {
|
|
216
|
+
logger.log({
|
|
217
|
+
input: result.input,
|
|
218
|
+
output: result.output,
|
|
219
|
+
expected: result.expected,
|
|
220
|
+
scores: {
|
|
221
|
+
passed: result.status === 'passed' ? 1 : 0,
|
|
222
|
+
duration_ms: result.duration,
|
|
223
|
+
},
|
|
224
|
+
metadata: {
|
|
225
|
+
...result.metadata,
|
|
226
|
+
toolCalls: result.toolCalls,
|
|
227
|
+
},
|
|
228
|
+
})
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
await logger.flush()
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## CI Integration
|
|
235
|
+
|
|
236
|
+
### GitHub Actions
|
|
237
|
+
|
|
238
|
+
```yaml
|
|
239
|
+
name: Agent Eval
|
|
240
|
+
on:
|
|
241
|
+
schedule:
|
|
242
|
+
- cron: '0 0 * * 0' # Weekly
|
|
243
|
+
|
|
244
|
+
jobs:
|
|
245
|
+
eval:
|
|
246
|
+
runs-on: ubuntu-latest
|
|
247
|
+
steps:
|
|
248
|
+
- uses: actions/checkout@v4
|
|
249
|
+
- uses: oven-sh/setup-bun@v2
|
|
250
|
+
|
|
251
|
+
- name: Install ACP adapter
|
|
252
|
+
run: npm install -g @zed-industries/claude-code-acp
|
|
253
|
+
|
|
254
|
+
- name: Install dependencies
|
|
255
|
+
run: bun add @plaited/acp-harness
|
|
256
|
+
|
|
257
|
+
- name: Run harness
|
|
258
|
+
env:
|
|
259
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
260
|
+
run: |
|
|
261
|
+
bunx @plaited/acp-harness prompts.jsonl \
|
|
262
|
+
--format judge \
|
|
263
|
+
--progress \
|
|
264
|
+
-o eval-results
|
|
265
|
+
|
|
266
|
+
- name: Upload results
|
|
267
|
+
uses: actions/upload-artifact@v4
|
|
268
|
+
with:
|
|
269
|
+
name: eval-results
|
|
270
|
+
path: |
|
|
271
|
+
eval-results.md
|
|
272
|
+
eval-results.full.jsonl
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
## Output Aggregation
|
|
276
|
+
|
|
277
|
+
Combine multiple runs:
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
# Append mode during runs
|
|
281
|
+
bunx @plaited/acp-harness prompts-1.jsonl --append -o combined.jsonl
|
|
282
|
+
bunx @plaited/acp-harness prompts-2.jsonl --append -o combined.jsonl
|
|
283
|
+
|
|
284
|
+
# Merge separate files
|
|
285
|
+
cat run1.jsonl run2.jsonl run3.jsonl > combined.jsonl
|
|
286
|
+
|
|
287
|
+
# Dedupe by ID (keep latest) - use TypeScript pattern above
|
|
288
|
+
```
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# Output Formats
|
|
2
|
+
|
|
3
|
+
The harness supports two output formats optimized for different use cases.
|
|
4
|
+
|
|
5
|
+
## Format Selection
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
bunx @plaited/acp-harness prompts.jsonl --format <format> -o <output>
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
| Format | Files Created | Use Case |
|
|
12
|
+
|--------|---------------|----------|
|
|
13
|
+
| `summary` | Single JSONL | Quick metrics, dashboards, jq analysis |
|
|
14
|
+
| `judge` | `.md` + `.full.jsonl` | Downstream LLM-as-judge scoring |
|
|
15
|
+
|
|
16
|
+
## Summary Format (Default)
|
|
17
|
+
|
|
18
|
+
Minimal JSONL for quick metrics and analysis.
|
|
19
|
+
|
|
20
|
+
### Schema
|
|
21
|
+
|
|
22
|
+
```typescript
|
|
23
|
+
type SummaryResult = {
|
|
24
|
+
id: string // Prompt identifier
|
|
25
|
+
input: string // Original prompt text
|
|
26
|
+
output: string // Final agent response
|
|
27
|
+
toolCalls: string[] // List of tool names used
|
|
28
|
+
status: 'passed' | 'failed' | 'error' | 'timeout'
|
|
29
|
+
duration: number // Total execution time (ms)
|
|
30
|
+
}
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Example Output
|
|
34
|
+
|
|
35
|
+
```jsonl
|
|
36
|
+
{"id":"test-001","input":"Create a primary button","output":"I created the button in src/button.tsx","toolCalls":["Write"],"status":"passed","duration":1234}
|
|
37
|
+
{"id":"test-002","input":"Fix the TypeScript error","output":"I fixed the type error...","toolCalls":["Read","Edit"],"status":"passed","duration":2567}
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Analysis with jq
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Calculate average duration
|
|
44
|
+
cat results.jsonl | jq -s 'map(.duration) | add / length'
|
|
45
|
+
|
|
46
|
+
# Count tool usage
|
|
47
|
+
cat results.jsonl | jq -s 'map(.toolCalls) | flatten | group_by(.) | map({tool: .[0], count: length})'
|
|
48
|
+
|
|
49
|
+
# Filter by status
|
|
50
|
+
cat results.jsonl | jq 'select(.status == "failed")'
|
|
51
|
+
|
|
52
|
+
# Pass rate
|
|
53
|
+
cat results.jsonl | jq -s 'length as $t | map(select(.status == "passed")) | length as $p | "\($p)/\($t) passed"'
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Judge Format (Two-Tier)
|
|
57
|
+
|
|
58
|
+
Creates two files for downstream LLM-as-judge scoring with step-level correlation.
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
bunx @plaited/acp-harness prompts.jsonl --format judge -o results
|
|
62
|
+
# Creates: results.md + results.full.jsonl
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Markdown File (`<output>.md`)
|
|
66
|
+
|
|
67
|
+
Human-readable summary with step IDs and code previews.
|
|
68
|
+
|
|
69
|
+
**Structure:**
|
|
70
|
+
|
|
71
|
+
```markdown
|
|
72
|
+
## Capture Record: <id>
|
|
73
|
+
|
|
74
|
+
**Input:** <original prompt>
|
|
75
|
+
|
|
76
|
+
**Trajectory:**
|
|
77
|
+
1. [THOUGHT] <truncated content> [->stepId]
|
|
78
|
+
2. [TOOL:<name>] -> <status> (<duration>ms) [->stepId]
|
|
79
|
+
File: <path> (<size> chars)
|
|
80
|
+
```<ext>
|
|
81
|
+
<head lines>
|
|
82
|
+
|
|
83
|
+
// ... N lines omitted ...
|
|
84
|
+
|
|
85
|
+
<tail lines>
|
|
86
|
+
```
|
|
87
|
+
3. [PLAN] <plan summary> [->stepId]
|
|
88
|
+
4. [MESSAGE] <truncated content> [->stepId]
|
|
89
|
+
|
|
90
|
+
**Output:** <truncated final output>
|
|
91
|
+
**Metadata:** category=ui, agent=claude-code-acp, ...
|
|
92
|
+
**Status:** passed|failed|error|timeout
|
|
93
|
+
**Duration:** <ms>ms
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**Step ID Format:** `<prompt-id>-step-<N>` (e.g., `test-001-step-2`)
|
|
99
|
+
|
|
100
|
+
**Truncation Rules:**
|
|
101
|
+
- Thought/message content: First 100 characters
|
|
102
|
+
- Output: First 200 characters
|
|
103
|
+
- Code preview: Head (8 lines) + tail (4 lines) for files > 12 lines
|
|
104
|
+
|
|
105
|
+
### Full JSONL File (`<output>.full.jsonl`)
|
|
106
|
+
|
|
107
|
+
Complete trajectory with step IDs for correlation.
|
|
108
|
+
|
|
109
|
+
**Schema:**
|
|
110
|
+
|
|
111
|
+
```typescript
|
|
112
|
+
type FullResult = {
|
|
113
|
+
id: string
|
|
114
|
+
input: string
|
|
115
|
+
output: string
|
|
116
|
+
expected?: string
|
|
117
|
+
trajectory: IndexedStep[] // Steps with stepId
|
|
118
|
+
metadata: Record<string, unknown>
|
|
119
|
+
timing: {
|
|
120
|
+
start: number // Unix timestamp (ms)
|
|
121
|
+
end: number // Unix timestamp (ms)
|
|
122
|
+
firstResponse?: number // Time to first response (ms)
|
|
123
|
+
}
|
|
124
|
+
status: 'passed' | 'failed' | 'error' | 'timeout'
|
|
125
|
+
errors?: string[]
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
type IndexedStep = TrajectoryStep & { stepId: string }
|
|
129
|
+
|
|
130
|
+
type TrajectoryStep =
|
|
131
|
+
| { type: 'thought'; content: string; timestamp: number }
|
|
132
|
+
| { type: 'message'; content: string; timestamp: number }
|
|
133
|
+
| {
|
|
134
|
+
type: 'tool_call'
|
|
135
|
+
name: string // Tool title from ACP SDK
|
|
136
|
+
status: string // pending, in_progress, completed, failed
|
|
137
|
+
input?: unknown // Raw input parameters
|
|
138
|
+
output?: unknown // Raw output
|
|
139
|
+
duration?: number // Execution time (ms)
|
|
140
|
+
timestamp: number
|
|
141
|
+
}
|
|
142
|
+
| { type: 'plan'; entries: PlanEntry[]; timestamp: number }
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
**Example:**
|
|
146
|
+
|
|
147
|
+
```jsonl
|
|
148
|
+
{"id":"test-001","input":"Create a primary button","output":"I created the button...","trajectory":[{"type":"thought","content":"I'll create a styled button template with createStyles","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"import { createStyles }..."},"output":"File written successfully","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button template","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui","agent":"claude"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"status":"passed"}
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Two-Tier Scoring Workflow
|
|
152
|
+
|
|
153
|
+
### Direct Scoring (Large Context)
|
|
154
|
+
|
|
155
|
+
For judges with large context windows (Gemini 1M+, Claude 200k):
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
# Feed full JSONL directly
|
|
159
|
+
cat results.full.jsonl | your-gemini-judge.ts
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Step-Level Retrieval (Small Context)
|
|
163
|
+
|
|
164
|
+
For smaller models or step-specific analysis:
|
|
165
|
+
|
|
166
|
+
```typescript
|
|
167
|
+
// Load both files
|
|
168
|
+
const markdown = await Bun.file('results.md').text()
|
|
169
|
+
const fullLines = (await Bun.file('results.full.jsonl').text()).trim().split('\n')
|
|
170
|
+
|
|
171
|
+
// Parse full results indexed by step ID
|
|
172
|
+
const stepIndex = new Map<string, unknown>()
|
|
173
|
+
for (const line of fullLines) {
|
|
174
|
+
const result = JSON.parse(line)
|
|
175
|
+
for (const step of result.trajectory) {
|
|
176
|
+
stepIndex.set(step.stepId, step)
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
// Judge requests full content for specific step
|
|
181
|
+
const stepId = 'test-001-step-2' // From markdown [->stepId]
|
|
182
|
+
const fullStep = stepIndex.get(stepId) as { name: string; input: unknown }
|
|
183
|
+
console.log(fullStep.input) // Complete tool input
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Status Values
|
|
187
|
+
|
|
188
|
+
| Status | Meaning |
|
|
189
|
+
|--------|---------|
|
|
190
|
+
| `passed` | Completed without tool errors |
|
|
191
|
+
| `failed` | Completed but one or more tool calls failed |
|
|
192
|
+
| `error` | Unhandled exception during execution |
|
|
193
|
+
| `timeout` | Request exceeded timeout limit |
|
|
194
|
+
|
|
195
|
+
## Input Format
|
|
196
|
+
|
|
197
|
+
Both formats accept the same JSONL input:
|
|
198
|
+
|
|
199
|
+
```jsonl
|
|
200
|
+
{"id":"test-001","input":"Create a primary button","expected":"should contain <button>","metadata":{"category":"ui"}}
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
| Field | Required | Description |
|
|
204
|
+
|-------|----------|-------------|
|
|
205
|
+
| `id` | Yes | Unique identifier |
|
|
206
|
+
| `input` | Yes | Prompt text for the agent |
|
|
207
|
+
| `expected` | No | Expected output (for downstream scoring) |
|
|
208
|
+
| `metadata` | No | Tags, category, difficulty for filtering |
|
|
209
|
+
| `timeout` | No | Override default timeout for this prompt |
|
|
210
|
+
|
|
211
|
+
## Streaming Behavior
|
|
212
|
+
|
|
213
|
+
Both formats stream output line-by-line as results complete:
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
# Watch results in real-time
|
|
217
|
+
bunx @plaited/acp-harness prompts.jsonl --progress -o results.jsonl &
|
|
218
|
+
tail -f results.jsonl
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
Use `--append` to continue interrupted runs without overwriting previous results.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "plaited-acp-harness",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"description": "Claude Code plugin for Agent Client Protocol testing and evaluation"
|
|
5
|
+
},
|
|
6
|
+
"owner": {
|
|
7
|
+
"name": "Plaited Labs"
|
|
8
|
+
},
|
|
9
|
+
"plugins": [
|
|
10
|
+
{
|
|
11
|
+
"name": "acp-harness",
|
|
12
|
+
"source": "./"
|
|
13
|
+
}
|
|
14
|
+
]
|
|
15
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "acp-harness",
|
|
3
|
+
"version": "0.2.5",
|
|
4
|
+
"description": "Agent Client Protocol client and evaluation harness for TypeScript/Bun projects. Includes: createACPClient for programmatic agent access, run-harness.ts for trajectory capture, and LLM-as-judge evaluation templates. Requires Bun runtime.",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "Plaited Labs"
|
|
7
|
+
},
|
|
8
|
+
"license": "ISC",
|
|
9
|
+
"mcpServers": {
|
|
10
|
+
"agent-client-protocol": {
|
|
11
|
+
"type": "http",
|
|
12
|
+
"url": "https://agentclientprotocol.com/mcp"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"skills": "./.claude/skills"
|
|
16
|
+
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
# Code owners for acp-harness repository
|
|
2
|
+
# These users will be automatically requested for review when someone opens a pull request.
|
|
3
|
+
# See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
|
|
4
|
+
|
|
5
|
+
# Organization admins own all files
|
|
6
|
+
* @EdwardIrby @alisonailea
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
pull_request:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
# Detect which paths changed to conditionally run expensive jobs
|
|
14
|
+
changes:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
permissions:
|
|
17
|
+
pull-requests: read
|
|
18
|
+
outputs:
|
|
19
|
+
acp: ${{ steps.filter.outputs.acp }}
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
- uses: dorny/paths-filter@v3
|
|
23
|
+
id: filter
|
|
24
|
+
with:
|
|
25
|
+
filters: |
|
|
26
|
+
acp:
|
|
27
|
+
- 'src/**'
|
|
28
|
+
|
|
29
|
+
test-pr:
|
|
30
|
+
runs-on: ubuntu-latest
|
|
31
|
+
|
|
32
|
+
steps:
|
|
33
|
+
- uses: actions/checkout@v4
|
|
34
|
+
- uses: oven-sh/setup-bun@v2
|
|
35
|
+
with:
|
|
36
|
+
bun-version: latest
|
|
37
|
+
- name: Install dependencies
|
|
38
|
+
run: bun install
|
|
39
|
+
- name: Run check
|
|
40
|
+
run: bun run check
|
|
41
|
+
- name: Run test
|
|
42
|
+
run: bun run test
|
|
43
|
+
|
|
44
|
+
# ACP integration tests run in Docker container for consistent environment
|
|
45
|
+
# Only runs when src/ files change to reduce API costs
|
|
46
|
+
test-acp-integration:
|
|
47
|
+
needs: changes
|
|
48
|
+
if: ${{ needs.changes.outputs.acp == 'true' }}
|
|
49
|
+
runs-on: ubuntu-latest
|
|
50
|
+
container:
|
|
51
|
+
image: oven/bun:1.2.9
|
|
52
|
+
options: --user root
|
|
53
|
+
|
|
54
|
+
steps:
|
|
55
|
+
- uses: actions/checkout@v4
|
|
56
|
+
|
|
57
|
+
- name: Install dependencies
|
|
58
|
+
run: bun install --frozen-lockfile
|
|
59
|
+
|
|
60
|
+
- name: Run ACP integration tests
|
|
61
|
+
env:
|
|
62
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
63
|
+
run: bun test ./src/tests/acp-integration.docker.ts
|