@plaited/acp-harness 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/.claude/rules/accuracy.md +43 -0
  2. package/.claude/rules/bun-apis.md +80 -0
  3. package/.claude/rules/code-review.md +254 -0
  4. package/.claude/rules/git-workflow.md +37 -0
  5. package/.claude/rules/github.md +154 -0
  6. package/.claude/rules/testing.md +172 -0
  7. package/.claude/skills/acp-harness/SKILL.md +310 -0
  8. package/.claude/skills/acp-harness/assets/Dockerfile.acp +25 -0
  9. package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +19 -0
  10. package/.claude/skills/acp-harness/references/downstream.md +288 -0
  11. package/.claude/skills/acp-harness/references/output-formats.md +221 -0
  12. package/.claude-plugin/marketplace.json +15 -0
  13. package/.claude-plugin/plugin.json +16 -0
  14. package/.github/CODEOWNERS +6 -0
  15. package/.github/workflows/ci.yml +63 -0
  16. package/.github/workflows/publish.yml +146 -0
  17. package/.mcp.json +20 -0
  18. package/CLAUDE.md +92 -0
  19. package/Dockerfile.test +23 -0
  20. package/LICENSE +15 -0
  21. package/README.md +94 -0
  22. package/bin/cli.ts +670 -0
  23. package/bin/tests/cli.spec.ts +362 -0
  24. package/biome.json +96 -0
  25. package/bun.lock +513 -0
  26. package/docker-compose.test.yml +21 -0
  27. package/package.json +57 -0
  28. package/scripts/bun-test-wrapper.sh +46 -0
  29. package/src/acp-client.ts +503 -0
  30. package/src/acp-helpers.ts +121 -0
  31. package/src/acp-transport.ts +455 -0
  32. package/src/acp-utils.ts +341 -0
  33. package/src/acp.constants.ts +56 -0
  34. package/src/acp.schemas.ts +161 -0
  35. package/src/acp.ts +27 -0
  36. package/src/acp.types.ts +28 -0
  37. package/src/tests/acp-client.spec.ts +205 -0
  38. package/src/tests/acp-helpers.spec.ts +105 -0
  39. package/src/tests/acp-integration.docker.ts +214 -0
  40. package/src/tests/acp-transport.spec.ts +153 -0
  41. package/src/tests/acp-utils.spec.ts +394 -0
  42. package/src/tests/fixtures/.claude/settings.local.json +8 -0
  43. package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +17 -0
  44. package/src/tests/fixtures/calculator-mcp.ts +215 -0
  45. package/tsconfig.json +32 -0
@@ -0,0 +1,288 @@
1
+ # Downstream Integration
2
+
3
+ Patterns for piping harness output to analysis tools.
4
+
5
+ ## Loading Results
6
+
7
+ Both output formats use JSONL (newline-delimited JSON):
8
+
9
+ ```typescript
10
+ // TypeScript pattern (validated in tests)
11
+ const parseResults = (jsonl: string) =>
12
+ jsonl.trim().split('\n').map((line) => JSON.parse(line))
13
+
14
+ // Load from file
15
+ const results = parseResults(await Bun.file('results.jsonl').text())
16
+ ```
17
+
18
+ ## jq Analysis
19
+
20
+ Summary JSONL is designed for quick analysis with `jq`:
21
+
22
+ ```bash
23
+ # Calculate average duration
24
+ cat results.jsonl | jq -s 'map(.duration) | add / length'
25
+
26
+ # Count tool usage
27
+ cat results.jsonl | jq -s 'map(.toolCalls) | flatten | group_by(.) | map({tool: .[0], count: length})'
28
+
29
+ # Filter by status
30
+ cat results.jsonl | jq 'select(.status == "failed")'
31
+
32
+ # Pass rate
33
+ cat results.jsonl | jq -s 'length as $t | map(select(.status == "passed")) | length as $p | "\($p)/\($t) passed"'
34
+
35
+ # Group by category
36
+ cat results.jsonl | jq -s 'group_by(.metadata.category) | map({category: .[0].metadata.category, count: length})'
37
+
38
+ # Find slowest runs
39
+ cat results.jsonl | jq -s 'sort_by(-.duration) | .[0:5] | map({id, duration})'
40
+ ```
41
+
42
+ ## TypeScript Analysis Patterns
43
+
44
+ These patterns are validated by tests in `bin/tests/cli.spec.ts`:
45
+
46
+ ### Filter by Status
47
+
48
+ ```typescript
49
+ const failed = results.filter((r) => r.status === 'failed')
50
+ const passed = results.filter((r) => r.status === 'passed')
51
+ const passRate = passed.length / results.length
52
+ ```
53
+
54
+ ### Filter by Tool Usage
55
+
56
+ ```typescript
57
+ // Find runs that used Write tool
58
+ const withWrite = results.filter((r) => r.toolCalls.includes('Write'))
59
+
60
+ // Find runs that used multiple tools
61
+ const multiTool = results.filter((r) => r.toolCalls.length > 1)
62
+ ```
63
+
64
+ ### Filter by Duration
65
+
66
+ ```typescript
67
+ // Slow runs (> 2 seconds)
68
+ const slow = results.filter((r) => r.duration > 2000)
69
+
70
+ // Find top 5 slowest
71
+ const slowest = [...results].sort((a, b) => b.duration - a.duration).slice(0, 5)
72
+ ```
73
+
74
+ ### Filter by Metadata
75
+
76
+ ```typescript
77
+ // Filter by category
78
+ const uiResults = results.filter((r) => r.metadata.category === 'ui')
79
+
80
+ // Group and count by category
81
+ const grouped = results.reduce<Record<string, number>>((acc, r) => {
82
+ const cat = r.metadata.category as string
83
+ acc[cat] = (acc[cat] ?? 0) + 1
84
+ return acc
85
+ }, {})
86
+ ```
87
+
88
+ ### Count Tool Usage
89
+
90
+ ```typescript
91
+ const allTools = results.flatMap((r) => r.toolCalls)
92
+ const toolCounts = allTools.reduce<Record<string, number>>((acc, tool) => {
93
+ acc[tool] = (acc[tool] ?? 0) + 1
94
+ return acc
95
+ }, {})
96
+ ```
97
+
98
+ ### Deduplicate by ID
99
+
100
+ ```typescript
101
+ // Keep latest occurrence when merging multiple runs
102
+ const byId = new Map<string, unknown>()
103
+ for (const result of results) {
104
+ byId.set(result.id, result)
105
+ }
106
+ const deduped = Array.from(byId.values())
107
+ ```
108
+
109
+ ## Step-Level Retrieval
110
+
111
+ For judge format, correlate markdown step IDs with full JSONL:
112
+
113
+ ```typescript
114
+ // Load both files
115
+ const markdown = await Bun.file('results.md').text()
116
+ const fullResults = parseResults(await Bun.file('results.full.jsonl').text())
117
+
118
+ // Build step index
119
+ const stepIndex = new Map<string, unknown>()
120
+ for (const result of fullResults) {
121
+ for (const step of result.trajectory) {
122
+ stepIndex.set(step.stepId, step)
123
+ }
124
+ }
125
+
126
+ // Retrieve full step by ID (from markdown [->stepId])
127
+ const stepId = 'test-001-step-2'
128
+ const fullStep = stepIndex.get(stepId) as { name: string; input: unknown }
129
+ console.log('Tool name:', fullStep.name)
130
+ console.log('Full input:', fullStep.input)
131
+ ```
132
+
133
+ ## Extract Tool Calls from Trajectory
134
+
135
+ ```typescript
136
+ const toolCalls = result.trajectory.filter((s) => s.type === 'tool_call')
137
+ const toolNames = toolCalls.map((t) => t.name)
138
+ ```
139
+
140
+ ## Timing Information
141
+
142
+ ```typescript
143
+ const result = results[0]
144
+ const duration = result.timing.end - result.timing.start
145
+ const timeToFirstResponse = result.timing.firstResponse // ms after start
146
+ ```
147
+
148
+ ## LLM-as-Judge
149
+
150
+ ### Large Context Models (Gemini 1M+)
151
+
152
+ Feed full trajectory directly:
153
+
154
+ ```typescript
155
+ import { GoogleGenerativeAI } from '@google/generative-ai'
156
+
157
+ const genAI = new GoogleGenerativeAI(process.env.GOOGLE_API_KEY!)
158
+ const model = genAI.getGenerativeModel({ model: 'gemini-2.5-pro' })
159
+
160
+ const results = parseResults(await Bun.file('results.full.jsonl').text())
161
+
162
+ const prompt = `
163
+ Evaluate these agent trajectories for code quality and reasoning.
164
+
165
+ ${JSON.stringify(results, null, 2)}
166
+
167
+ For each evaluation, score 1-3:
168
+ - 1: Major issues (wrong tools, broken logic, incorrect output)
169
+ - 2: Minor issues (inefficient but correct)
170
+ - 3: Excellent (efficient trajectory, correct output)
171
+
172
+ Respond as JSON array: [{"id": "...", "score": N, "reasoning": "..."}]
173
+ `
174
+
175
+ const response = await model.generateContent(prompt)
176
+ console.log(response.response.text())
177
+ ```
178
+
179
+ ### Medium Context Models (Claude 200k)
180
+
181
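+ Send the markdown summary (`results.md`) for most runs, and fall back to step-level retrieval from `results.full.jsonl` when the judge needs a step's full content: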
182
+
183
+ ```typescript
184
+ import Anthropic from '@anthropic-ai/sdk'
185
+
186
+ const client = new Anthropic()
187
+ const markdown = await Bun.file('results.md').text()
188
+
189
+ const response = await client.messages.create({
190
+ model: 'claude-sonnet-4-20250514',
191
+ max_tokens: 4096,
192
+ messages: [{
193
+ role: 'user',
194
+ content: `Evaluate these agent trajectories:\n\n${markdown}\n\nScore each 1-3 and explain.`
195
+ }]
196
+ })
197
+
198
+ console.log(response.content[0].text)
199
+ ```
200
+
201
+ ## Braintrust Integration
202
+
203
+ Upload results programmatically:
204
+
205
+ ```typescript
206
+ import { initLogger } from 'braintrust'
207
+
208
+ const logger = initLogger({
209
+ projectName: 'agent-eval',
210
+ apiKey: process.env.BRAINTRUST_API_KEY,
211
+ })
212
+
213
+ const results = parseResults(await Bun.file('results.jsonl').text())
214
+
215
+ for (const result of results) {
216
+ logger.log({
217
+ input: result.input,
218
+ output: result.output,
219
+ expected: result.expected,
220
+ scores: {
221
+ passed: result.status === 'passed' ? 1 : 0,
222
+ duration_ms: result.duration,
223
+ },
224
+ metadata: {
225
+ ...result.metadata,
226
+ toolCalls: result.toolCalls,
227
+ },
228
+ })
229
+ }
230
+
231
+ await logger.flush()
232
+ ```
233
+
234
+ ## CI Integration
235
+
236
+ ### GitHub Actions
237
+
238
+ ```yaml
239
+ name: Agent Eval
240
+ on:
241
+ schedule:
242
+ - cron: '0 0 * * 0' # Weekly
243
+
244
+ jobs:
245
+ eval:
246
+ runs-on: ubuntu-latest
247
+ steps:
248
+ - uses: actions/checkout@v4
249
+ - uses: oven-sh/setup-bun@v2
250
+
251
+ - name: Install ACP adapter
252
+ run: npm install -g @zed-industries/claude-code-acp
253
+
254
+ - name: Install dependencies
255
+ run: bun add @plaited/acp-harness
256
+
257
+ - name: Run harness
258
+ env:
259
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
260
+ run: |
261
+ bunx @plaited/acp-harness prompts.jsonl \
262
+ --format judge \
263
+ --progress \
264
+ -o eval-results
265
+
266
+ - name: Upload results
267
+ uses: actions/upload-artifact@v4
268
+ with:
269
+ name: eval-results
270
+ path: |
271
+ eval-results.md
272
+ eval-results.full.jsonl
273
+ ```
274
+
275
+ ## Output Aggregation
276
+
277
+ Combine multiple runs:
278
+
279
+ ```bash
280
+ # Append mode during runs
281
+ bunx @plaited/acp-harness prompts-1.jsonl --append -o combined.jsonl
282
+ bunx @plaited/acp-harness prompts-2.jsonl --append -o combined.jsonl
283
+
284
+ # Merge separate files
285
+ cat run1.jsonl run2.jsonl run3.jsonl > combined.jsonl
286
+
287
+ # Dedupe by ID (keep latest) - use TypeScript pattern above
288
+ ```
@@ -0,0 +1,221 @@
1
+ # Output Formats
2
+
3
+ The harness supports two output formats optimized for different use cases.
4
+
5
+ ## Format Selection
6
+
7
+ ```bash
8
+ bunx @plaited/acp-harness prompts.jsonl --format <format> -o <output>
9
+ ```
10
+
11
+ | Format | Files Created | Use Case |
12
+ |--------|---------------|----------|
13
+ | `summary` | Single JSONL | Quick metrics, dashboards, jq analysis |
14
+ | `judge` | `.md` + `.full.jsonl` | Downstream LLM-as-judge scoring |
15
+
16
+ ## Summary Format (Default)
17
+
18
+ Minimal JSONL for quick metrics and analysis.
19
+
20
+ ### Schema
21
+
22
+ ```typescript
23
+ type SummaryResult = {
24
+ id: string // Prompt identifier
25
+ input: string // Original prompt text
26
+ output: string // Final agent response
27
+ toolCalls: string[] // List of tool names used
28
+ status: 'passed' | 'failed' | 'error' | 'timeout'
29
+ duration: number // Total execution time (ms)
30
+ }
31
+ ```
32
+
33
+ ### Example Output
34
+
35
+ ```jsonl
36
+ {"id":"test-001","input":"Create a primary button","output":"I created the button in src/button.tsx","toolCalls":["Write"],"status":"passed","duration":1234}
37
+ {"id":"test-002","input":"Fix the TypeScript error","output":"I fixed the type error...","toolCalls":["Read","Edit"],"status":"passed","duration":2567}
38
+ ```
39
+
40
+ ### Analysis with jq
41
+
42
+ ```bash
43
+ # Calculate average duration
44
+ cat results.jsonl | jq -s 'map(.duration) | add / length'
45
+
46
+ # Count tool usage
47
+ cat results.jsonl | jq -s 'map(.toolCalls) | flatten | group_by(.) | map({tool: .[0], count: length})'
48
+
49
+ # Filter by status
50
+ cat results.jsonl | jq 'select(.status == "failed")'
51
+
52
+ # Pass rate
53
+ cat results.jsonl | jq -s 'length as $t | map(select(.status == "passed")) | length as $p | "\($p)/\($t) passed"'
54
+ ```
55
+
56
+ ## Judge Format (Two-Tier)
57
+
58
+ Creates two files for downstream LLM-as-judge scoring with step-level correlation.
59
+
60
+ ```bash
61
+ bunx @plaited/acp-harness prompts.jsonl --format judge -o results
62
+ # Creates: results.md + results.full.jsonl
63
+ ```
64
+
65
+ ### Markdown File (`<output>.md`)
66
+
67
+ Human-readable summary with step IDs and code previews.
68
+
69
+ **Structure:**
70
+
71
+ ```markdown
72
+ ## Capture Record: <id>
73
+
74
+ **Input:** <original prompt>
75
+
76
+ **Trajectory:**
77
+ 1. [THOUGHT] <truncated content> [->stepId]
78
+ 2. [TOOL:<name>] -> <status> (<duration>ms) [->stepId]
79
+ File: <path> (<size> chars)
80
+ ```<ext>
81
+ <head lines>
82
+
83
+ // ... N lines omitted ...
84
+
85
+ <tail lines>
86
+ ```
87
+ 3. [PLAN] <plan summary> [->stepId]
88
+ 4. [MESSAGE] <truncated content> [->stepId]
89
+
90
+ **Output:** <truncated final output>
91
+ **Metadata:** category=ui, agent=claude-code-acp, ...
92
+ **Status:** passed|failed|error|timeout
93
+ **Duration:** <ms>ms
94
+
95
+ ---
96
+ ```
97
+
98
+ **Step ID Format:** `<prompt-id>-step-<N>` (e.g., `test-001-step-2`)
99
+
100
+ **Truncation Rules:**
101
+ - Thought/message content: First 100 characters
102
+ - Output: First 200 characters
103
+ - Code preview: Head (8 lines) + tail (4 lines) for files > 12 lines
104
+
105
+ ### Full JSONL File (`<output>.full.jsonl`)
106
+
107
+ Complete trajectory with step IDs for correlation.
108
+
109
+ **Schema:**
110
+
111
+ ```typescript
112
+ type FullResult = {
113
+ id: string
114
+ input: string
115
+ output: string
116
+ expected?: string
117
+ trajectory: IndexedStep[] // Steps with stepId
118
+ metadata: Record<string, unknown>
119
+ timing: {
120
+ start: number // Unix timestamp (ms)
121
+ end: number // Unix timestamp (ms)
122
+ firstResponse?: number // Time to first response (ms)
123
+ }
124
+ status: 'passed' | 'failed' | 'error' | 'timeout'
125
+ errors?: string[]
126
+ }
127
+
128
+ type IndexedStep = TrajectoryStep & { stepId: string }
129
+
130
+ type TrajectoryStep =
131
+ | { type: 'thought'; content: string; timestamp: number }
132
+ | { type: 'message'; content: string; timestamp: number }
133
+ | {
134
+ type: 'tool_call'
135
+ name: string // Tool title from ACP SDK
136
+ status: string // pending, in_progress, completed, failed
137
+ input?: unknown // Raw input parameters
138
+ output?: unknown // Raw output
139
+ duration?: number // Execution time (ms)
140
+ timestamp: number
141
+ }
142
+ | { type: 'plan'; entries: PlanEntry[]; timestamp: number }
143
+ ```
144
+
145
+ **Example:**
146
+
147
+ ```jsonl
148
+ {"id":"test-001","input":"Create a primary button","output":"I created the button...","trajectory":[{"type":"thought","content":"I'll create a styled button template with createStyles","timestamp":100,"stepId":"test-001-step-1"},{"type":"tool_call","name":"Write","status":"completed","input":{"file_path":"src/button.tsx","content":"import { createStyles }..."},"output":"File written successfully","duration":234,"timestamp":150,"stepId":"test-001-step-2"},{"type":"message","content":"I created the button template","timestamp":500,"stepId":"test-001-step-3"}],"metadata":{"category":"ui","agent":"claude"},"timing":{"start":1704067200000,"end":1704067201234,"firstResponse":100},"status":"passed"}
149
+ ```
150
+
151
+ ## Two-Tier Scoring Workflow
152
+
153
+ ### Direct Scoring (Large Context)
154
+
155
+ For judges with large context windows (Gemini 1M+, Claude 200k):
156
+
157
+ ```bash
158
+ # Feed full JSONL directly
159
+ cat results.full.jsonl | your-gemini-judge.ts
160
+ ```
161
+
162
+ ### Step-Level Retrieval (Small Context)
163
+
164
+ For smaller models or step-specific analysis:
165
+
166
+ ```typescript
167
+ // Load both files
168
+ const markdown = await Bun.file('results.md').text()
169
+ const fullLines = (await Bun.file('results.full.jsonl').text()).trim().split('\n')
170
+
171
+ // Parse full results indexed by step ID
172
+ const stepIndex = new Map<string, unknown>()
173
+ for (const line of fullLines) {
174
+ const result = JSON.parse(line)
175
+ for (const step of result.trajectory) {
176
+ stepIndex.set(step.stepId, step)
177
+ }
178
+ }
179
+
180
+ // Judge requests full content for specific step
181
+ const stepId = 'test-001-step-2' // From markdown [->stepId]
182
+ const fullStep = stepIndex.get(stepId) as { name: string; input: unknown }
183
+ console.log(fullStep.input) // Complete tool input
184
+ ```
185
+
186
+ ## Status Values
187
+
188
+ | Status | Meaning |
189
+ |--------|---------|
190
+ | `passed` | Completed without tool errors |
191
+ | `failed` | Completed but one or more tool calls failed |
192
+ | `error` | Unhandled exception during execution |
193
+ | `timeout` | Request exceeded timeout limit |
194
+
195
+ ## Input Format
196
+
197
+ Both formats accept the same JSONL input:
198
+
199
+ ```jsonl
200
+ {"id":"test-001","input":"Create a primary button","expected":"should contain <button>","metadata":{"category":"ui"}}
201
+ ```
202
+
203
+ | Field | Required | Description |
204
+ |-------|----------|-------------|
205
+ | `id` | Yes | Unique identifier |
206
+ | `input` | Yes | Prompt text for the agent |
207
+ | `expected` | No | Expected output (for downstream scoring) |
208
+ | `metadata` | No | Tags, category, difficulty for filtering |
209
+ | `timeout` | No | Override default timeout for this prompt |
210
+
211
+ ## Streaming Behavior
212
+
213
+ Both formats stream output line-by-line as results complete:
214
+
215
+ ```bash
216
+ # Watch results in real-time
217
+ bunx @plaited/acp-harness prompts.jsonl --progress -o results.jsonl &
218
+ tail -f results.jsonl
219
+ ```
220
+
221
+ Use `--append` to continue interrupted runs without overwriting previous results.
@@ -0,0 +1,15 @@
1
+ {
2
+ "name": "plaited-acp-harness",
3
+ "metadata": {
4
+ "description": "Claude Code plugin for Agent Client Protocol testing and evaluation"
5
+ },
6
+ "owner": {
7
+ "name": "Plaited Labs"
8
+ },
9
+ "plugins": [
10
+ {
11
+ "name": "acp-harness",
12
+ "source": "./"
13
+ }
14
+ ]
15
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "name": "acp-harness",
3
+ "version": "0.2.5",
4
+ "description": "Agent Client Protocol client and evaluation harness for TypeScript/Bun projects. Includes: createACPClient for programmatic agent access, run-harness.ts for trajectory capture, and LLM-as-judge evaluation templates. Requires Bun runtime.",
5
+ "author": {
6
+ "name": "Plaited Labs"
7
+ },
8
+ "license": "ISC",
9
+ "mcpServers": {
10
+ "agent-client-protocol": {
11
+ "type": "http",
12
+ "url": "https://agentclientprotocol.com/mcp"
13
+ }
14
+ },
15
+ "skills": "./.claude/skills"
16
+ }
@@ -0,0 +1,6 @@
1
+ # Code owners for acp-harness repository
2
+ # These users will be automatically requested for review when someone opens a pull request.
3
+ # See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
4
+
5
+ # Organization admins own all files
6
+ * @EdwardIrby @alisonailea
@@ -0,0 +1,63 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ pull_request:
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ # Detect which paths changed to conditionally run expensive jobs
14
+ changes:
15
+ runs-on: ubuntu-latest
16
+ permissions:
17
+ pull-requests: read
18
+ outputs:
19
+ acp: ${{ steps.filter.outputs.acp }}
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+ - uses: dorny/paths-filter@v3
23
+ id: filter
24
+ with:
25
+ filters: |
26
+ acp:
27
+ - 'src/**'
28
+
29
+ test-pr:
30
+ runs-on: ubuntu-latest
31
+
32
+ steps:
33
+ - uses: actions/checkout@v4
34
+ - uses: oven-sh/setup-bun@v2
35
+ with:
36
+ bun-version: latest
37
+ - name: Install dependencies
38
+ run: bun install
39
+ - name: Run check
40
+ run: bun run check
41
+ - name: Run test
42
+ run: bun run test
43
+
44
+ # ACP integration tests run in Docker container for consistent environment
45
+ # Only runs when src/ files change to reduce API costs
46
+ test-acp-integration:
47
+ needs: changes
48
+ if: ${{ needs.changes.outputs.acp == 'true' }}
49
+ runs-on: ubuntu-latest
50
+ container:
51
+ image: oven/bun:1.2.9
52
+ options: --user root
53
+
54
+ steps:
55
+ - uses: actions/checkout@v4
56
+
57
+ - name: Install dependencies
58
+ run: bun install --frozen-lockfile
59
+
60
+ - name: Run ACP integration tests
61
+ env:
62
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
63
+ run: bun test ./src/tests/acp-integration.docker.ts