@plaited/acp-harness 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
1
+ import { describe, expect, test } from 'bun:test'
2
+ import type { TrialsConfig } from '../trials.ts'
3
+
4
+ // ============================================================================
5
+ // TrialsConfig type
6
+ // ============================================================================
7
+
8
+ describe('TrialsConfig configuration', () => {
9
+ test('TrialsConfig type accepts valid configuration', () => {
10
+ const config: TrialsConfig = {
11
+ promptsPath: '/tmp/prompts.jsonl',
12
+ agentCommand: ['bunx', 'test-agent'],
13
+ k: 5,
14
+ outputPath: '/tmp/output.jsonl',
15
+ cwd: '/tmp',
16
+ timeout: 30000,
17
+ progress: true,
18
+ append: false,
19
+ }
20
+
21
+ expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
22
+ expect(config.agentCommand).toEqual(['bunx', 'test-agent'])
23
+ expect(config.k).toBe(5)
24
+ })
25
+
26
+ test('TrialsConfig allows minimal configuration', () => {
27
+ const config: TrialsConfig = {
28
+ promptsPath: '/tmp/prompts.jsonl',
29
+ agentCommand: ['echo', 'test'],
30
+ k: 3,
31
+ }
32
+
33
+ expect(config.outputPath).toBeUndefined()
34
+ expect(config.cwd).toBeUndefined()
35
+ expect(config.timeout).toBeUndefined()
36
+ expect(config.progress).toBeUndefined()
37
+ expect(config.append).toBeUndefined()
38
+ expect(config.grader).toBeUndefined()
39
+ })
40
+ })
41
+
42
+ // ============================================================================
43
+ // CLI Help Output
44
+ // ============================================================================
45
+
46
+ describe('trials CLI', () => {
47
+ test('displays help with --help flag', async () => {
48
+ const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials', '--help'], {
49
+ stdout: 'pipe',
50
+ stderr: 'pipe',
51
+ })
52
+
53
+ const stdout = await new Response(proc.stdout).text()
54
+ await proc.exited
55
+
56
+ expect(stdout).toContain('Usage: acp-harness trials')
57
+ expect(stdout).toContain('prompts.jsonl')
58
+ expect(stdout).toContain('-o, --output')
59
+ expect(stdout).toContain('-k')
60
+ expect(stdout).toContain('-c, --cwd')
61
+ expect(stdout).toContain('-t, --timeout')
62
+ expect(stdout).toContain('--progress')
63
+ expect(stdout).toContain('-g, --grader')
64
+ expect(stdout).toContain('pass@k')
65
+ })
66
+
67
+ test('shows error for missing prompts file argument', async () => {
68
+ const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials'], {
69
+ stdout: 'pipe',
70
+ stderr: 'pipe',
71
+ })
72
+
73
+ const stderr = await new Response(proc.stderr).text()
74
+ const exitCode = await proc.exited
75
+
76
+ expect(exitCode).not.toBe(0)
77
+ expect(stderr).toContain('prompts.jsonl path is required')
78
+ })
79
+
80
+ test('shows error for missing agent command', async () => {
81
+ const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl'], {
82
+ stdout: 'pipe',
83
+ stderr: 'pipe',
84
+ })
85
+
86
+ const stderr = await new Response(proc.stderr).text()
87
+ const exitCode = await proc.exited
88
+
89
+ expect(exitCode).not.toBe(0)
90
+ expect(stderr).toContain('ACP agent command is required')
91
+ })
92
+ })
93
+
94
+ // ============================================================================
95
+ // Schemas CLI
96
+ // ============================================================================
97
+
98
+ describe('schemas CLI', () => {
99
+ test('displays help with --help flag', async () => {
100
+ const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', '--help'], {
101
+ stdout: 'pipe',
102
+ stderr: 'pipe',
103
+ })
104
+
105
+ const stdout = await new Response(proc.stdout).text()
106
+ await proc.exited
107
+
108
+ expect(stdout).toContain('Usage: acp-harness schemas')
109
+ expect(stdout).toContain('-o, --output')
110
+ expect(stdout).toContain('-j, --json')
111
+ expect(stdout).toContain('-s, --split')
112
+ expect(stdout).toContain('-l, --list')
113
+ expect(stdout).toContain('Available Schemas')
114
+ })
115
+
116
+ test('lists schemas with --list flag', async () => {
117
+ const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', '--list'], {
118
+ stdout: 'pipe',
119
+ stderr: 'pipe',
120
+ })
121
+
122
+ const stdout = await new Response(proc.stdout).text()
123
+ await proc.exited
124
+
125
+ expect(stdout).toContain('Available schemas')
126
+ expect(stdout).toContain('PromptCase')
127
+ expect(stdout).toContain('CaptureResult')
128
+ expect(stdout).toContain('GraderResult')
129
+ })
130
+
131
+ test('exports schema as JSON', async () => {
132
+ const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', 'PromptCase', '--json'], {
133
+ stdout: 'pipe',
134
+ stderr: 'pipe',
135
+ })
136
+
137
+ const stdout = await new Response(proc.stdout).text()
138
+ await proc.exited
139
+
140
+ const schema = JSON.parse(stdout)
141
+ expect(schema.$schema).toBe('https://json-schema.org/draft/2020-12/schema')
142
+ expect(schema.title).toBe('PromptCase')
143
+ expect(schema.type).toBe('object')
144
+ })
145
+ })
package/src/trials.ts CHANGED
@@ -19,7 +19,6 @@ import { extractOutput, extractTrajectory, loadPrompts } from './capture.ts'
19
19
  import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from './constants.ts'
20
20
  import { loadGrader } from './grader-loader.ts'
21
21
  import type { Grader, TrialEntry, TrialResult } from './schemas.ts'
22
- import { McpServerSchema } from './schemas.ts'
23
22
 
24
23
  // ============================================================================
25
24
  // Pass@k/Pass^k Calculation
@@ -92,8 +91,6 @@ export type TrialsConfig = {
92
91
  progress?: boolean
93
92
  /** Append to output file */
94
93
  append?: boolean
95
- /** MCP server configurations */
96
- mcpServers?: unknown[]
97
94
  /** Optional grader function */
98
95
  grader?: Grader
99
96
  }
@@ -149,13 +146,9 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
149
146
  timeout = DEFAULT_HARNESS_TIMEOUT,
150
147
  progress = false,
151
148
  append = false,
152
- mcpServers = [],
153
149
  grader,
154
150
  } = config
155
151
 
156
- // Parse MCP server configurations
157
- const parsedMcpServers = mcpServers.map((s) => McpServerSchema.parse(s))
158
-
159
152
  // Load prompts
160
153
  const prompts = await loadPrompts(promptsPath)
161
154
 
@@ -182,10 +175,9 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
182
175
  await Bun.write(resolvedOutputPath, '')
183
176
  }
184
177
 
185
- // Session params
178
+ // Session params - agents auto-discover MCP configs from cwd
186
179
  const sessionParams = {
187
180
  cwd: cwd ?? process.cwd(),
188
- mcpServers: parsedMcpServers,
189
181
  }
190
182
 
191
183
  const results: TrialResult[] = []
@@ -211,7 +203,8 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
211
203
  const startTime = Date.now()
212
204
 
213
205
  try {
214
- const prompt = createPrompt(promptCase.input)
206
+ const inputText = Array.isArray(promptCase.input) ? promptCase.input.join('\n') : promptCase.input
207
+ const prompt = createPrompt(inputText)
215
208
  const { updates } = await client.promptSync(session.id, prompt)
216
209
 
217
210
  const endTime = Date.now()
@@ -230,7 +223,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
230
223
  const graderResult = await grader({
231
224
  input: promptCase.input,
232
225
  output,
233
- expected: promptCase.expected,
226
+ hint: promptCase.hint,
234
227
  trajectory,
235
228
  })
236
229
  entry.pass = graderResult.pass
@@ -263,7 +256,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
263
256
  const result: TrialResult = {
264
257
  id: promptCase.id,
265
258
  input: promptCase.input,
266
- ...(promptCase.expected && { expected: promptCase.expected }),
259
+ ...(promptCase.hint && { hint: promptCase.hint }),
267
260
  k,
268
261
  trials: trialEntries,
269
262
  }
@@ -318,7 +311,6 @@ export const trials = async (args: string[]): Promise<void> => {
318
311
  timeout: { type: 'string', short: 't', default: String(DEFAULT_HARNESS_TIMEOUT) },
319
312
  progress: { type: 'boolean', default: false },
320
313
  append: { type: 'boolean', default: false },
321
- 'mcp-server': { type: 'string', multiple: true },
322
314
  grader: { type: 'string', short: 'g' },
323
315
  help: { type: 'boolean', short: 'h' },
324
316
  },
@@ -337,11 +329,10 @@ Arguments:
337
329
  Options:
338
330
  -o, --output Output file (default: stdout)
339
331
  -k Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
340
- -c, --cwd Working directory for agent
332
+ -c, --cwd Working directory for agent (agents auto-discover MCP configs from here)
341
333
  -t, --timeout Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
342
334
  --progress Show progress to stderr
343
335
  --append Append to output file
344
- --mcp-server MCP server config JSON (repeatable)
345
336
  -g, --grader Path to grader (.ts/.js module or executable script)
346
337
  -h, --help Show this help message
347
338
 
@@ -389,9 +380,6 @@ Examples:
389
380
  }
390
381
  }
391
382
 
392
- // Parse MCP server configurations
393
- const mcpServers = (values['mcp-server'] ?? []).map((json) => JSON.parse(json))
394
-
395
383
  await runTrials({
396
384
  promptsPath,
397
385
  agentCommand,
@@ -401,7 +389,6 @@ Examples:
401
389
  timeout: Number.parseInt(values.timeout ?? String(DEFAULT_HARNESS_TIMEOUT), 10),
402
390
  progress: values.progress ?? false,
403
391
  append: values.append ?? false,
404
- mcpServers,
405
392
  grader,
406
393
  })
407
394
  }
@@ -69,7 +69,7 @@ export const runValidateRefs = async (config: ValidateRefsConfig): Promise<Valid
69
69
  const graderResult = await grader({
70
70
  input: prompt.input,
71
71
  output: prompt.reference as string,
72
- expected: prompt.expected,
72
+ hint: prompt.hint,
73
73
  trajectory: [], // No trajectory for reference validation
74
74
  })
75
75
 
@@ -1,214 +0,0 @@
1
- /**
2
- * ACP Client Integration Tests
3
- *
4
- * @remarks
5
- * These tests verify the ACP client works against real Claude Code
6
- * via the `claude-code-acp` adapter.
7
- *
8
- * **Run in Docker only** for consistent environment:
9
- * ```bash
10
- * ANTHROPIC_API_KEY=sk-... bun run test:acp
11
- * ```
12
- *
13
- * Prerequisites:
14
- * 1. Docker installed
15
- * 2. API key: `ANTHROPIC_API_KEY` environment variable
16
- *
17
- * These tests make real API calls and consume credits.
18
- */
19
-
20
- import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test'
21
- import { type ACPClient, createACPClient } from '../acp-client.ts'
22
- import { createPrompt, summarizeResponse } from '../acp-helpers.ts'
23
-
24
- // Long timeout for real agent interactions (2 minutes)
25
- setDefaultTimeout(120000)
26
-
27
- // Fixtures directory with .claude/skills and .mcp.json
28
- const FIXTURES_DIR = `${import.meta.dir}/fixtures`
29
-
30
- // Use haiku for all tests to reduce costs
31
- const TEST_MODEL = 'claude-haiku-4-5-20251001'
32
-
33
- describe('ACP Client Integration', () => {
34
- let client: ACPClient
35
-
36
- beforeAll(async () => {
37
- // cc-acp adapter expects ANTHROPIC_API_KEY
38
- client = createACPClient({
39
- command: ['bunx', 'claude-code-acp'],
40
- timeout: 120000, // 2 min timeout for initialization
41
- env: {
42
- ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY ?? '',
43
- },
44
- })
45
-
46
- await client.connect()
47
- })
48
-
49
- afterAll(async () => {
50
- await client?.disconnect()
51
- })
52
-
53
- test('connects and initializes', () => {
54
- expect(client.isConnected()).toBe(true)
55
-
56
- const initResult = client.getInitializeResult()
57
- expect(initResult).toBeDefined()
58
- expect(initResult?.protocolVersion).toBeDefined()
59
- })
60
-
61
- test('reports agent capabilities', () => {
62
- const capabilities = client.getCapabilities()
63
- expect(capabilities).toBeDefined()
64
- })
65
-
66
- test('creates session', async () => {
67
- const session = await client.createSession({
68
- cwd: FIXTURES_DIR,
69
- mcpServers: [],
70
- })
71
-
72
- expect(session).toBeDefined()
73
- expect(session.id).toBeDefined()
74
- expect(typeof session.id).toBe('string')
75
- })
76
-
77
- test('sends prompt and receives response', async () => {
78
- const session = await client.createSession({
79
- cwd: FIXTURES_DIR,
80
- mcpServers: [],
81
- })
82
-
83
- // Use haiku for faster/cheaper test runs
84
- await client.setModel(session.id, TEST_MODEL)
85
-
86
- // Simple prompt that doesn't require tools
87
- const { result, updates } = await client.promptSync(
88
- session.id,
89
- createPrompt('What is 2 + 2? Reply with just the number.'),
90
- )
91
-
92
- expect(result).toBeDefined()
93
- expect(updates).toBeInstanceOf(Array)
94
-
95
- // Summarize and verify response structure
96
- const summary = summarizeResponse(updates)
97
- expect(summary.text).toBeDefined()
98
- expect(summary.text.length).toBeGreaterThan(0)
99
- })
100
-
101
- test('streaming prompt yields updates', async () => {
102
- const session = await client.createSession({
103
- cwd: FIXTURES_DIR,
104
- mcpServers: [],
105
- })
106
-
107
- // Use haiku for faster/cheaper test runs
108
- await client.setModel(session.id, TEST_MODEL)
109
-
110
- const events: string[] = []
111
-
112
- for await (const event of client.prompt(session.id, createPrompt('Say "hello" and nothing else.'))) {
113
- events.push(event.type)
114
- if (event.type === 'complete') {
115
- expect(event.result).toBeDefined()
116
- }
117
- }
118
-
119
- expect(events).toContain('complete')
120
- })
121
-
122
- test('handles tool usage prompt', async () => {
123
- const session = await client.createSession({
124
- cwd: FIXTURES_DIR,
125
- mcpServers: [],
126
- })
127
-
128
- // Use haiku for faster/cheaper test runs
129
- await client.setModel(session.id, TEST_MODEL)
130
-
131
- // Prompt that should trigger tool usage - reading a specific file
132
- const { updates } = await client.promptSync(
133
- session.id,
134
- createPrompt('Use the Read tool to read calculator-mcp.ts and tell me what tools the MCP server provides.'),
135
- )
136
-
137
- const summary = summarizeResponse(updates)
138
-
139
- // Verify response mentions calculator tools
140
- expect(summary.text.length).toBeGreaterThan(0)
141
- // Response should mention the calculator tools (add, subtract, etc.)
142
- expect(summary.text.toLowerCase()).toMatch(/add|subtract|multiply|divide|calculator/)
143
- })
144
-
145
- test('uses skill from cwd', async () => {
146
- const session = await client.createSession({
147
- cwd: FIXTURES_DIR,
148
- mcpServers: [],
149
- })
150
-
151
- // Use haiku for faster/cheaper test runs
152
- await client.setModel(session.id, TEST_MODEL)
153
-
154
- // Ask Claude to use the greeting skill
155
- const { updates } = await client.promptSync(session.id, createPrompt('Please greet me using the greeting skill.'))
156
-
157
- const summary = summarizeResponse(updates)
158
-
159
- // The greeting skill instructs Claude to include specific phrases
160
- expect(summary.text.length).toBeGreaterThan(0)
161
- expect(summary.text.toLowerCase()).toMatch(/hello|greet|welcome/)
162
- })
163
-
164
- test('uses MCP server tools', async () => {
165
- // Path to calculator MCP server fixture (must be absolute per ACP spec)
166
- const calculatorPath = `${FIXTURES_DIR}/calculator-mcp.ts`
167
- const bunPath = Bun.which('bun') ?? 'bun'
168
-
169
- // Retry helper for flaky MCP server startup
170
- const maxRetries = 3
171
- let lastError: Error | undefined
172
-
173
- for (let attempt = 1; attempt <= maxRetries; attempt++) {
174
- const session = await client.createSession({
175
- cwd: FIXTURES_DIR,
176
- mcpServers: [
177
- {
178
- name: 'calculator',
179
- command: bunPath,
180
- args: [calculatorPath],
181
- env: [],
182
- },
183
- ],
184
- })
185
-
186
- // Set model to haiku for faster/cheaper test runs
187
- await client.setModel(session.id, TEST_MODEL)
188
-
189
- // Ask Claude to use the calculator MCP server
190
- const { updates } = await client.promptSync(
191
- session.id,
192
- createPrompt('Use the calculator MCP server add tool to compute 15 + 27. Reply with just the number.'),
193
- )
194
-
195
- const summary = summarizeResponse(updates)
196
-
197
- // Check if we got 42 in the response
198
- if (summary.text.match(/42/)) {
199
- expect(summary.text.length).toBeGreaterThan(0)
200
- expect(summary.text).toMatch(/42/)
201
- return // Success!
202
- }
203
-
204
- // MCP server might not have been ready, retry
205
- lastError = new Error(`Attempt ${attempt}: Response did not contain 42. Got: ${summary.text.slice(0, 100)}...`)
206
- if (attempt < maxRetries) {
207
- console.log(`MCP test attempt ${attempt} failed, retrying...`)
208
- }
209
- }
210
-
211
- // All retries exhausted
212
- throw lastError ?? new Error('MCP test failed after all retries')
213
- })
214
- })