@plaited/acp-harness 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -31
- package/bin/cli.ts +15 -0
- package/package.json +5 -7
- package/src/acp-client.ts +7 -4
- package/src/adapter-check.ts +0 -1
- package/src/adapter-scaffold.ts +16 -15
- package/src/calibrate.ts +28 -8
- package/src/capture.ts +114 -33
- package/src/grader-loader.ts +3 -3
- package/src/harness.ts +4 -0
- package/src/headless-cli.ts +433 -0
- package/src/headless-history-builder.ts +141 -0
- package/src/headless-output-parser.ts +251 -0
- package/src/headless-session-manager.ts +389 -0
- package/src/headless.schemas.ts +241 -0
- package/src/headless.ts +71 -0
- package/src/headless.types.ts +19 -0
- package/src/integration_tests/acp-claude.spec.ts +170 -0
- package/src/integration_tests/acp-gemini.spec.ts +174 -0
- package/src/schemas.ts +88 -36
- package/src/summarize.ts +4 -8
- package/src/tests/acp-client.spec.ts +1 -1
- package/src/tests/capture-cli.spec.ts +188 -0
- package/src/tests/capture-helpers.spec.ts +229 -67
- package/src/tests/constants.spec.ts +121 -0
- package/src/tests/fixtures/grader-exec.py +3 -3
- package/src/tests/fixtures/grader-module.ts +2 -2
- package/src/tests/grader-loader.spec.ts +5 -5
- package/src/tests/headless.spec.ts +460 -0
- package/src/tests/schemas-cli.spec.ts +142 -0
- package/src/tests/schemas.spec.ts +657 -0
- package/src/tests/summarize-helpers.spec.ts +3 -3
- package/src/tests/trials-cli.spec.ts +145 -0
- package/src/trials.ts +6 -19
- package/src/validate-refs.ts +1 -1
- package/src/tests/acp-integration.docker.ts +0 -214
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import { describe, expect, test } from 'bun:test'
|
|
2
|
+
import type { TrialsConfig } from '../trials.ts'
|
|
3
|
+
|
|
4
|
+
// ============================================================================
|
|
5
|
+
// TrialsConfig type
|
|
6
|
+
// ============================================================================
|
|
7
|
+
|
|
8
|
+
describe('TrialsConfig configuration', () => {
|
|
9
|
+
test('TrialsConfig type accepts valid configuration', () => {
|
|
10
|
+
const config: TrialsConfig = {
|
|
11
|
+
promptsPath: '/tmp/prompts.jsonl',
|
|
12
|
+
agentCommand: ['bunx', 'test-agent'],
|
|
13
|
+
k: 5,
|
|
14
|
+
outputPath: '/tmp/output.jsonl',
|
|
15
|
+
cwd: '/tmp',
|
|
16
|
+
timeout: 30000,
|
|
17
|
+
progress: true,
|
|
18
|
+
append: false,
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
|
|
22
|
+
expect(config.agentCommand).toEqual(['bunx', 'test-agent'])
|
|
23
|
+
expect(config.k).toBe(5)
|
|
24
|
+
})
|
|
25
|
+
|
|
26
|
+
test('TrialsConfig allows minimal configuration', () => {
|
|
27
|
+
const config: TrialsConfig = {
|
|
28
|
+
promptsPath: '/tmp/prompts.jsonl',
|
|
29
|
+
agentCommand: ['echo', 'test'],
|
|
30
|
+
k: 3,
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
expect(config.outputPath).toBeUndefined()
|
|
34
|
+
expect(config.cwd).toBeUndefined()
|
|
35
|
+
expect(config.timeout).toBeUndefined()
|
|
36
|
+
expect(config.progress).toBeUndefined()
|
|
37
|
+
expect(config.append).toBeUndefined()
|
|
38
|
+
expect(config.grader).toBeUndefined()
|
|
39
|
+
})
|
|
40
|
+
})
|
|
41
|
+
|
|
42
|
+
// ============================================================================
|
|
43
|
+
// CLI Help Output
|
|
44
|
+
// ============================================================================
|
|
45
|
+
|
|
46
|
+
describe('trials CLI', () => {
|
|
47
|
+
test('displays help with --help flag', async () => {
|
|
48
|
+
const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials', '--help'], {
|
|
49
|
+
stdout: 'pipe',
|
|
50
|
+
stderr: 'pipe',
|
|
51
|
+
})
|
|
52
|
+
|
|
53
|
+
const stdout = await new Response(proc.stdout).text()
|
|
54
|
+
await proc.exited
|
|
55
|
+
|
|
56
|
+
expect(stdout).toContain('Usage: acp-harness trials')
|
|
57
|
+
expect(stdout).toContain('prompts.jsonl')
|
|
58
|
+
expect(stdout).toContain('-o, --output')
|
|
59
|
+
expect(stdout).toContain('-k')
|
|
60
|
+
expect(stdout).toContain('-c, --cwd')
|
|
61
|
+
expect(stdout).toContain('-t, --timeout')
|
|
62
|
+
expect(stdout).toContain('--progress')
|
|
63
|
+
expect(stdout).toContain('-g, --grader')
|
|
64
|
+
expect(stdout).toContain('pass@k')
|
|
65
|
+
})
|
|
66
|
+
|
|
67
|
+
test('shows error for missing prompts file argument', async () => {
|
|
68
|
+
const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials'], {
|
|
69
|
+
stdout: 'pipe',
|
|
70
|
+
stderr: 'pipe',
|
|
71
|
+
})
|
|
72
|
+
|
|
73
|
+
const stderr = await new Response(proc.stderr).text()
|
|
74
|
+
const exitCode = await proc.exited
|
|
75
|
+
|
|
76
|
+
expect(exitCode).not.toBe(0)
|
|
77
|
+
expect(stderr).toContain('prompts.jsonl path is required')
|
|
78
|
+
})
|
|
79
|
+
|
|
80
|
+
test('shows error for missing agent command', async () => {
|
|
81
|
+
const proc = Bun.spawn(['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl'], {
|
|
82
|
+
stdout: 'pipe',
|
|
83
|
+
stderr: 'pipe',
|
|
84
|
+
})
|
|
85
|
+
|
|
86
|
+
const stderr = await new Response(proc.stderr).text()
|
|
87
|
+
const exitCode = await proc.exited
|
|
88
|
+
|
|
89
|
+
expect(exitCode).not.toBe(0)
|
|
90
|
+
expect(stderr).toContain('ACP agent command is required')
|
|
91
|
+
})
|
|
92
|
+
})
|
|
93
|
+
|
|
94
|
+
// ============================================================================
|
|
95
|
+
// Schemas CLI
|
|
96
|
+
// ============================================================================
|
|
97
|
+
|
|
98
|
+
describe('schemas CLI', () => {
|
|
99
|
+
test('displays help with --help flag', async () => {
|
|
100
|
+
const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', '--help'], {
|
|
101
|
+
stdout: 'pipe',
|
|
102
|
+
stderr: 'pipe',
|
|
103
|
+
})
|
|
104
|
+
|
|
105
|
+
const stdout = await new Response(proc.stdout).text()
|
|
106
|
+
await proc.exited
|
|
107
|
+
|
|
108
|
+
expect(stdout).toContain('Usage: acp-harness schemas')
|
|
109
|
+
expect(stdout).toContain('-o, --output')
|
|
110
|
+
expect(stdout).toContain('-j, --json')
|
|
111
|
+
expect(stdout).toContain('-s, --split')
|
|
112
|
+
expect(stdout).toContain('-l, --list')
|
|
113
|
+
expect(stdout).toContain('Available Schemas')
|
|
114
|
+
})
|
|
115
|
+
|
|
116
|
+
test('lists schemas with --list flag', async () => {
|
|
117
|
+
const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', '--list'], {
|
|
118
|
+
stdout: 'pipe',
|
|
119
|
+
stderr: 'pipe',
|
|
120
|
+
})
|
|
121
|
+
|
|
122
|
+
const stdout = await new Response(proc.stdout).text()
|
|
123
|
+
await proc.exited
|
|
124
|
+
|
|
125
|
+
expect(stdout).toContain('Available schemas')
|
|
126
|
+
expect(stdout).toContain('PromptCase')
|
|
127
|
+
expect(stdout).toContain('CaptureResult')
|
|
128
|
+
expect(stdout).toContain('GraderResult')
|
|
129
|
+
})
|
|
130
|
+
|
|
131
|
+
test('exports schema as JSON', async () => {
|
|
132
|
+
const proc = Bun.spawn(['bun', './bin/cli.ts', 'schemas', 'PromptCase', '--json'], {
|
|
133
|
+
stdout: 'pipe',
|
|
134
|
+
stderr: 'pipe',
|
|
135
|
+
})
|
|
136
|
+
|
|
137
|
+
const stdout = await new Response(proc.stdout).text()
|
|
138
|
+
await proc.exited
|
|
139
|
+
|
|
140
|
+
const schema = JSON.parse(stdout)
|
|
141
|
+
expect(schema.$schema).toBe('https://json-schema.org/draft/2020-12/schema')
|
|
142
|
+
expect(schema.title).toBe('PromptCase')
|
|
143
|
+
expect(schema.type).toBe('object')
|
|
144
|
+
})
|
|
145
|
+
})
|
package/src/trials.ts
CHANGED
|
@@ -19,7 +19,6 @@ import { extractOutput, extractTrajectory, loadPrompts } from './capture.ts'
|
|
|
19
19
|
import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from './constants.ts'
|
|
20
20
|
import { loadGrader } from './grader-loader.ts'
|
|
21
21
|
import type { Grader, TrialEntry, TrialResult } from './schemas.ts'
|
|
22
|
-
import { McpServerSchema } from './schemas.ts'
|
|
23
22
|
|
|
24
23
|
// ============================================================================
|
|
25
24
|
// Pass@k/Pass^k Calculation
|
|
@@ -92,8 +91,6 @@ export type TrialsConfig = {
|
|
|
92
91
|
progress?: boolean
|
|
93
92
|
/** Append to output file */
|
|
94
93
|
append?: boolean
|
|
95
|
-
/** MCP server configurations */
|
|
96
|
-
mcpServers?: unknown[]
|
|
97
94
|
/** Optional grader function */
|
|
98
95
|
grader?: Grader
|
|
99
96
|
}
|
|
@@ -149,13 +146,9 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
149
146
|
timeout = DEFAULT_HARNESS_TIMEOUT,
|
|
150
147
|
progress = false,
|
|
151
148
|
append = false,
|
|
152
|
-
mcpServers = [],
|
|
153
149
|
grader,
|
|
154
150
|
} = config
|
|
155
151
|
|
|
156
|
-
// Parse MCP server configurations
|
|
157
|
-
const parsedMcpServers = mcpServers.map((s) => McpServerSchema.parse(s))
|
|
158
|
-
|
|
159
152
|
// Load prompts
|
|
160
153
|
const prompts = await loadPrompts(promptsPath)
|
|
161
154
|
|
|
@@ -182,10 +175,9 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
182
175
|
await Bun.write(resolvedOutputPath, '')
|
|
183
176
|
}
|
|
184
177
|
|
|
185
|
-
// Session params
|
|
178
|
+
// Session params - agents auto-discover MCP configs from cwd
|
|
186
179
|
const sessionParams = {
|
|
187
180
|
cwd: cwd ?? process.cwd(),
|
|
188
|
-
mcpServers: parsedMcpServers,
|
|
189
181
|
}
|
|
190
182
|
|
|
191
183
|
const results: TrialResult[] = []
|
|
@@ -211,7 +203,8 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
211
203
|
const startTime = Date.now()
|
|
212
204
|
|
|
213
205
|
try {
|
|
214
|
-
const
|
|
206
|
+
const inputText = Array.isArray(promptCase.input) ? promptCase.input.join('\n') : promptCase.input
|
|
207
|
+
const prompt = createPrompt(inputText)
|
|
215
208
|
const { updates } = await client.promptSync(session.id, prompt)
|
|
216
209
|
|
|
217
210
|
const endTime = Date.now()
|
|
@@ -230,7 +223,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
230
223
|
const graderResult = await grader({
|
|
231
224
|
input: promptCase.input,
|
|
232
225
|
output,
|
|
233
|
-
|
|
226
|
+
hint: promptCase.hint,
|
|
234
227
|
trajectory,
|
|
235
228
|
})
|
|
236
229
|
entry.pass = graderResult.pass
|
|
@@ -263,7 +256,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
263
256
|
const result: TrialResult = {
|
|
264
257
|
id: promptCase.id,
|
|
265
258
|
input: promptCase.input,
|
|
266
|
-
...(promptCase.
|
|
259
|
+
...(promptCase.hint && { hint: promptCase.hint }),
|
|
267
260
|
k,
|
|
268
261
|
trials: trialEntries,
|
|
269
262
|
}
|
|
@@ -318,7 +311,6 @@ export const trials = async (args: string[]): Promise<void> => {
|
|
|
318
311
|
timeout: { type: 'string', short: 't', default: String(DEFAULT_HARNESS_TIMEOUT) },
|
|
319
312
|
progress: { type: 'boolean', default: false },
|
|
320
313
|
append: { type: 'boolean', default: false },
|
|
321
|
-
'mcp-server': { type: 'string', multiple: true },
|
|
322
314
|
grader: { type: 'string', short: 'g' },
|
|
323
315
|
help: { type: 'boolean', short: 'h' },
|
|
324
316
|
},
|
|
@@ -337,11 +329,10 @@ Arguments:
|
|
|
337
329
|
Options:
|
|
338
330
|
-o, --output Output file (default: stdout)
|
|
339
331
|
-k Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
|
|
340
|
-
-c, --cwd Working directory for agent
|
|
332
|
+
-c, --cwd Working directory for agent (agents auto-discover MCP configs from here)
|
|
341
333
|
-t, --timeout Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
|
|
342
334
|
--progress Show progress to stderr
|
|
343
335
|
--append Append to output file
|
|
344
|
-
--mcp-server MCP server config JSON (repeatable)
|
|
345
336
|
-g, --grader Path to grader (.ts/.js module or executable script)
|
|
346
337
|
-h, --help Show this help message
|
|
347
338
|
|
|
@@ -389,9 +380,6 @@ Examples:
|
|
|
389
380
|
}
|
|
390
381
|
}
|
|
391
382
|
|
|
392
|
-
// Parse MCP server configurations
|
|
393
|
-
const mcpServers = (values['mcp-server'] ?? []).map((json) => JSON.parse(json))
|
|
394
|
-
|
|
395
383
|
await runTrials({
|
|
396
384
|
promptsPath,
|
|
397
385
|
agentCommand,
|
|
@@ -401,7 +389,6 @@ Examples:
|
|
|
401
389
|
timeout: Number.parseInt(values.timeout ?? String(DEFAULT_HARNESS_TIMEOUT), 10),
|
|
402
390
|
progress: values.progress ?? false,
|
|
403
391
|
append: values.append ?? false,
|
|
404
|
-
mcpServers,
|
|
405
392
|
grader,
|
|
406
393
|
})
|
|
407
394
|
}
|
package/src/validate-refs.ts
CHANGED
|
@@ -69,7 +69,7 @@ export const runValidateRefs = async (config: ValidateRefsConfig): Promise<Valid
|
|
|
69
69
|
const graderResult = await grader({
|
|
70
70
|
input: prompt.input,
|
|
71
71
|
output: prompt.reference as string,
|
|
72
|
-
|
|
72
|
+
hint: prompt.hint,
|
|
73
73
|
trajectory: [], // No trajectory for reference validation
|
|
74
74
|
})
|
|
75
75
|
|
|
@@ -1,214 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* ACP Client Integration Tests
|
|
3
|
-
*
|
|
4
|
-
* @remarks
|
|
5
|
-
* These tests verify the ACP client works against real Claude Code
|
|
6
|
-
* via the `claude-code-acp` adapter.
|
|
7
|
-
*
|
|
8
|
-
* **Run in Docker only** for consistent environment:
|
|
9
|
-
* ```bash
|
|
10
|
-
* ANTHROPIC_API_KEY=sk-... bun run test:acp
|
|
11
|
-
* ```
|
|
12
|
-
*
|
|
13
|
-
* Prerequisites:
|
|
14
|
-
* 1. Docker installed
|
|
15
|
-
* 2. API key: `ANTHROPIC_API_KEY` environment variable
|
|
16
|
-
*
|
|
17
|
-
* These tests make real API calls and consume credits.
|
|
18
|
-
*/
|
|
19
|
-
|
|
20
|
-
import { afterAll, beforeAll, describe, expect, setDefaultTimeout, test } from 'bun:test'
|
|
21
|
-
import { type ACPClient, createACPClient } from '../acp-client.ts'
|
|
22
|
-
import { createPrompt, summarizeResponse } from '../acp-helpers.ts'
|
|
23
|
-
|
|
24
|
-
// Long timeout for real agent interactions (2 minutes)
|
|
25
|
-
setDefaultTimeout(120000)
|
|
26
|
-
|
|
27
|
-
// Fixtures directory with .claude/skills and .mcp.json
|
|
28
|
-
const FIXTURES_DIR = `${import.meta.dir}/fixtures`
|
|
29
|
-
|
|
30
|
-
// Use haiku for all tests to reduce costs
|
|
31
|
-
const TEST_MODEL = 'claude-haiku-4-5-20251001'
|
|
32
|
-
|
|
33
|
-
describe('ACP Client Integration', () => {
|
|
34
|
-
let client: ACPClient
|
|
35
|
-
|
|
36
|
-
beforeAll(async () => {
|
|
37
|
-
// cc-acp adapter expects ANTHROPIC_API_KEY
|
|
38
|
-
client = createACPClient({
|
|
39
|
-
command: ['bunx', 'claude-code-acp'],
|
|
40
|
-
timeout: 120000, // 2 min timeout for initialization
|
|
41
|
-
env: {
|
|
42
|
-
ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY ?? '',
|
|
43
|
-
},
|
|
44
|
-
})
|
|
45
|
-
|
|
46
|
-
await client.connect()
|
|
47
|
-
})
|
|
48
|
-
|
|
49
|
-
afterAll(async () => {
|
|
50
|
-
await client?.disconnect()
|
|
51
|
-
})
|
|
52
|
-
|
|
53
|
-
test('connects and initializes', () => {
|
|
54
|
-
expect(client.isConnected()).toBe(true)
|
|
55
|
-
|
|
56
|
-
const initResult = client.getInitializeResult()
|
|
57
|
-
expect(initResult).toBeDefined()
|
|
58
|
-
expect(initResult?.protocolVersion).toBeDefined()
|
|
59
|
-
})
|
|
60
|
-
|
|
61
|
-
test('reports agent capabilities', () => {
|
|
62
|
-
const capabilities = client.getCapabilities()
|
|
63
|
-
expect(capabilities).toBeDefined()
|
|
64
|
-
})
|
|
65
|
-
|
|
66
|
-
test('creates session', async () => {
|
|
67
|
-
const session = await client.createSession({
|
|
68
|
-
cwd: FIXTURES_DIR,
|
|
69
|
-
mcpServers: [],
|
|
70
|
-
})
|
|
71
|
-
|
|
72
|
-
expect(session).toBeDefined()
|
|
73
|
-
expect(session.id).toBeDefined()
|
|
74
|
-
expect(typeof session.id).toBe('string')
|
|
75
|
-
})
|
|
76
|
-
|
|
77
|
-
test('sends prompt and receives response', async () => {
|
|
78
|
-
const session = await client.createSession({
|
|
79
|
-
cwd: FIXTURES_DIR,
|
|
80
|
-
mcpServers: [],
|
|
81
|
-
})
|
|
82
|
-
|
|
83
|
-
// Use haiku for faster/cheaper test runs
|
|
84
|
-
await client.setModel(session.id, TEST_MODEL)
|
|
85
|
-
|
|
86
|
-
// Simple prompt that doesn't require tools
|
|
87
|
-
const { result, updates } = await client.promptSync(
|
|
88
|
-
session.id,
|
|
89
|
-
createPrompt('What is 2 + 2? Reply with just the number.'),
|
|
90
|
-
)
|
|
91
|
-
|
|
92
|
-
expect(result).toBeDefined()
|
|
93
|
-
expect(updates).toBeInstanceOf(Array)
|
|
94
|
-
|
|
95
|
-
// Summarize and verify response structure
|
|
96
|
-
const summary = summarizeResponse(updates)
|
|
97
|
-
expect(summary.text).toBeDefined()
|
|
98
|
-
expect(summary.text.length).toBeGreaterThan(0)
|
|
99
|
-
})
|
|
100
|
-
|
|
101
|
-
test('streaming prompt yields updates', async () => {
|
|
102
|
-
const session = await client.createSession({
|
|
103
|
-
cwd: FIXTURES_DIR,
|
|
104
|
-
mcpServers: [],
|
|
105
|
-
})
|
|
106
|
-
|
|
107
|
-
// Use haiku for faster/cheaper test runs
|
|
108
|
-
await client.setModel(session.id, TEST_MODEL)
|
|
109
|
-
|
|
110
|
-
const events: string[] = []
|
|
111
|
-
|
|
112
|
-
for await (const event of client.prompt(session.id, createPrompt('Say "hello" and nothing else.'))) {
|
|
113
|
-
events.push(event.type)
|
|
114
|
-
if (event.type === 'complete') {
|
|
115
|
-
expect(event.result).toBeDefined()
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
expect(events).toContain('complete')
|
|
120
|
-
})
|
|
121
|
-
|
|
122
|
-
test('handles tool usage prompt', async () => {
|
|
123
|
-
const session = await client.createSession({
|
|
124
|
-
cwd: FIXTURES_DIR,
|
|
125
|
-
mcpServers: [],
|
|
126
|
-
})
|
|
127
|
-
|
|
128
|
-
// Use haiku for faster/cheaper test runs
|
|
129
|
-
await client.setModel(session.id, TEST_MODEL)
|
|
130
|
-
|
|
131
|
-
// Prompt that should trigger tool usage - reading a specific file
|
|
132
|
-
const { updates } = await client.promptSync(
|
|
133
|
-
session.id,
|
|
134
|
-
createPrompt('Use the Read tool to read calculator-mcp.ts and tell me what tools the MCP server provides.'),
|
|
135
|
-
)
|
|
136
|
-
|
|
137
|
-
const summary = summarizeResponse(updates)
|
|
138
|
-
|
|
139
|
-
// Verify response mentions calculator tools
|
|
140
|
-
expect(summary.text.length).toBeGreaterThan(0)
|
|
141
|
-
// Response should mention the calculator tools (add, subtract, etc.)
|
|
142
|
-
expect(summary.text.toLowerCase()).toMatch(/add|subtract|multiply|divide|calculator/)
|
|
143
|
-
})
|
|
144
|
-
|
|
145
|
-
test('uses skill from cwd', async () => {
|
|
146
|
-
const session = await client.createSession({
|
|
147
|
-
cwd: FIXTURES_DIR,
|
|
148
|
-
mcpServers: [],
|
|
149
|
-
})
|
|
150
|
-
|
|
151
|
-
// Use haiku for faster/cheaper test runs
|
|
152
|
-
await client.setModel(session.id, TEST_MODEL)
|
|
153
|
-
|
|
154
|
-
// Ask Claude to use the greeting skill
|
|
155
|
-
const { updates } = await client.promptSync(session.id, createPrompt('Please greet me using the greeting skill.'))
|
|
156
|
-
|
|
157
|
-
const summary = summarizeResponse(updates)
|
|
158
|
-
|
|
159
|
-
// The greeting skill instructs Claude to include specific phrases
|
|
160
|
-
expect(summary.text.length).toBeGreaterThan(0)
|
|
161
|
-
expect(summary.text.toLowerCase()).toMatch(/hello|greet|welcome/)
|
|
162
|
-
})
|
|
163
|
-
|
|
164
|
-
test('uses MCP server tools', async () => {
|
|
165
|
-
// Path to calculator MCP server fixture (must be absolute per ACP spec)
|
|
166
|
-
const calculatorPath = `${FIXTURES_DIR}/calculator-mcp.ts`
|
|
167
|
-
const bunPath = Bun.which('bun') ?? 'bun'
|
|
168
|
-
|
|
169
|
-
// Retry helper for flaky MCP server startup
|
|
170
|
-
const maxRetries = 3
|
|
171
|
-
let lastError: Error | undefined
|
|
172
|
-
|
|
173
|
-
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
|
174
|
-
const session = await client.createSession({
|
|
175
|
-
cwd: FIXTURES_DIR,
|
|
176
|
-
mcpServers: [
|
|
177
|
-
{
|
|
178
|
-
name: 'calculator',
|
|
179
|
-
command: bunPath,
|
|
180
|
-
args: [calculatorPath],
|
|
181
|
-
env: [],
|
|
182
|
-
},
|
|
183
|
-
],
|
|
184
|
-
})
|
|
185
|
-
|
|
186
|
-
// Set model to haiku for faster/cheaper test runs
|
|
187
|
-
await client.setModel(session.id, TEST_MODEL)
|
|
188
|
-
|
|
189
|
-
// Ask Claude to use the calculator MCP server
|
|
190
|
-
const { updates } = await client.promptSync(
|
|
191
|
-
session.id,
|
|
192
|
-
createPrompt('Use the calculator MCP server add tool to compute 15 + 27. Reply with just the number.'),
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
const summary = summarizeResponse(updates)
|
|
196
|
-
|
|
197
|
-
// Check if we got 42 in the response
|
|
198
|
-
if (summary.text.match(/42/)) {
|
|
199
|
-
expect(summary.text.length).toBeGreaterThan(0)
|
|
200
|
-
expect(summary.text).toMatch(/42/)
|
|
201
|
-
return // Success!
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
// MCP server might not have been ready, retry
|
|
205
|
-
lastError = new Error(`Attempt ${attempt}: Response did not contain 42. Got: ${summary.text.slice(0, 100)}...`)
|
|
206
|
-
if (attempt < maxRetries) {
|
|
207
|
-
console.log(`MCP test attempt ${attempt} failed, retrying...`)
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
|
|
211
|
-
// All retries exhausted
|
|
212
|
-
throw lastError ?? new Error('MCP test failed after all retries')
|
|
213
|
-
})
|
|
214
|
-
})
|