@plaited/acp-harness 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/rules/accuracy.md +43 -0
- package/.claude/rules/bun-apis.md +80 -0
- package/.claude/rules/code-review.md +254 -0
- package/.claude/rules/git-workflow.md +37 -0
- package/.claude/rules/github.md +154 -0
- package/.claude/rules/testing.md +172 -0
- package/.claude/skills/acp-harness/SKILL.md +310 -0
- package/.claude/skills/acp-harness/assets/Dockerfile.acp +25 -0
- package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +19 -0
- package/.claude/skills/acp-harness/references/downstream.md +288 -0
- package/.claude/skills/acp-harness/references/output-formats.md +221 -0
- package/.claude-plugin/marketplace.json +15 -0
- package/.claude-plugin/plugin.json +16 -0
- package/.github/CODEOWNERS +6 -0
- package/.github/workflows/ci.yml +63 -0
- package/.github/workflows/publish.yml +146 -0
- package/.mcp.json +20 -0
- package/CLAUDE.md +92 -0
- package/Dockerfile.test +23 -0
- package/LICENSE +15 -0
- package/README.md +94 -0
- package/bin/cli.ts +670 -0
- package/bin/tests/cli.spec.ts +362 -0
- package/biome.json +96 -0
- package/bun.lock +513 -0
- package/docker-compose.test.yml +21 -0
- package/package.json +57 -0
- package/scripts/bun-test-wrapper.sh +46 -0
- package/src/acp-client.ts +503 -0
- package/src/acp-helpers.ts +121 -0
- package/src/acp-transport.ts +455 -0
- package/src/acp-utils.ts +341 -0
- package/src/acp.constants.ts +56 -0
- package/src/acp.schemas.ts +161 -0
- package/src/acp.ts +27 -0
- package/src/acp.types.ts +28 -0
- package/src/tests/acp-client.spec.ts +205 -0
- package/src/tests/acp-helpers.spec.ts +105 -0
- package/src/tests/acp-integration.docker.ts +214 -0
- package/src/tests/acp-transport.spec.ts +153 -0
- package/src/tests/acp-utils.spec.ts +394 -0
- package/src/tests/fixtures/.claude/settings.local.json +8 -0
- package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +17 -0
- package/src/tests/fixtures/calculator-mcp.ts +215 -0
- package/tsconfig.json +32 -0
package/bin/cli.ts
ADDED
|
@@ -0,0 +1,670 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Execute evaluation prompts against an ACP agent.
|
|
5
|
+
*
|
|
6
|
+
* @remarks
|
|
7
|
+
* Connects to an ACP-compatible agent (Claude Code, Droid, etc.) and
|
|
8
|
+
* runs evaluation prompts, capturing full trajectories for analysis.
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* acp-harness <prompts.jsonl> --command <cmd> -o <results.jsonl>
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { appendFile } from 'node:fs/promises'
|
|
15
|
+
import { parseArgs } from 'node:util'
|
|
16
|
+
import type { PlanEntry, SessionNotification, ToolCall } from '@agentclientprotocol/sdk'
|
|
17
|
+
import { z } from 'zod'
|
|
18
|
+
import { createACPClient, createPrompt } from '../src/acp.ts'
|
|
19
|
+
|
|
20
|
+
// ============================================================================
|
|
21
|
+
// Schemas (SDK-compatible MCP server format)
|
|
22
|
+
// ============================================================================
|
|
23
|
+
|
|
24
|
+
/** Name/value pair for one environment variable of a stdio MCP server. */
const EnvVariableSchema = z.object({ name: z.string(), value: z.string() })

/** Name/value pair for one HTTP header of an HTTP MCP server. */
const HttpHeaderSchema = z.object({ name: z.string(), value: z.string() })

/** MCP server described by a command line; the `type` tag may be omitted. */
const McpServerStdioSchema = z.object({
  type: z.literal('stdio').optional(),
  name: z.string(),
  command: z.string(),
  args: z.array(z.string()),
  env: z.array(EnvVariableSchema),
})

/** MCP server described by a URL; the `type: 'http'` tag is required. */
const McpServerHttpSchema = z.object({
  type: z.literal('http'),
  name: z.string(),
  url: z.string(),
  headers: z.array(HttpHeaderSchema),
})

/** Either MCP server variant accepted by `--mcp-server`. */
const McpServerSchema = z.union([McpServerStdioSchema, McpServerHttpSchema])

/** One evaluation case, i.e. one line of the prompts JSONL file. */
const PromptCaseSchema = z.object({
  id: z.string(),
  input: z.string(),
  expected: z.string().optional(),
  metadata: z.record(z.string(), z.unknown()).optional(),
  timeout: z.number().optional(),
})

/**
 * Loose view of a tool call's raw input.
 *
 * @remarks
 * Only the fields used for judge previews are typed; every other key is
 * kept as-is via `passthrough()`.
 */
const ToolInputSchema = z
  .object({
    file_path: z.string().optional(),
    path: z.string().optional(),
    content: z.string().optional(),
    new_string: z.string().optional(),
  })
  .passthrough()
|
|
67
|
+
|
|
68
|
+
// ============================================================================
|
|
69
|
+
// Types
|
|
70
|
+
// ============================================================================
|
|
71
|
+
|
|
72
|
+
/** MCP server configuration as validated by `McpServerSchema`. */
type McpServerConfig = z.infer<typeof McpServerSchema>

/** Evaluation case as validated by `PromptCaseSchema`. */
type PromptCase = z.infer<typeof PromptCaseSchema>

/** Agent thought text captured from the session. */
type ThoughtStep = { type: 'thought'; content: string; timestamp: number }

/** Agent message text captured from the session. */
type MessageStep = { type: 'message'; content: string; timestamp: number }

/** Tool invocation together with whatever status/input/output was reported for it. */
type ToolCallStep = {
  type: 'tool_call'
  name: string
  status: string
  input?: unknown
  output?: unknown
  duration?: number
  timestamp: number
}

/** Plan reported by the agent. */
type PlanStep = { type: 'plan'; entries: PlanEntry[]; timestamp: number }

/** Trajectory step types */
type TrajectoryStep = ThoughtStep | MessageStep | ToolCallStep | PlanStep

/** Outcome recorded for one evaluation case. */
type ResultStatus = 'passed' | 'failed' | 'error' | 'timeout'

/** Full output format */
type FullResult = {
  id: string
  input: string
  output: string
  expected?: string
  trajectory: TrajectoryStep[]
  metadata: Record<string, unknown>
  timing: {
    start: number
    end: number
    firstResponse?: number
  }
  status: ResultStatus
  errors?: string[]
}

/** Summary output format */
type SummaryResult = {
  id: string
  input: string
  output: string
  toolCalls: string[]
  status: ResultStatus
  duration: number
}

/** Values accepted by `--format`. */
type OutputFormat = 'summary' | 'judge'

/** Step with unique ID for judge format correlation */
type IndexedStep = TrajectoryStep & { stepId: string }
|
|
121
|
+
|
|
122
|
+
// ============================================================================
|
|
123
|
+
// Argument Parsing
|
|
124
|
+
// ============================================================================
|
|
125
|
+
|
|
126
|
+
// Command-line definition. `--cmd` and `--command` are both accepted for the
// agent command; `--mcp-server` may be repeated.
const { values, positionals } = parseArgs({
  args: Bun.argv.slice(2),
  allowPositionals: true,
  options: {
    command: { type: 'string' },
    cmd: { type: 'string' },
    output: { type: 'string', short: 'o' },
    cwd: { type: 'string', short: 'c' },
    timeout: { type: 'string', short: 't', default: '60000' },
    format: { type: 'string', short: 'f', default: 'summary' },
    progress: { type: 'boolean', default: false },
    append: { type: 'boolean', default: false },
    'mcp-server': { type: 'string', multiple: true },
    help: { type: 'boolean', short: 'h' },
  },
})
|
|
172
|
+
|
|
173
|
+
// Print usage and stop when help is requested or no prompts file was given.
// Exit code is 0 for an explicit --help and 1 for a missing argument.
if (values.help || positionals.length === 0) {
  const usageText = `
Usage: acp-harness <prompts.jsonl> [options]

Arguments:
prompts.jsonl Input file with evaluation prompts

Options:
--cmd, --command ACP agent command (default: "claude-code-acp")
-o, --output Output file (default: stdout)
-c, --cwd Working directory for agent
-t, --timeout Request timeout in ms (default: 60000)
-f, --format Output format: summary, judge (default: summary)
--progress Show progress to stderr
--append Append to output file instead of overwriting
--mcp-server MCP server config JSON (repeatable)
-h, --help Show this help message

Input Format (JSONL):
{"id":"test-001","input":"Create a button","expected":"should contain <button>","metadata":{"category":"ui"}}

Output Formats:
summary - Minimal JSONL: id, input, output, toolCalls, status, duration
judge - Two-tier output:
1. Markdown with step IDs and head/tail previews → <output>.md
2. Full trajectory JSONL for reference → <output>.full.jsonl

Examples:
# Using the default claude-code-acp adapter
acp-harness prompts.jsonl -o results.jsonl

# Using bunx to run an adapter
acp-harness prompts.jsonl --cmd "bunx claude-code-acp" -o results.jsonl

# Using a local adapter script
acp-harness prompts.jsonl --cmd "bun ./my-adapter.ts" -o results.jsonl

# With judge format for LLM evaluation
acp-harness prompts.jsonl --cmd "bunx claude-code-acp" --format judge -o results

Note: Requires an ACP-compatible agent. For Claude Code, install the adapter:
npm install -g @zed-industries/claude-code-acp
ANTHROPIC_API_KEY=sk-... acp-harness prompts.jsonl -o results.jsonl
`
  // biome-ignore lint/suspicious/noConsole: CLI help output
  console.log(usageText)
  const exitCode = values.help ? 0 : 1
  process.exit(exitCode)
}
|
|
220
|
+
|
|
221
|
+
// ============================================================================
|
|
222
|
+
// Helpers
|
|
223
|
+
// ============================================================================
|
|
224
|
+
|
|
225
|
+
/**
 * Parse a command string into an argv-style array.
 *
 * @remarks
 * Tokens are separated by runs of whitespace. A segment wrapped in single or
 * double quotes is kept as one token with the quotes removed, so
 * `bun "./my adapter.ts"` yields `['bun', './my adapter.ts']`. An
 * unterminated quote extends to the end of the string. No escape sequences
 * or shell expansions are interpreted.
 *
 * @param cmd - Command line as typed by the user
 * @returns The command followed by its arguments; empty for a blank string
 */
const parseCommand = (cmd: string): string[] => {
  const whitespace = /\s/
  const tokens: string[] = []
  let current = ''
  // Tracks whether a token has started, so that `""` still produces an (empty) argument.
  let inToken = false
  let quote: string | undefined

  for (const char of cmd) {
    if (quote !== undefined) {
      if (char === quote) {
        quote = undefined
      } else {
        current += char
      }
    } else if (char === '"' || char === "'") {
      quote = char
      inToken = true
    } else if (whitespace.test(char)) {
      if (inToken) tokens.push(current)
      current = ''
      inToken = false
    } else {
      current += char
      inToken = true
    }
  }

  if (inToken) tokens.push(current)
  return tokens
}
|
|
229
|
+
|
|
230
|
+
/**
 * Turn one `--mcp-server` JSON argument into a validated server config.
 *
 * @param json - JSON text describing a stdio or HTTP MCP server
 * @returns The config as validated by `McpServerSchema`
 * @throws If the text is not valid JSON or does not match the schema
 */
const parseMcpServerConfig = (json: string): McpServerConfig => {
  const raw: unknown = JSON.parse(json)
  return McpServerSchema.parse(raw)
}
|
|
234
|
+
|
|
235
|
+
/**
 * Load prompts from a JSONL file.
 *
 * @remarks
 * Each non-blank line must be a JSON object matching `PromptCaseSchema`.
 * Blank and whitespace-only lines are skipped, and both LF and CRLF line
 * endings are accepted. Line numbers in error messages refer to the physical
 * line in the file, counting skipped blank lines.
 *
 * @param path - Path of the JSONL file to read
 * @returns The parsed prompt cases in file order
 * @throws Error naming the 1-based line number of the first invalid line
 */
const loadPrompts = async (path: string): Promise<PromptCase[]> => {
  const content = await Bun.file(path).text()
  const prompts: PromptCase[] = []

  // Index before filtering so the reported line number matches the file.
  for (const [index, rawLine] of content.split(/\r?\n/).entries()) {
    const line = rawLine.trim()
    if (line === '') continue

    try {
      prompts.push(PromptCaseSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : String(error)}`)
    }
  }

  return prompts
}
|
|
250
|
+
|
|
251
|
+
/**
 * Extract trajectory from session notifications.
 *
 * @remarks
 * Text thought chunks, text message chunks, tool calls and plans become
 * trajectory steps; other notification kinds are ignored. Notifications that
 * share a `toolCallId` are merged into one `tool_call` step. This covers both
 * the initial `tool_call` and later `tool_call_update` notifications, and an
 * update only overwrites the fields it actually carries.
 *
 * Timestamps are taken with `Date.now()` while this function iterates, so
 * they record when a notification was processed here, not when the agent
 * emitted it.
 *
 * @param notifications - Session notifications collected for one prompt
 * @param startTime - Epoch milliseconds that timestamps are made relative to
 * @returns Steps in notification order, one step per distinct tool call
 */
const extractTrajectory = (notifications: SessionNotification[], startTime: number): TrajectoryStep[] => {
  const trajectory: TrajectoryStep[] = []
  const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()

  /** Copy reported output onto the step; `rawOutput` takes precedence over `content`. */
  const applyOutput = (step: TrajectoryStep & { type: 'tool_call' }, toolCall: Partial<ToolCall>): void => {
    if (toolCall.content) {
      step.output = toolCall.content
    }
    if (toolCall.rawOutput !== undefined && toolCall.rawOutput !== null) {
      step.output = toolCall.rawOutput
    }
  }

  for (const notification of notifications) {
    const timestamp = Date.now() - startTime
    const update = notification.update

    if (update.sessionUpdate === 'agent_thought_chunk' && update.content.type === 'text') {
      trajectory.push({
        type: 'thought',
        content: update.content.text,
        timestamp,
      })
    } else if (update.sessionUpdate === 'agent_message_chunk' && update.content.type === 'text') {
      trajectory.push({
        type: 'message',
        content: update.content.text,
        timestamp,
      })
    } else if (update.sessionUpdate === 'tool_call' || update.sessionUpdate === 'tool_call_update') {
      // Updates may omit every field except the id, so treat all others as optional.
      const toolCall = update as Partial<ToolCall> & Pick<ToolCall, 'toolCallId'>
      const existing = toolCallMap.get(toolCall.toolCallId)

      if (existing) {
        // Merge into the existing step; a missing status must not reset it to 'pending'.
        if (toolCall.status != null) {
          existing.step.status = toolCall.status
        }
        if (existing.step.input === undefined && toolCall.rawInput !== undefined) {
          existing.step.input = toolCall.rawInput
        }
        applyOutput(existing.step, toolCall)
        existing.step.duration = timestamp - existing.start
      } else {
        // First sighting of this tool call (normally the initial `tool_call`).
        const step: TrajectoryStep & { type: 'tool_call' } = {
          type: 'tool_call',
          name: toolCall.title ?? toolCall.toolCallId,
          status: toolCall.status ?? 'pending',
          input: toolCall.rawInput,
          timestamp,
        }
        applyOutput(step, toolCall)
        toolCallMap.set(toolCall.toolCallId, { start: timestamp, step })
        trajectory.push(step)
      }
    } else if (update.sessionUpdate === 'plan') {
      trajectory.push({
        type: 'plan',
        entries: update.entries,
        timestamp,
      })
    }
  }

  return trajectory
}
|
|
309
|
+
|
|
310
|
+
/**
 * Build the agent's text output from a trajectory.
 *
 * @param trajectory - Steps recorded for one prompt
 * @returns The content of every `message` step, in order, joined by newlines
 */
const extractOutput = (trajectory: TrajectoryStep[]): string => {
  const messages: string[] = []
  for (const step of trajectory) {
    if (step.type === 'message') {
      messages.push(step.content)
    }
  }
  return messages.join('\n')
}
|
|
317
|
+
|
|
318
|
+
/**
 * Report whether the trajectory contains a failed tool call.
 *
 * @param trajectory - Steps recorded for one prompt
 * @returns `true` when at least one `tool_call` step has status `'failed'`
 */
const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => {
  for (const step of trajectory) {
    if (step.type === 'tool_call' && step.status === 'failed') {
      return true
    }
  }
  return false
}
|
|
322
|
+
|
|
323
|
+
/** Default number of leading lines kept in a head/tail preview. */
const HEAD_LINES = 8
/** Default number of trailing lines kept in a head/tail preview. */
const TAIL_LINES = 4
/** Content longer than this many characters is shortened in judge markdown. */
const MAX_CONTENT_LENGTH = 500

/**
 * Extract head and tail lines from content.
 *
 * @remarks
 * Content with no more lines than `headLines + tailLines` is returned
 * unchanged. Otherwise the middle is replaced by an "N lines omitted" marker.
 * Negative counts are treated as 0, and a count of 0 drops that section
 * entirely (a plain `slice(-0)` would return every line).
 *
 * @param content - Text to preview
 * @param headLines - Number of leading lines to keep
 * @param tailLines - Number of trailing lines to keep
 * @returns The original content or the shortened preview
 */
const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => {
  const headCount = Math.max(0, headLines)
  const tailCount = Math.max(0, tailLines)
  const lines = content.split('\n')
  if (lines.length <= headCount + tailCount) {
    return content
  }

  const omitted = lines.length - headCount - tailCount
  const sections: string[] = []
  if (headCount > 0) {
    sections.push(lines.slice(0, headCount).join('\n'))
  }
  sections.push(`// ... ${omitted} lines omitted ...`)
  if (tailCount > 0) {
    sections.push(lines.slice(-tailCount).join('\n'))
  }
  return sections.join('\n\n')
}
|
|
339
|
+
|
|
340
|
+
/**
 * Read a file path out of a tool call's raw input.
 *
 * @param input - Raw tool input of unknown shape
 * @returns `file_path` if present, otherwise `path`; `undefined` when the
 * input does not match `ToolInputSchema` or has neither field
 */
const extractFilePath = (input: unknown): string | undefined => {
  const parsed = ToolInputSchema.safeParse(input)
  return parsed.success ? (parsed.data.file_path ?? parsed.data.path) : undefined
}
|
|
346
|
+
|
|
347
|
+
/**
 * Read written/edited text out of a tool call's raw input.
 *
 * @param input - Raw tool input of unknown shape
 * @returns `content` if present, otherwise `new_string`; `undefined` when the
 * input does not match `ToolInputSchema` or has neither field
 */
const extractContent = (input: unknown): string | undefined => {
  const parsed = ToolInputSchema.safeParse(input)
  return parsed.success ? (parsed.data.content ?? parsed.data.new_string) : undefined
}
|
|
353
|
+
|
|
354
|
+
/**
 * Serialize a result as one summary JSONL line.
 *
 * @param result - Full evaluation result
 * @returns JSON with `id`, `input`, `output`, the names of all tool calls,
 * `status`, and `duration` (end minus start of `timing`)
 */
const formatSummary = (result: FullResult): string => {
  const toolCalls: string[] = []
  for (const step of result.trajectory) {
    if (step.type === 'tool_call') {
      toolCalls.push(step.name)
    }
  }

  const { id, input, output, status, timing } = result
  const summary: SummaryResult = {
    id,
    input,
    output,
    toolCalls,
    status,
    duration: timing.end - timing.start,
  }
  return JSON.stringify(summary)
}
|
|
366
|
+
|
|
367
|
+
/**
 * Format result as judge markdown with step IDs.
 *
 * @remarks
 * Produces a numbered trajectory where every entry ends with a
 * `[→<id>-step-<n>]` marker. `n` is the 1-based position of the step in the
 * trajectory, the same numbering `addStepIds` uses. Thoughts and messages are
 * cut to 100 characters, plans to 80 and the final output to 200. Tool calls
 * that carry file content also get a fenced preview, shortened with
 * `headTailPreview` above `MAX_CONTENT_LENGTH` characters.
 *
 * @param result - Full evaluation result
 * @returns Markdown for one record, terminated by a `---` separator
 */
const formatJudgeMarkdown = (result: FullResult): string => {
  /** Cut text to `max` characters, adding an ellipsis only when something was removed. */
  const clip = (text: string, max: number): string => (text.length > max ? `${text.slice(0, max)}...` : text)

  /** Code-fence language from the file name's extension; `typescript` when no path is known. */
  const fenceLanguage = (filePath: string | undefined): string => {
    if (filePath === undefined) return 'typescript'
    const fileName = filePath.split(/[\\/]/).pop() ?? ''
    const dot = fileName.lastIndexOf('.')
    // No extension (or a dotfile such as `.env`): avoid using the whole path as the language.
    return dot > 0 && dot < fileName.length - 1 ? fileName.slice(dot + 1) : 'text'
  }

  const lines: string[] = [
    `## Evaluation Record: ${result.id}`,
    '',
    `**Input:** ${result.input}`,
    '',
    '**Trajectory:**',
  ]

  result.trajectory.forEach((step, index) => {
    const stepNum = index + 1
    const stepId = `${result.id}-step-${stepNum}`

    switch (step.type) {
      case 'thought': {
        lines.push(`${stepNum}. [THOUGHT] ${clip(step.content, 100)} [→${stepId}]`)
        break
      }
      case 'tool_call': {
        // Compare against undefined so a measured duration of 0ms is still shown.
        const duration = step.duration !== undefined ? ` (${step.duration}ms)` : ''
        const filePath = extractFilePath(step.input)
        const content = extractContent(step.input)

        lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)

        // Add file path if present
        if (filePath) {
          const charCount = content?.length ?? 0
          lines.push(` File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
        }

        // Add head/tail preview for content-producing tools
        if (content && content.length > 0) {
          const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content) : content
          lines.push(` \`\`\`${fenceLanguage(filePath)}`)
          lines.push(` ${preview.split('\n').join('\n ')}`)
          lines.push(` \`\`\``)
        }
        break
      }
      case 'plan': {
        const planSummary = step.entries.map((e) => `${e.content}: ${e.status}`).join(', ')
        lines.push(`${stepNum}. [PLAN] ${clip(planSummary, 80)} [→${stepId}]`)
        break
      }
      case 'message': {
        lines.push(`${stepNum}. [MESSAGE] ${clip(step.content, 100)} [→${stepId}]`)
        break
      }
    }
  })

  lines.push('')
  lines.push(`**Output:** ${clip(result.output, 200)}`)
  lines.push('')

  // Strings are printed as-is; other values go through JSON so objects do not become "[object Object]".
  const metadataStr = Object.entries(result.metadata)
    .map(([k, v]) => `${k}=${typeof v === 'string' ? v : JSON.stringify(v)}`)
    .join(', ')
  lines.push(`**Metadata:** ${metadataStr}`)
  lines.push(`**Status:** ${result.status}`)
  lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)
  lines.push('')
  lines.push('---')
  lines.push('')

  return lines.join('\n')
}
|
|
440
|
+
|
|
441
|
+
/**
 * Copy a result with a `stepId` attached to every trajectory step.
 *
 * @param result - Full evaluation result (not modified)
 * @returns A shallow copy whose steps carry `<id>-step-<n>`, with `n` counting from 1
 */
const addStepIds = (result: FullResult): FullResult & { trajectory: IndexedStep[] } => {
  const trajectory = result.trajectory.map((step, index) => ({
    ...step,
    stepId: `${result.id}-step-${index + 1}`,
  }))
  return { ...result, trajectory }
}
|
|
450
|
+
|
|
451
|
+
/**
 * Render a result in the requested output format.
 *
 * @param result - Full evaluation result
 * @param format - `'summary'` for a JSONL line, anything else for judge markdown
 * @returns The formatted text
 */
const formatResult = (result: FullResult, format: OutputFormat): string =>
  format === 'summary' ? formatSummary(result) : formatJudgeMarkdown(result)
|
|
459
|
+
|
|
460
|
+
/**
 * Serialize the full result, with step IDs, as one JSONL line.
 *
 * @param result - Full evaluation result
 * @returns JSON of the result after `addStepIds` (the judge format's paired file)
 */
const formatFullWithStepIds = (result: FullResult): string => {
  const indexed = addStepIds(result)
  return JSON.stringify(indexed)
}
|
|
464
|
+
|
|
465
|
+
/**
 * Emit one line of output.
 *
 * @param line - Text to write; a trailing newline is added for file output
 * @param outputPath - Destination file; when omitted the line goes to stdout
 * @param append - Append to the file instead of replacing its contents
 */
const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(line)
    return
  }

  const payload = `${line}\n`
  if (append) {
    await appendFile(outputPath, payload)
    return
  }
  await Bun.write(outputPath, payload)
}
|
|
478
|
+
|
|
479
|
+
/**
 * Write a progress message to stderr so stdout stays reserved for results.
 *
 * @param message - Text to log
 * @param showProgress - When `false` nothing is written
 */
const logProgress = (message: string, showProgress: boolean): void => {
  if (!showProgress) return
  console.error(message)
}
|
|
485
|
+
|
|
486
|
+
/**
 * Make a path absolute against the current working directory.
 *
 * @param path - Path as given on the command line
 * @returns The path unchanged when it starts with `/`, otherwise
 * `process.cwd()` joined to it with a `/`
 */
const resolvePath = (path: string): string => (path.startsWith('/') ? path : `${process.cwd()}/${path}`)
|
|
491
|
+
|
|
492
|
+
// ============================================================================
|
|
493
|
+
// Main
|
|
494
|
+
// ============================================================================
|
|
495
|
+
|
|
496
|
+
/**
 * CLI entry point: run every prompt against the agent and write the results.
 *
 * @remarks
 * Flow: validate options, load prompts, connect, create one session, run the
 * prompts sequentially in that session, write one result per prompt, then
 * disconnect (also on failure). A prompt that throws is recorded with status
 * `error`, or `timeout` when the message mentions a timeout, and the run
 * continues.
 *
 * Output files are truncated once up front unless `--append` is set, and
 * every result is then appended. `--append` therefore never overwrites
 * existing content, including on the first result.
 */
const main = async () => {
  const promptsPath = positionals[0]
  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  const agentCommand = parseCommand(values.cmd ?? values.command ?? 'claude-code-acp')
  const outputPath = values.output
  const timeout = Number.parseInt(values.timeout ?? '60000', 10)
  const cwd = values.cwd
  const format = (values.format ?? 'summary') as OutputFormat
  const showProgress = values.progress ?? false
  const appendOutput = values.append ?? false

  // parseInt yields NaN for input such as "abc"; fail early instead of passing it to the client.
  if (Number.isNaN(timeout) || timeout < 0) {
    console.error(`Error: Invalid timeout "${values.timeout}". Must be a non-negative number of milliseconds`)
    process.exit(1)
  }

  // Validate format
  if (!['summary', 'judge'].includes(format)) {
    console.error(`Error: Invalid format "${format}". Must be: summary, judge`)
    process.exit(1)
  }

  // Judge format requires output path (creates two files)
  if (format === 'judge' && !outputPath) {
    console.error('Error: --format judge requires --output <path> (creates <path>.md and <path>.full.jsonl)')
    process.exit(1)
  }

  // Parse MCP server configurations (already SDK-compatible format)
  const mcpServers = (values['mcp-server'] ?? []).map(parseMcpServerConfig)

  // Load prompts
  const prompts = await loadPrompts(promptsPath)

  // Resolve output path relative to process.cwd()
  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined

  // Compute output paths for judge format (creates two files)
  const judgeMarkdownPath = format === 'judge' && resolvedOutputPath ? `${resolvedOutputPath}.md` : undefined
  const judgeFullPath = format === 'judge' && resolvedOutputPath ? `${resolvedOutputPath}.full.jsonl` : undefined

  // Log progress info
  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, showProgress)
  logProgress(`Command: ${agentCommand.join(' ')}`, showProgress)
  logProgress(`Format: ${format}`, showProgress)
  if (format === 'judge') {
    logProgress(`Output: ${judgeMarkdownPath} + ${judgeFullPath}`, showProgress)
  } else if (resolvedOutputPath) {
    logProgress(`Output: ${resolvedOutputPath}`, showProgress)
  }
  if (mcpServers.length > 0) {
    logProgress(`MCP Servers: ${mcpServers.map((s) => s.name).join(', ')}`, showProgress)
  }

  // Create ACP client
  const client = createACPClient({
    command: agentCommand,
    cwd,
    timeout,
  })

  // Clear output file(s) if not appending; all later writes append.
  if (!appendOutput) {
    if (judgeMarkdownPath && judgeFullPath) {
      await Bun.write(judgeMarkdownPath, '')
      await Bun.write(judgeFullPath, '')
    } else if (resolvedOutputPath) {
      await Bun.write(resolvedOutputPath, '')
    }
  }

  // Session params with MCP servers
  const sessionParams = {
    cwd: cwd ?? process.cwd(),
    mcpServers,
  }

  try {
    logProgress('Connecting to agent...', showProgress)
    await client.connect()
    logProgress('Connected!', showProgress)

    // Create session with MCP servers
    const session = await client.createSession(sessionParams)
    logProgress(`Session: ${session.id}`, showProgress)

    // Run evaluations sequentially
    for (let i = 0; i < prompts.length; i++) {
      const promptCase = prompts[i]
      if (!promptCase) continue

      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${promptCase.input.slice(0, 50)}...`, showProgress)

      const startTime = Date.now()
      let result: FullResult

      try {
        const prompt = createPrompt(promptCase.input)
        const { updates } = await client.promptSync(session.id, prompt)

        const endTime = Date.now()
        const trajectory = extractTrajectory(updates, startTime)
        const output = extractOutput(trajectory)
        const hasErrors = hasToolErrors(trajectory)

        result = {
          id: promptCase.id,
          input: promptCase.input,
          output,
          ...(promptCase.expected && { expected: promptCase.expected }),
          trajectory,
          metadata: {
            ...promptCase.metadata,
            agent: agentCommand.join(' '),
          },
          timing: {
            start: startTime,
            end: endTime,
            firstResponse: trajectory[0]?.timestamp,
          },
          status: hasErrors ? 'failed' : 'passed',
        }
      } catch (error) {
        // Record the failure for this prompt and keep going with the next one.
        const endTime = Date.now()
        const message = error instanceof Error ? error.message : String(error)
        const isTimeout = message.includes('timeout') || message.includes('timed out')

        result = {
          id: promptCase.id,
          input: promptCase.input,
          output: '',
          trajectory: [],
          metadata: {
            ...promptCase.metadata,
            agent: agentCommand.join(' '),
          },
          timing: {
            start: startTime,
            end: endTime,
          },
          status: isTimeout ? 'timeout' : 'error',
          errors: [message],
        }
      }

      // Format and output result. Files were truncated above when needed, so always append.
      if (judgeMarkdownPath && judgeFullPath) {
        // Judge format: write markdown to .md, full JSONL to .full.jsonl
        await writeOutput(formatJudgeMarkdown(result), judgeMarkdownPath, true)
        await writeOutput(formatFullWithStepIds(result), judgeFullPath, true)
      } else {
        // Summary format: write to single file (or stdout when no path was given)
        await writeOutput(formatResult(result, format), resolvedOutputPath, true)
      }

      const statusIcon = result.status === 'passed' ? '✓' : result.status === 'failed' ? '✗' : '!'
      logProgress(`  ${statusIcon} ${result.status} (${result.timing.end - result.timing.start}ms)`, showProgress)
    }
  } finally {
    logProgress('Disconnecting...', showProgress)
    await client.disconnect()
  }

  logProgress('Done!', showProgress)
}

// Any error that escapes main() is reported and turned into exit code 1.
main().catch((error) => {
  console.error('Error:', error)
  process.exit(1)
})
|