@plaited/acp-harness 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/.claude/rules/accuracy.md +43 -0
  2. package/.claude/rules/bun-apis.md +80 -0
  3. package/.claude/rules/code-review.md +254 -0
  4. package/.claude/rules/git-workflow.md +37 -0
  5. package/.claude/rules/github.md +154 -0
  6. package/.claude/rules/testing.md +172 -0
  7. package/.claude/skills/acp-harness/SKILL.md +310 -0
  8. package/.claude/skills/acp-harness/assets/Dockerfile.acp +25 -0
  9. package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +19 -0
  10. package/.claude/skills/acp-harness/references/downstream.md +288 -0
  11. package/.claude/skills/acp-harness/references/output-formats.md +221 -0
  12. package/.claude-plugin/marketplace.json +15 -0
  13. package/.claude-plugin/plugin.json +16 -0
  14. package/.github/CODEOWNERS +6 -0
  15. package/.github/workflows/ci.yml +63 -0
  16. package/.github/workflows/publish.yml +146 -0
  17. package/.mcp.json +20 -0
  18. package/CLAUDE.md +92 -0
  19. package/Dockerfile.test +23 -0
  20. package/LICENSE +15 -0
  21. package/README.md +94 -0
  22. package/bin/cli.ts +670 -0
  23. package/bin/tests/cli.spec.ts +362 -0
  24. package/biome.json +96 -0
  25. package/bun.lock +513 -0
  26. package/docker-compose.test.yml +21 -0
  27. package/package.json +57 -0
  28. package/scripts/bun-test-wrapper.sh +46 -0
  29. package/src/acp-client.ts +503 -0
  30. package/src/acp-helpers.ts +121 -0
  31. package/src/acp-transport.ts +455 -0
  32. package/src/acp-utils.ts +341 -0
  33. package/src/acp.constants.ts +56 -0
  34. package/src/acp.schemas.ts +161 -0
  35. package/src/acp.ts +27 -0
  36. package/src/acp.types.ts +28 -0
  37. package/src/tests/acp-client.spec.ts +205 -0
  38. package/src/tests/acp-helpers.spec.ts +105 -0
  39. package/src/tests/acp-integration.docker.ts +214 -0
  40. package/src/tests/acp-transport.spec.ts +153 -0
  41. package/src/tests/acp-utils.spec.ts +394 -0
  42. package/src/tests/fixtures/.claude/settings.local.json +8 -0
  43. package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +17 -0
  44. package/src/tests/fixtures/calculator-mcp.ts +215 -0
  45. package/tsconfig.json +32 -0
package/bin/cli.ts ADDED
@@ -0,0 +1,670 @@
1
+ #!/usr/bin/env bun
2
+
3
+ /**
4
+ * Execute evaluation prompts against an ACP agent.
5
+ *
6
+ * @remarks
7
+ * Connects to an ACP-compatible agent (Claude Code, Droid, etc.) and
8
+ * runs evaluation prompts, capturing full trajectories for analysis.
9
+ *
10
+ * Usage:
11
+ * acp-harness <prompts.jsonl> --command <cmd> -o <results.jsonl>
12
+ */
13
+
14
import { appendFile } from 'node:fs/promises'
import { resolve } from 'node:path'
import { parseArgs } from 'node:util'
import type { PlanEntry, SessionNotification, ToolCall } from '@agentclientprotocol/sdk'
import { z } from 'zod'
import { createACPClient, createPrompt } from '../src/acp.ts'
19
+
20
+ // ============================================================================
21
+ // Schemas (SDK-compatible MCP server format)
22
+ // ============================================================================
23
+
24
/** One environment variable entry (name/value pair) for a stdio MCP server */
const EnvVariableSchema = z.object({
  name: z.string(),
  value: z.string(),
})

/** One HTTP header entry (name/value pair) for an HTTP MCP server */
const HttpHeaderSchema = z.object({
  name: z.string(),
  value: z.string(),
})

/**
 * MCP server launched as a subprocess.
 *
 * @remarks
 * `type` may be omitted; when present it must be the literal `'stdio'`.
 * `args` and `env` are required arrays (they may be empty).
 */
const McpServerStdioSchema = z.object({
  type: z.literal('stdio').optional(),
  name: z.string(),
  command: z.string(),
  args: z.array(z.string()),
  env: z.array(EnvVariableSchema),
})

/** MCP server reached over HTTP; `type` must be the literal `'http'` */
const McpServerHttpSchema = z.object({
  type: z.literal('http'),
  name: z.string(),
  url: z.string(),
  headers: z.array(HttpHeaderSchema),
})

/** Any supported MCP server config; used to validate each `--mcp-server` JSON value */
const McpServerSchema = z.union([McpServerStdioSchema, McpServerHttpSchema])

/**
 * One evaluation case, i.e. one line of the input JSONL file.
 *
 * @remarks
 * NOTE(review): `timeout` is accepted by the schema but is not read anywhere
 * in this file; only the global `--timeout` flag reaches the client.
 */
const PromptCaseSchema = z.object({
  id: z.string(),
  input: z.string(),
  expected: z.string().optional(),
  metadata: z.record(z.string(), z.unknown()).optional(),
  timeout: z.number().optional(),
})

/**
 * Fields looked up in a tool call's raw input when rendering the judge format
 * (file path and written content). All are optional, and `.passthrough()`
 * keeps any other keys instead of stripping them.
 */
const ToolInputSchema = z
  .object({
    file_path: z.string().optional(),
    path: z.string().optional(),
    content: z.string().optional(),
    new_string: z.string().optional(),
  })
  .passthrough()
67
+
68
+ // ============================================================================
69
+ // Types
70
+ // ============================================================================
71
+
72
/** Validated MCP server config, derived from `McpServerSchema` */
type McpServerConfig = z.infer<typeof McpServerSchema>
/** Validated evaluation case, derived from `PromptCaseSchema` */
type PromptCase = z.infer<typeof PromptCaseSchema>

/**
 * Trajectory step types
 *
 * @remarks
 * Discriminated on `type`. `timestamp` is the millisecond offset from the
 * prompt's start time as computed by `extractTrajectory`.
 */
type TrajectoryStep =
  | { type: 'thought'; content: string; timestamp: number }
  | { type: 'message'; content: string; timestamp: number }
  | {
      type: 'tool_call'
      name: string
      status: string
      input?: unknown
      output?: unknown
      // Offset difference between the first and the latest notification for this tool call
      duration?: number
      timestamp: number
    }
  | { type: 'plan'; entries: PlanEntry[]; timestamp: number }

/** Full output format */
type FullResult = {
  id: string
  input: string
  // Agent message text joined from the trajectory (empty string on error/timeout)
  output: string
  expected?: string
  trajectory: TrajectoryStep[]
  // Prompt metadata plus the `agent` command string added by `main`
  metadata: Record<string, unknown>
  timing: {
    // `Date.now()` values taken immediately before and after the prompt round trip
    start: number
    end: number
    // Timestamp of the first trajectory step, when there is one
    firstResponse?: number
  }
  status: 'passed' | 'failed' | 'error' | 'timeout'
  // Set only on the error/timeout path
  errors?: string[]
}

/** Summary output format */
type SummaryResult = {
  id: string
  input: string
  output: string
  // Names of the tool calls in trajectory order
  toolCalls: string[]
  status: 'passed' | 'failed' | 'error' | 'timeout'
  // `timing.end - timing.start` in milliseconds
  duration: number
}

/** Values accepted by the `--format` flag */
type OutputFormat = 'summary' | 'judge'

/** Step with unique ID for judge format correlation */
type IndexedStep = TrajectoryStep & { stepId: string }
121
+
122
+ // ============================================================================
123
+ // Argument Parsing
124
+ // ============================================================================
125
+
126
/**
 * Parsed command line: `values` holds the flags, `positionals` the remaining
 * arguments (the prompts file is expected first).
 *
 * @remarks
 * `timeout` stays a string here and is converted to a number in `main`.
 */
const { values, positionals } = parseArgs({
  allowPositionals: true,
  // Drop the runtime executable and the script path
  args: Bun.argv.slice(2),
  options: {
    // Two spellings of the agent command flag
    command: { type: 'string' },
    cmd: { type: 'string' },
    output: { type: 'string', short: 'o' },
    cwd: { type: 'string', short: 'c' },
    timeout: { type: 'string', short: 't', default: '60000' },
    format: { type: 'string', short: 'f', default: 'summary' },
    progress: { type: 'boolean', default: false },
    append: { type: 'boolean', default: false },
    // Repeatable: each occurrence carries one JSON-encoded server config
    'mcp-server': { type: 'string', multiple: true },
    help: { type: 'boolean', short: 'h' },
  },
})
172
+
173
// Print usage and stop when help is requested or no prompts file was given.
if (values.help || positionals.length === 0) {
  // biome-ignore lint/suspicious/noConsole: CLI help output
  console.log(`
Usage: acp-harness <prompts.jsonl> [options]

Arguments:
prompts.jsonl Input file with evaluation prompts

Options:
--cmd, --command ACP agent command (default: "claude-code-acp")
-o, --output Output file (default: stdout)
-c, --cwd Working directory for agent
-t, --timeout Request timeout in ms (default: 60000)
-f, --format Output format: summary, judge (default: summary)
--progress Show progress to stderr
--append Append to output file instead of overwriting
--mcp-server MCP server config JSON (repeatable)
-h, --help Show this help message

Input Format (JSONL):
{"id":"test-001","input":"Create a button","expected":"should contain <button>","metadata":{"category":"ui"}}

Output Formats:
summary - Minimal JSONL: id, input, output, toolCalls, status, duration
judge - Two-tier output:
1. Markdown with step IDs and head/tail previews → <output>.md
2. Full trajectory JSONL for reference → <output>.full.jsonl

Examples:
# Using the default claude-code-acp adapter
acp-harness prompts.jsonl -o results.jsonl

# Using bunx to run an adapter
acp-harness prompts.jsonl --cmd "bunx claude-code-acp" -o results.jsonl

# Using a local adapter script
acp-harness prompts.jsonl --cmd "bun ./my-adapter.ts" -o results.jsonl

# With judge format for LLM evaluation
acp-harness prompts.jsonl --cmd "bunx claude-code-acp" --format judge -o results

Note: Requires an ACP-compatible agent. For Claude Code, install the adapter:
npm install -g @zed-industries/claude-code-acp
ANTHROPIC_API_KEY=sk-... acp-harness prompts.jsonl -o results.jsonl
`)
  // Exit 0 for an explicit --help, 1 when the usage is shown because the prompts file is missing
  process.exit(values.help ? 0 : 1)
}
220
+
221
+ // ============================================================================
222
+ // Helpers
223
+ // ============================================================================
224
+
225
/**
 * Parse command string into command array
 *
 * @remarks
 * Tokens are separated by unquoted whitespace. A single- or double-quoted
 * span is kept inside its token with the quotes removed, so arguments that
 * contain spaces survive (e.g. `bun "./my adapter.ts"`). Input without quotes
 * splits exactly like a plain whitespace split. An unterminated quote runs to
 * the end of the string.
 *
 * @param cmd - Command line as typed by the user
 * @returns The executable followed by its arguments; empty for blank input
 */
const parseCommand = (cmd: string): string[] => {
  const whitespace = /\s/
  const tokens: string[] = []
  let current = ''
  // Tracks whether a token has started, so `""` can yield an empty argument
  let inToken = false
  let quote: string | undefined

  for (const char of cmd) {
    if (quote !== undefined) {
      if (char === quote) {
        quote = undefined
      } else {
        current += char
      }
    } else if (char === '"' || char === "'") {
      quote = char
      inToken = true
    } else if (whitespace.test(char)) {
      if (inToken) {
        tokens.push(current)
        current = ''
        inToken = false
      }
    } else {
      current += char
      inToken = true
    }
  }

  if (inToken) tokens.push(current)
  return tokens
}
229
+
230
/**
 * Decode one `--mcp-server` value and validate it against `McpServerSchema`.
 *
 * @param json - JSON text describing a stdio or HTTP MCP server
 * @returns The validated server config
 * @throws When the text is not valid JSON or does not match the schema
 */
const parseMcpServerConfig = (json: string): McpServerConfig => {
  const candidate: unknown = JSON.parse(json)
  return McpServerSchema.parse(candidate)
}
234
+
235
/**
 * Load prompts from JSONL file
 *
 * @remarks
 * Blank and whitespace-only lines are skipped, and both LF and CRLF line
 * endings are accepted. Line numbers in error messages refer to the line's
 * position in the file itself, not its position among the non-blank lines.
 *
 * @param path - Path to the JSONL file, one prompt case per line
 * @returns The validated prompt cases in file order
 * @throws Error naming the 1-based file line that is not valid JSON or fails `PromptCaseSchema`
 */
const loadPrompts = async (path: string): Promise<PromptCase[]> => {
  const content = await Bun.file(path).text()
  const prompts: PromptCase[] = []

  for (const [index, line] of content.split(/\r?\n/).entries()) {
    if (line.trim() === '') continue
    try {
      prompts.push(PromptCaseSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : String(error)}`)
    }
  }

  return prompts
}
250
+
251
/**
 * Extract trajectory from session notifications
 *
 * @remarks
 * Text thought/message chunks and plans each become one step. Tool calls are
 * correlated by `toolCallId`: the first notification for an ID creates the
 * step, and later `tool_call` or `tool_call_update` notifications for the same
 * ID update its status, output and duration in place. An update carries only
 * the fields that changed, so a missing status leaves the recorded one alone.
 *
 * NOTE(review): `timestamp` is taken when this function runs, not when the
 * notification arrived. When it is called after the prompt has completed (as
 * `main` does), every step gets nearly the same offset and durations are
 * close to zero. Real timing needs the arrival time captured at collection.
 *
 * @param notifications - Session updates collected for one prompt, in arrival order
 * @param startTime - `Date.now()` value the timestamps are made relative to
 * @returns Steps in the order their first notification appeared
 */
const extractTrajectory = (notifications: SessionNotification[], startTime: number): TrajectoryStep[] => {
  type ToolCallStep = TrajectoryStep & { type: 'tool_call' }
  // Shape shared by `tool_call` and `tool_call_update`: only the ID is guaranteed on an update
  type ToolCallFields = Pick<ToolCall, 'toolCallId'> & Partial<Omit<ToolCall, 'toolCallId'>>

  const trajectory: TrajectoryStep[] = []
  const toolCallMap = new Map<string, { start: number; step: ToolCallStep }>()

  for (const { update } of notifications) {
    const timestamp = Date.now() - startTime

    if (update.sessionUpdate === 'agent_thought_chunk' && update.content.type === 'text') {
      trajectory.push({ type: 'thought', content: update.content.text, timestamp })
    } else if (update.sessionUpdate === 'agent_message_chunk' && update.content.type === 'text') {
      trajectory.push({ type: 'message', content: update.content.text, timestamp })
    } else if (update.sessionUpdate === 'tool_call' || update.sessionUpdate === 'tool_call_update') {
      const toolCall = update as ToolCallFields
      const existing = toolCallMap.get(toolCall.toolCallId)

      if (existing) {
        const { step } = existing
        // Keep the previous status when this update does not carry one
        if (toolCall.status) {
          step.status = toolCall.status
        }
        if (toolCall.content) {
          step.output = toolCall.content
        }
        // Raw output wins over content; compare against null so 0, '' and false are kept
        if (toolCall.rawOutput != null) {
          step.output = toolCall.rawOutput
        }
        step.duration = timestamp - existing.start
      } else {
        // First sighting of this ID; an update without a title falls back to the ID
        const step: ToolCallStep = {
          type: 'tool_call',
          name: toolCall.title ?? toolCall.toolCallId,
          status: toolCall.status ?? 'pending',
          input: toolCall.rawInput,
          timestamp,
        }
        toolCallMap.set(toolCall.toolCallId, { start: timestamp, step })
        trajectory.push(step)
      }
    } else if (update.sessionUpdate === 'plan') {
      trajectory.push({ type: 'plan', entries: update.entries, timestamp })
    }
  }

  return trajectory
}
309
+
310
/**
 * Build the agent's text output from a trajectory.
 *
 * @param trajectory - Steps for one prompt
 * @returns The content of every `message` step, in order, joined with newlines
 */
const extractOutput = (trajectory: TrajectoryStep[]): string => {
  const messages: string[] = []
  for (const step of trajectory) {
    if (step.type === 'message') {
      messages.push(step.content)
    }
  }
  return messages.join('\n')
}
317
+
318
/**
 * Report whether the trajectory contains a failed tool call.
 *
 * @param trajectory - Steps for one prompt
 * @returns `true` as soon as a `tool_call` step with status `'failed'` is found
 */
const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => {
  for (const step of trajectory) {
    if (step.type !== 'tool_call') continue
    if (step.status === 'failed') return true
  }
  return false
}
322
+
323
/** Head/tail preview configuration */
const HEAD_LINES = 8
const TAIL_LINES = 4
/** Content longer than this many characters is shortened to a head/tail preview */
const MAX_CONTENT_LENGTH = 500

/**
 * Extract head and tail lines from content
 *
 * @remarks
 * Content with no more lines than `headLines + tailLines` is returned
 * unchanged. Otherwise the leading and trailing lines are kept and the middle
 * is replaced by a marker stating how many lines were dropped. Negative counts
 * are treated as zero, and a zero count omits that section entirely. Without
 * this guard a zero tail would return the whole content, since `slice(-0)` is
 * `slice(0)`.
 *
 * @param content - Text to preview
 * @param headLines - Number of leading lines to keep
 * @param tailLines - Number of trailing lines to keep
 * @returns The original content or the shortened preview
 */
const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => {
  const headCount = Math.max(0, headLines)
  const tailCount = Math.max(0, tailLines)
  const lines = content.split('\n')
  if (lines.length <= headCount + tailCount) {
    return content
  }

  const omitted = lines.length - headCount - tailCount
  const sections: string[] = []
  if (headCount > 0) {
    sections.push(lines.slice(0, headCount).join('\n'))
  }
  sections.push(`// ... ${omitted} lines omitted ...`)
  if (tailCount > 0) {
    sections.push(lines.slice(lines.length - tailCount).join('\n'))
  }
  return sections.join('\n\n')
}
339
+
340
/**
 * Look up the target file path in a tool call's input.
 *
 * @param input - Raw tool input of unknown shape
 * @returns `file_path` if set, otherwise `path`; `undefined` when neither is
 * present or the input does not match `ToolInputSchema`
 */
const extractFilePath = (input: unknown): string | undefined => {
  const parsed = ToolInputSchema.safeParse(input)
  return parsed.success ? (parsed.data.file_path ?? parsed.data.path) : undefined
}
346
+
347
/**
 * Look up the written text in a tool call's input.
 *
 * @param input - Raw tool input of unknown shape
 * @returns `content` if set, otherwise `new_string`; `undefined` when neither
 * is present or the input does not match `ToolInputSchema`
 */
const extractContent = (input: unknown): string | undefined => {
  const parsed = ToolInputSchema.safeParse(input)
  return parsed.success ? (parsed.data.content ?? parsed.data.new_string) : undefined
}
353
+
354
/**
 * Serialize a result as one summary JSONL line.
 *
 * @param result - Full result for one prompt
 * @returns JSON text with id, input, output, tool call names, status and
 * duration (`timing.end - timing.start`), in that key order
 */
const formatSummary = (result: FullResult): string => {
  const toolCalls: string[] = []
  for (const step of result.trajectory) {
    // Narrowing on the discriminant gives typed access to `name`
    if (step.type === 'tool_call') {
      toolCalls.push(step.name)
    }
  }

  const { id, input, output, status, timing } = result
  const summary: SummaryResult = {
    id,
    input,
    output,
    toolCalls,
    status,
    duration: timing.end - timing.start,
  }
  return JSON.stringify(summary)
}
366
+
367
/**
 * Format result as judge markdown with step IDs
 *
 * @remarks
 * Each trajectory step becomes one numbered line ending in `[→<id>-step-<n>]`.
 * `n` is the step's 1-based position in the trajectory, the same numbering
 * `addStepIds` uses, so the markdown can be correlated with the full JSONL.
 * Tool calls whose input carries a file path and/or content also get a
 * `File:` line and a fenced preview; content longer than `MAX_CONTENT_LENGTH`
 * is shortened with `headTailPreview`.
 *
 * The fence language is the extension of the file *name*, so directories
 * containing dots or extension-less files never leak a path into the fence.
 * Metadata values that are objects or arrays are rendered as JSON rather than
 * `[object Object]`.
 *
 * @param result - Full result for one prompt
 * @returns Markdown section terminated by a `---` rule
 */
const formatJudgeMarkdown = (result: FullResult): string => {
  /** Cut text to `max` characters, appending `...` when anything was removed */
  const clip = (text: string, max: number): string => (text.length > max ? `${text.slice(0, max)}...` : text)

  /** Choose the code-fence language for a tool call's content */
  const fenceLanguage = (filePath: string | undefined): string => {
    if (filePath === undefined) return 'typescript'
    const fileName = filePath.split(/[\\/]/).pop() ?? ''
    const dot = fileName.lastIndexOf('.')
    // `dot > 0` also excludes dotfiles such as `.gitignore`
    return dot > 0 ? fileName.slice(dot + 1) : ''
  }

  /** Render one metadata value on a single line */
  const formatValue = (value: unknown): string =>
    typeof value === 'object' && value !== null ? JSON.stringify(value) : String(value)

  const lines: string[] = [
    `## Evaluation Record: ${result.id}`,
    '',
    `**Input:** ${result.input}`,
    '',
    '**Trajectory:**',
  ]

  result.trajectory.forEach((step, index) => {
    const stepNum = index + 1
    const stepId = `${result.id}-step-${stepNum}`

    switch (step.type) {
      case 'thought':
        lines.push(`${stepNum}. [THOUGHT] ${clip(step.content, 100)} [→${stepId}]`)
        break
      case 'message':
        lines.push(`${stepNum}. [MESSAGE] ${clip(step.content, 100)} [→${stepId}]`)
        break
      case 'plan': {
        const planSummary = step.entries.map((e) => `${e.content}: ${e.status}`).join(', ')
        lines.push(`${stepNum}. [PLAN] ${clip(planSummary, 80)} [→${stepId}]`)
        break
      }
      case 'tool_call': {
        const duration = step.duration ? ` (${step.duration}ms)` : ''
        const filePath = extractFilePath(step.input)
        const content = extractContent(step.input)

        lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)

        // Add file path if present
        if (filePath) {
          const charCount = content?.length ?? 0
          lines.push(` File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
        }

        // Add head/tail preview for content-producing tools
        if (content && content.length > 0) {
          const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content) : content
          lines.push(` \`\`\`${fenceLanguage(filePath)}`)
          lines.push(` ${preview.split('\n').join('\n ')}`)
          lines.push(` \`\`\``)
        }
        break
      }
    }
  })

  const metadataStr = Object.entries(result.metadata)
    .map(([k, v]) => `${k}=${formatValue(v)}`)
    .join(', ')

  lines.push(
    '',
    `**Output:** ${clip(result.output, 200)}`,
    '',
    `**Metadata:** ${metadataStr}`,
    `**Status:** ${result.status}`,
    `**Duration:** ${result.timing.end - result.timing.start}ms`,
    '',
    '---',
    '',
  )

  return lines.join('\n')
}
440
+
441
/**
 * Tag every trajectory step with a `stepId` of the form `<result id>-step-<n>`.
 *
 * @param result - Full result for one prompt; not modified
 * @returns A shallow copy of the result whose trajectory holds copied steps,
 * numbered from 1 in trajectory order
 */
const addStepIds = (result: FullResult): FullResult & { trajectory: IndexedStep[] } => {
  const trajectory = result.trajectory.map((step, position) => {
    const stepId = `${result.id}-step-${position + 1}`
    return { ...step, stepId }
  })
  return { ...result, trajectory }
}
450
+
451
/**
 * Render a result in the requested format.
 *
 * @param result - Full result for one prompt
 * @param format - `'summary'` for a JSONL line; any other value yields judge markdown
 * @returns The formatted text
 */
const formatResult = (result: FullResult, format: OutputFormat): string =>
  format === 'summary' ? formatSummary(result) : formatJudgeMarkdown(result)
459
+
460
/**
 * Serialize the complete result, with step IDs added, as one JSONL line.
 * This is the companion file written alongside the judge markdown.
 *
 * @param result - Full result for one prompt
 * @returns JSON text of the result with an indexed trajectory
 */
const formatFullWithStepIds = (result: FullResult): string => {
  const indexed = addStepIds(result)
  return JSON.stringify(indexed)
}
464
+
465
/**
 * Emit one line of output followed by a newline.
 *
 * @param line - Text to write
 * @param outputPath - Destination file; when omitted the line goes to stdout
 * @param append - With a file destination, add to the file instead of replacing its contents
 */
const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(line)
    return
  }

  const payload = `${line}\n`
  if (append) {
    await appendFile(outputPath, payload)
    return
  }
  await Bun.write(outputPath, payload)
}
478
+
479
/**
 * Report progress on stderr so stdout stays reserved for results.
 *
 * @param message - Text to print
 * @param showProgress - When `false` nothing is printed
 */
const logProgress = (message: string, showProgress: boolean): void => {
  if (!showProgress) return
  console.error(message)
}
485
+
486
/**
 * Resolve path relative to process.cwd()
 *
 * @remarks
 * Delegates to `node:path` `resolve`, which recognises absolute paths using
 * the platform's rules, normalises `.`/`..` segments, and never produces a
 * doubled separator when the working directory is the filesystem root.
 *
 * @param path - Absolute path, or path relative to the current working directory
 * @returns The absolute, normalised path
 */
const resolvePath = (path: string): string => resolve(process.cwd(), path)
491
+
492
+ // ============================================================================
493
+ // Main
494
+ // ============================================================================
495
+
496
/**
 * CLI entry point: validate options, connect to the agent, run every prompt
 * sequentially in one session, and write one result per prompt.
 *
 * @remarks
 * - Output files are truncated once up front unless `--append` is set, and
 *   every result is then appended. With `--append`, existing file content is
 *   preserved, including for the first result.
 * - `--timeout` must parse to a positive number of milliseconds.
 * - `--format judge` writes `<output>.md` and `<output>.full.jsonl` and
 *   therefore requires `--output`.
 * - A prompt that throws is recorded as `timeout` when the error message
 *   mentions a timeout, otherwise as `error`; the run continues with the next
 *   prompt. The client is always disconnected, even when setup fails.
 */
const main = async () => {
  const promptsPath = positionals[0]
  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  const agentCommand = parseCommand(values.cmd ?? values.command ?? 'claude-code-acp')
  const agentLabel = agentCommand.join(' ')
  const outputPath = values.output
  const cwd = values.cwd
  const showProgress = values.progress ?? false
  const appendOutput = values.append ?? false

  // Validate timeout (a non-numeric value would otherwise reach the client as NaN)
  const timeout = Number.parseInt(values.timeout ?? '60000', 10)
  if (!Number.isFinite(timeout) || timeout <= 0) {
    console.error(`Error: Invalid timeout "${values.timeout}". Must be a positive number of milliseconds`)
    process.exit(1)
  }

  // Validate format before treating it as an OutputFormat
  const requestedFormat = values.format ?? 'summary'
  if (requestedFormat !== 'summary' && requestedFormat !== 'judge') {
    console.error(`Error: Invalid format "${requestedFormat}". Must be: summary, judge`)
    process.exit(1)
  }
  const format: OutputFormat = requestedFormat

  // Judge format requires output path (creates two files)
  if (format === 'judge' && !outputPath) {
    console.error('Error: --format judge requires --output <path> (creates <path>.md and <path>.full.jsonl)')
    process.exit(1)
  }

  // Parse MCP server configurations (already SDK-compatible format)
  const mcpServers = (values['mcp-server'] ?? []).map((json) => parseMcpServerConfig(json))

  // Load prompts
  const prompts = await loadPrompts(promptsPath)

  // Resolve output path relative to process.cwd()
  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined

  // Judge format writes two files derived from the output path
  const judgePaths =
    format === 'judge' && resolvedOutputPath
      ? { markdown: `${resolvedOutputPath}.md`, full: `${resolvedOutputPath}.full.jsonl` }
      : undefined

  // Log progress info
  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, showProgress)
  logProgress(`Command: ${agentLabel}`, showProgress)
  logProgress(`Format: ${format}`, showProgress)
  if (judgePaths) {
    logProgress(`Output: ${judgePaths.markdown} + ${judgePaths.full}`, showProgress)
  } else if (resolvedOutputPath) {
    logProgress(`Output: ${resolvedOutputPath}`, showProgress)
  }
  if (mcpServers.length > 0) {
    logProgress(`MCP Servers: ${mcpServers.map((s) => s.name).join(', ')}`, showProgress)
  }

  // Create ACP client
  const client = createACPClient({
    command: agentCommand,
    cwd,
    timeout,
  })

  // Clear output file(s) if not appending; every later write can then append
  if (!appendOutput) {
    const targets = judgePaths
      ? [judgePaths.markdown, judgePaths.full]
      : resolvedOutputPath
        ? [resolvedOutputPath]
        : []
    for (const target of targets) {
      await Bun.write(target, '')
    }
  }

  // Session params with MCP servers
  const sessionParams = {
    cwd: cwd ?? process.cwd(),
    mcpServers,
  }

  try {
    logProgress('Connecting to agent...', showProgress)
    await client.connect()
    logProgress('Connected!', showProgress)

    // Create session with MCP servers
    const session = await client.createSession(sessionParams)
    logProgress(`Session: ${session.id}`, showProgress)

    // Run evaluations sequentially
    for (const [index, promptCase] of prompts.entries()) {
      logProgress(
        `[${index + 1}/${prompts.length}] ${promptCase.id}: ${promptCase.input.slice(0, 50)}...`,
        showProgress,
      )

      const startTime = Date.now()
      let result: FullResult

      try {
        const prompt = createPrompt(promptCase.input)
        const { updates } = await client.promptSync(session.id, prompt)

        const endTime = Date.now()
        const trajectory = extractTrajectory(updates, startTime)

        result = {
          id: promptCase.id,
          input: promptCase.input,
          output: extractOutput(trajectory),
          ...(promptCase.expected && { expected: promptCase.expected }),
          trajectory,
          metadata: {
            ...promptCase.metadata,
            agent: agentLabel,
          },
          timing: {
            start: startTime,
            end: endTime,
            // Undefined for an empty trajectory
            firstResponse: trajectory[0]?.timestamp,
          },
          status: hasToolErrors(trajectory) ? 'failed' : 'passed',
        }
      } catch (error) {
        const endTime = Date.now()
        const message = error instanceof Error ? error.message : String(error)
        const isTimeout = message.includes('timeout') || message.includes('timed out')

        result = {
          id: promptCase.id,
          input: promptCase.input,
          output: '',
          trajectory: [],
          metadata: {
            ...promptCase.metadata,
            agent: agentLabel,
          },
          timing: {
            start: startTime,
            end: endTime,
          },
          status: isTimeout ? 'timeout' : 'error',
          errors: [message],
        }
      }

      // Format and output result. Always append: the files were already
      // truncated above unless --append was requested.
      if (judgePaths) {
        // Judge format: write markdown to .md, full JSONL to .full.jsonl
        await writeOutput(formatJudgeMarkdown(result), judgePaths.markdown, true)
        await writeOutput(formatFullWithStepIds(result), judgePaths.full, true)
      } else {
        // Summary format: write to single file (or stdout when no path was given)
        await writeOutput(formatResult(result, format), resolvedOutputPath, true)
      }

      const statusIcon = result.status === 'passed' ? '✓' : result.status === 'failed' ? '✗' : '!'
      logProgress(` ${statusIcon} ${result.status} (${result.timing.end - result.timing.start}ms)`, showProgress)
    }
  } finally {
    logProgress('Disconnecting...', showProgress)
    await client.disconnect()
  }

  logProgress('Done!', showProgress)
}

// Any error escaping main (bad arguments, unreadable prompts, connection failure) ends the process with exit code 1
main().catch((error) => {
  console.error('Error:', error)
  process.exit(1)
})