@plaited/acp-harness 0.2.6 → 0.3.1
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +120 -16
- package/bin/cli.ts +105 -636
- package/bin/tests/cli.spec.ts +218 -51
- package/package.json +20 -4
- package/src/acp-client.ts +5 -4
- package/src/acp-transport.ts +14 -7
- package/src/adapter-check.ts +542 -0
- package/src/adapter-scaffold.ts +934 -0
- package/src/balance.ts +232 -0
- package/src/calibrate.ts +300 -0
- package/src/capture.ts +457 -0
- package/src/constants.ts +94 -0
- package/src/grader-loader.ts +174 -0
- package/src/harness.ts +35 -0
- package/src/schemas-cli.ts +239 -0
- package/src/schemas.ts +567 -0
- package/src/summarize.ts +245 -0
- package/src/tests/adapter-check.spec.ts +70 -0
- package/src/tests/adapter-scaffold.spec.ts +112 -0
- package/src/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/tests/fixtures/grader-exec.py +29 -0
- package/src/tests/fixtures/grader-module.ts +14 -0
- package/src/tests/grader-loader.spec.ts +153 -0
- package/src/trials.ts +395 -0
- package/src/validate-refs.ts +188 -0
- package/.claude/rules/accuracy.md +0 -43
- package/.claude/rules/bun-apis.md +0 -80
- package/.claude/rules/code-review.md +0 -254
- package/.claude/rules/git-workflow.md +0 -37
- package/.claude/rules/github.md +0 -154
- package/.claude/rules/testing.md +0 -172
- package/.claude/skills/acp-harness/SKILL.md +0 -310
- package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
- package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
- package/.claude/skills/acp-harness/references/downstream.md +0 -288
- package/.claude/skills/acp-harness/references/output-formats.md +0 -221
- package/.claude-plugin/marketplace.json +0 -15
- package/.claude-plugin/plugin.json +0 -16
- package/.github/CODEOWNERS +0 -6
- package/.github/workflows/ci.yml +0 -63
- package/.github/workflows/publish.yml +0 -146
- package/.mcp.json +0 -20
- package/CLAUDE.md +0 -92
- package/Dockerfile.test +0 -23
- package/biome.json +0 -96
- package/bun.lock +0 -513
- package/docker-compose.test.yml +0 -21
- package/scripts/bun-test-wrapper.sh +0 -46
- package/src/acp.constants.ts +0 -56
- package/src/acp.schemas.ts +0 -161
- package/src/acp.types.ts +0 -28
- package/src/tests/fixtures/.claude/settings.local.json +0 -8
- package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
- package/tsconfig.json +0 -32
package/bin/cli.ts
CHANGED
|
@@ -1,670 +1,139 @@
|
|
|
1
1
|
#!/usr/bin/env bun
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
*
|
|
4
|
+
* ACP Harness CLI - Agent evaluation toolkit.
|
|
5
5
|
*
|
|
6
6
|
* @remarks
|
|
7
|
-
*
|
|
8
|
-
* runs evaluation prompts, capturing full trajectories for analysis.
|
|
7
|
+
* Router for harness commands. Thin wrapper that delegates to command modules.
|
|
9
8
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
9
|
+
* Commands:
|
|
10
|
+
* - capture: Core trajectory capture
|
|
11
|
+
* - trials: Multi-run pass@k/pass^k analysis
|
|
12
|
+
* - summarize: Derive compact views from results
|
|
13
|
+
* - calibrate: Sample failures for grader review
|
|
14
|
+
* - validate-refs: Check reference solutions
|
|
15
|
+
* - balance: Analyze test set coverage
|
|
16
|
+
* - schemas: Export JSON schemas for non-TS users
|
|
17
|
+
* - adapter:scaffold: Scaffold new ACP adapter project
|
|
18
|
+
* - adapter:check: Validate adapter ACP compliance
|
|
12
19
|
*/
|
|
13
20
|
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
import
|
|
17
|
-
import {
|
|
18
|
-
import {
|
|
21
|
+
import { adapterCheck } from '../src/adapter-check.ts'
|
|
22
|
+
import { adapterScaffold } from '../src/adapter-scaffold.ts'
|
|
23
|
+
import { balance } from '../src/balance.ts'
|
|
24
|
+
import { calibrate } from '../src/calibrate.ts'
|
|
25
|
+
import { capture } from '../src/capture.ts'
|
|
26
|
+
import { schemasCli } from '../src/schemas-cli.ts'
|
|
27
|
+
import { summarize } from '../src/summarize.ts'
|
|
28
|
+
import { trials } from '../src/trials.ts'
|
|
29
|
+
import { validateRefs } from '../src/validate-refs.ts'
|
|
19
30
|
|
|
20
|
-
|
|
21
|
-
// Schemas (SDK-compatible MCP server format)
|
|
22
|
-
// ============================================================================
|
|
31
|
+
const [command, ...args] = Bun.argv.slice(2)
|
|
23
32
|
|
|
24
|
-
const
|
|
25
|
-
name: z.string(),
|
|
26
|
-
value: z.string(),
|
|
27
|
-
})
|
|
28
|
-
|
|
29
|
-
const HttpHeaderSchema = z.object({
|
|
30
|
-
name: z.string(),
|
|
31
|
-
value: z.string(),
|
|
32
|
-
})
|
|
33
|
-
|
|
34
|
-
const McpServerStdioSchema = z.object({
|
|
35
|
-
type: z.literal('stdio').optional(),
|
|
36
|
-
name: z.string(),
|
|
37
|
-
command: z.string(),
|
|
38
|
-
args: z.array(z.string()),
|
|
39
|
-
env: z.array(EnvVariableSchema),
|
|
40
|
-
})
|
|
41
|
-
|
|
42
|
-
const McpServerHttpSchema = z.object({
|
|
43
|
-
type: z.literal('http'),
|
|
44
|
-
name: z.string(),
|
|
45
|
-
url: z.string(),
|
|
46
|
-
headers: z.array(HttpHeaderSchema),
|
|
47
|
-
})
|
|
48
|
-
|
|
49
|
-
const McpServerSchema = z.union([McpServerStdioSchema, McpServerHttpSchema])
|
|
50
|
-
|
|
51
|
-
const PromptCaseSchema = z.object({
|
|
52
|
-
id: z.string(),
|
|
53
|
-
input: z.string(),
|
|
54
|
-
expected: z.string().optional(),
|
|
55
|
-
metadata: z.record(z.string(), z.unknown()).optional(),
|
|
56
|
-
timeout: z.number().optional(),
|
|
57
|
-
})
|
|
58
|
-
|
|
59
|
-
const ToolInputSchema = z
|
|
60
|
-
.object({
|
|
61
|
-
file_path: z.string().optional(),
|
|
62
|
-
path: z.string().optional(),
|
|
63
|
-
content: z.string().optional(),
|
|
64
|
-
new_string: z.string().optional(),
|
|
65
|
-
})
|
|
66
|
-
.passthrough()
|
|
67
|
-
|
|
68
|
-
// ============================================================================
|
|
69
|
-
// Types
|
|
70
|
-
// ============================================================================
|
|
71
|
-
|
|
72
|
-
type McpServerConfig = z.infer<typeof McpServerSchema>
|
|
73
|
-
type PromptCase = z.infer<typeof PromptCaseSchema>
|
|
74
|
-
|
|
75
|
-
/** Trajectory step types */
|
|
76
|
-
type TrajectoryStep =
|
|
77
|
-
| { type: 'thought'; content: string; timestamp: number }
|
|
78
|
-
| { type: 'message'; content: string; timestamp: number }
|
|
79
|
-
| {
|
|
80
|
-
type: 'tool_call'
|
|
81
|
-
name: string
|
|
82
|
-
status: string
|
|
83
|
-
input?: unknown
|
|
84
|
-
output?: unknown
|
|
85
|
-
duration?: number
|
|
86
|
-
timestamp: number
|
|
87
|
-
}
|
|
88
|
-
| { type: 'plan'; entries: PlanEntry[]; timestamp: number }
|
|
89
|
-
|
|
90
|
-
/** Full output format */
|
|
91
|
-
type FullResult = {
|
|
92
|
-
id: string
|
|
93
|
-
input: string
|
|
94
|
-
output: string
|
|
95
|
-
expected?: string
|
|
96
|
-
trajectory: TrajectoryStep[]
|
|
97
|
-
metadata: Record<string, unknown>
|
|
98
|
-
timing: {
|
|
99
|
-
start: number
|
|
100
|
-
end: number
|
|
101
|
-
firstResponse?: number
|
|
102
|
-
}
|
|
103
|
-
status: 'passed' | 'failed' | 'error' | 'timeout'
|
|
104
|
-
errors?: string[]
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
/** Summary output format */
|
|
108
|
-
type SummaryResult = {
|
|
109
|
-
id: string
|
|
110
|
-
input: string
|
|
111
|
-
output: string
|
|
112
|
-
toolCalls: string[]
|
|
113
|
-
status: 'passed' | 'failed' | 'error' | 'timeout'
|
|
114
|
-
duration: number
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
type OutputFormat = 'summary' | 'judge'
|
|
118
|
-
|
|
119
|
-
/** Step with unique ID for judge format correlation */
|
|
120
|
-
type IndexedStep = TrajectoryStep & { stepId: string }
|
|
121
|
-
|
|
122
|
-
// ============================================================================
|
|
123
|
-
// Argument Parsing
|
|
124
|
-
// ============================================================================
|
|
125
|
-
|
|
126
|
-
const { values, positionals } = parseArgs({
|
|
127
|
-
args: Bun.argv.slice(2),
|
|
128
|
-
options: {
|
|
129
|
-
command: {
|
|
130
|
-
type: 'string',
|
|
131
|
-
},
|
|
132
|
-
cmd: {
|
|
133
|
-
type: 'string',
|
|
134
|
-
},
|
|
135
|
-
output: {
|
|
136
|
-
type: 'string',
|
|
137
|
-
short: 'o',
|
|
138
|
-
},
|
|
139
|
-
cwd: {
|
|
140
|
-
type: 'string',
|
|
141
|
-
short: 'c',
|
|
142
|
-
},
|
|
143
|
-
timeout: {
|
|
144
|
-
type: 'string',
|
|
145
|
-
short: 't',
|
|
146
|
-
default: '60000',
|
|
147
|
-
},
|
|
148
|
-
format: {
|
|
149
|
-
type: 'string',
|
|
150
|
-
short: 'f',
|
|
151
|
-
default: 'summary',
|
|
152
|
-
},
|
|
153
|
-
progress: {
|
|
154
|
-
type: 'boolean',
|
|
155
|
-
default: false,
|
|
156
|
-
},
|
|
157
|
-
append: {
|
|
158
|
-
type: 'boolean',
|
|
159
|
-
default: false,
|
|
160
|
-
},
|
|
161
|
-
'mcp-server': {
|
|
162
|
-
type: 'string',
|
|
163
|
-
multiple: true,
|
|
164
|
-
},
|
|
165
|
-
help: {
|
|
166
|
-
type: 'boolean',
|
|
167
|
-
short: 'h',
|
|
168
|
-
},
|
|
169
|
-
},
|
|
170
|
-
allowPositionals: true,
|
|
171
|
-
})
|
|
172
|
-
|
|
173
|
-
if (values.help || positionals.length === 0) {
|
|
33
|
+
const printHelp = () => {
|
|
174
34
|
// biome-ignore lint/suspicious/noConsole: CLI help output
|
|
175
35
|
console.log(`
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
Arguments:
|
|
179
|
-
prompts.jsonl Input file with evaluation prompts
|
|
36
|
+
acp-harness - CLI tool for agent evaluation
|
|
180
37
|
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
38
|
+
Commands:
|
|
39
|
+
capture Capture trajectories from ACP agent
|
|
40
|
+
trials Run prompts multiple times for pass@k/pass^k metrics
|
|
41
|
+
summarize Derive compact views from results
|
|
42
|
+
calibrate Sample failures for grader review
|
|
43
|
+
validate-refs Check reference solutions against grader
|
|
44
|
+
balance Analyze test set coverage
|
|
45
|
+
schemas Export JSON schemas for non-TypeScript users
|
|
46
|
+
adapter:scaffold Scaffold a new ACP adapter project
|
|
47
|
+
adapter:check Validate adapter ACP compliance
|
|
191
48
|
|
|
192
|
-
|
|
193
|
-
{"id":"test-001","input":"Create a button","expected":"should contain <button>","metadata":{"category":"ui"}}
|
|
194
|
-
|
|
195
|
-
Output Formats:
|
|
196
|
-
summary - Minimal JSONL: id, input, output, toolCalls, status, duration
|
|
197
|
-
judge - Two-tier output:
|
|
198
|
-
1. Markdown with step IDs and head/tail previews → <output>.md
|
|
199
|
-
2. Full trajectory JSONL for reference → <output>.full.jsonl
|
|
49
|
+
Run 'acp-harness <command> --help' for command-specific help.
|
|
200
50
|
|
|
201
51
|
Examples:
|
|
202
|
-
#
|
|
203
|
-
acp-harness prompts.jsonl -o results.jsonl
|
|
52
|
+
# Basic capture
|
|
53
|
+
acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
|
|
204
54
|
|
|
205
|
-
#
|
|
206
|
-
acp-harness prompts.jsonl
|
|
55
|
+
# With grader
|
|
56
|
+
acp-harness capture prompts.jsonl bunx claude-code-acp --grader ./grader.ts -o results.jsonl
|
|
207
57
|
|
|
208
|
-
#
|
|
209
|
-
acp-harness prompts.jsonl --
|
|
58
|
+
# Multi-run trials
|
|
59
|
+
acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 --grader ./grader.ts -o trials.jsonl
|
|
210
60
|
|
|
211
|
-
#
|
|
212
|
-
acp-harness
|
|
61
|
+
# Derive summary view
|
|
62
|
+
acp-harness summarize results.jsonl -o summary.jsonl
|
|
213
63
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
ANTHROPIC_API_KEY=sk-... acp-harness prompts.jsonl -o results.jsonl
|
|
217
|
-
`)
|
|
218
|
-
process.exit(values.help ? 0 : 1)
|
|
219
|
-
}
|
|
64
|
+
# Export schemas
|
|
65
|
+
acp-harness schemas --json -o schemas.json
|
|
220
66
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
// ============================================================================
|
|
224
|
-
|
|
225
|
-
/** Parse command string into command array */
|
|
226
|
-
const parseCommand = (cmd: string): string[] => {
|
|
227
|
-
return cmd.split(/\s+/).filter(Boolean)
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
/** Parse MCP server config from JSON string (SDK-compatible format) */
|
|
231
|
-
const parseMcpServerConfig = (json: string): McpServerConfig => {
|
|
232
|
-
return McpServerSchema.parse(JSON.parse(json))
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
/** Load prompts from JSONL file */
|
|
236
|
-
const loadPrompts = async (path: string): Promise<PromptCase[]> => {
|
|
237
|
-
const content = await Bun.file(path).text()
|
|
238
|
-
return content
|
|
239
|
-
.trim()
|
|
240
|
-
.split('\n')
|
|
241
|
-
.filter(Boolean)
|
|
242
|
-
.map((line, index) => {
|
|
243
|
-
try {
|
|
244
|
-
return PromptCaseSchema.parse(JSON.parse(line))
|
|
245
|
-
} catch (error) {
|
|
246
|
-
throw new Error(`Invalid prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
|
|
247
|
-
}
|
|
248
|
-
})
|
|
249
|
-
}
|
|
250
|
-
|
|
251
|
-
/** Extract trajectory from session notifications */
|
|
252
|
-
const extractTrajectory = (notifications: SessionNotification[], startTime: number): TrajectoryStep[] => {
|
|
253
|
-
const trajectory: TrajectoryStep[] = []
|
|
254
|
-
const toolCallMap = new Map<string, { start: number; step: TrajectoryStep & { type: 'tool_call' } }>()
|
|
255
|
-
|
|
256
|
-
for (const notification of notifications) {
|
|
257
|
-
const timestamp = Date.now() - startTime
|
|
258
|
-
const update = notification.update
|
|
259
|
-
|
|
260
|
-
if (update.sessionUpdate === 'agent_thought_chunk' && update.content.type === 'text') {
|
|
261
|
-
trajectory.push({
|
|
262
|
-
type: 'thought',
|
|
263
|
-
content: update.content.text,
|
|
264
|
-
timestamp,
|
|
265
|
-
})
|
|
266
|
-
} else if (update.sessionUpdate === 'agent_message_chunk' && update.content.type === 'text') {
|
|
267
|
-
trajectory.push({
|
|
268
|
-
type: 'message',
|
|
269
|
-
content: update.content.text,
|
|
270
|
-
timestamp,
|
|
271
|
-
})
|
|
272
|
-
} else if (update.sessionUpdate === 'tool_call') {
|
|
273
|
-
const toolCall = update as ToolCall
|
|
274
|
-
const existing = toolCallMap.get(toolCall.toolCallId)
|
|
275
|
-
|
|
276
|
-
if (existing) {
|
|
277
|
-
// Update existing tool call with completion info
|
|
278
|
-
existing.step.status = toolCall.status ?? 'pending'
|
|
279
|
-
if (toolCall.content) {
|
|
280
|
-
existing.step.output = toolCall.content
|
|
281
|
-
}
|
|
282
|
-
if (toolCall.rawOutput) {
|
|
283
|
-
existing.step.output = toolCall.rawOutput
|
|
284
|
-
}
|
|
285
|
-
existing.step.duration = timestamp - existing.start
|
|
286
|
-
} else {
|
|
287
|
-
// New tool call
|
|
288
|
-
const step: TrajectoryStep & { type: 'tool_call' } = {
|
|
289
|
-
type: 'tool_call',
|
|
290
|
-
name: toolCall.title,
|
|
291
|
-
status: toolCall.status ?? 'pending',
|
|
292
|
-
input: toolCall.rawInput,
|
|
293
|
-
timestamp,
|
|
294
|
-
}
|
|
295
|
-
toolCallMap.set(toolCall.toolCallId, { start: timestamp, step })
|
|
296
|
-
trajectory.push(step)
|
|
297
|
-
}
|
|
298
|
-
} else if (update.sessionUpdate === 'plan') {
|
|
299
|
-
trajectory.push({
|
|
300
|
-
type: 'plan',
|
|
301
|
-
entries: update.entries,
|
|
302
|
-
timestamp,
|
|
303
|
-
})
|
|
304
|
-
}
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
return trajectory
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
/** Extract final text output from trajectory */
|
|
311
|
-
const extractOutput = (trajectory: TrajectoryStep[]): string => {
|
|
312
|
-
return trajectory
|
|
313
|
-
.filter((step): step is TrajectoryStep & { type: 'message' } => step.type === 'message')
|
|
314
|
-
.map((step) => step.content)
|
|
315
|
-
.join('\n')
|
|
316
|
-
}
|
|
317
|
-
|
|
318
|
-
/** Check if any tool calls failed */
|
|
319
|
-
const hasToolErrors = (trajectory: TrajectoryStep[]): boolean => {
|
|
320
|
-
return trajectory.some((step) => step.type === 'tool_call' && step.status === 'failed')
|
|
321
|
-
}
|
|
322
|
-
|
|
323
|
-
/** Head/tail preview configuration */
|
|
324
|
-
const HEAD_LINES = 8
|
|
325
|
-
const TAIL_LINES = 4
|
|
326
|
-
const MAX_CONTENT_LENGTH = 500
|
|
327
|
-
|
|
328
|
-
/** Extract head and tail lines from content */
|
|
329
|
-
const headTailPreview = (content: string, headLines = HEAD_LINES, tailLines = TAIL_LINES): string => {
|
|
330
|
-
const lines = content.split('\n')
|
|
331
|
-
if (lines.length <= headLines + tailLines) {
|
|
332
|
-
return content
|
|
333
|
-
}
|
|
334
|
-
const head = lines.slice(0, headLines).join('\n')
|
|
335
|
-
const tail = lines.slice(-tailLines).join('\n')
|
|
336
|
-
const omitted = lines.length - headLines - tailLines
|
|
337
|
-
return `${head}\n\n// ... ${omitted} lines omitted ...\n\n${tail}`
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
/** Extract file path from tool input if present */
|
|
341
|
-
const extractFilePath = (input: unknown): string | undefined => {
|
|
342
|
-
const result = ToolInputSchema.safeParse(input)
|
|
343
|
-
if (!result.success) return undefined
|
|
344
|
-
return result.data.file_path ?? result.data.path
|
|
345
|
-
}
|
|
346
|
-
|
|
347
|
-
/** Extract content from tool input if present */
|
|
348
|
-
const extractContent = (input: unknown): string | undefined => {
|
|
349
|
-
const result = ToolInputSchema.safeParse(input)
|
|
350
|
-
if (!result.success) return undefined
|
|
351
|
-
return result.data.content ?? result.data.new_string
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
/** Format result as summary JSONL */
|
|
355
|
-
const formatSummary = (result: FullResult): string => {
|
|
356
|
-
const summary: SummaryResult = {
|
|
357
|
-
id: result.id,
|
|
358
|
-
input: result.input,
|
|
359
|
-
output: result.output,
|
|
360
|
-
toolCalls: result.trajectory.filter((s) => s.type === 'tool_call').map((s) => (s as { name: string }).name),
|
|
361
|
-
status: result.status,
|
|
362
|
-
duration: result.timing.end - result.timing.start,
|
|
363
|
-
}
|
|
364
|
-
return JSON.stringify(summary)
|
|
365
|
-
}
|
|
67
|
+
# Scaffold new adapter
|
|
68
|
+
acp-harness adapter:scaffold my-agent -o ./adapters/my-agent
|
|
366
69
|
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
const lines: string[] = [
|
|
370
|
-
`## Evaluation Record: ${result.id}`,
|
|
371
|
-
'',
|
|
372
|
-
`**Input:** ${result.input}`,
|
|
373
|
-
'',
|
|
374
|
-
'**Trajectory:**',
|
|
375
|
-
]
|
|
70
|
+
# Validate adapter compliance
|
|
71
|
+
acp-harness adapter:check bun ./my-adapter/src/index.ts
|
|
376
72
|
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
const stepId = `${result.id}-step-${stepNum}`
|
|
380
|
-
|
|
381
|
-
if (step.type === 'thought') {
|
|
382
|
-
const preview = step.content.slice(0, 100)
|
|
383
|
-
const truncated = step.content.length > 100 ? '...' : ''
|
|
384
|
-
lines.push(`${stepNum}. [THOUGHT] ${preview}${truncated} [→${stepId}]`)
|
|
385
|
-
stepNum++
|
|
386
|
-
} else if (step.type === 'tool_call') {
|
|
387
|
-
const duration = step.duration ? ` (${step.duration}ms)` : ''
|
|
388
|
-
const filePath = extractFilePath(step.input)
|
|
389
|
-
const content = extractContent(step.input)
|
|
390
|
-
|
|
391
|
-
lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)
|
|
392
|
-
|
|
393
|
-
// Add file path if present
|
|
394
|
-
if (filePath) {
|
|
395
|
-
const charCount = content?.length ?? 0
|
|
396
|
-
lines.push(` File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
|
|
397
|
-
}
|
|
398
|
-
|
|
399
|
-
// Add head/tail preview for content-producing tools
|
|
400
|
-
if (content && content.length > 0) {
|
|
401
|
-
const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content) : content
|
|
402
|
-
// Detect file extension for syntax highlighting
|
|
403
|
-
const ext = filePath?.split('.').pop() ?? 'typescript'
|
|
404
|
-
lines.push(` \`\`\`${ext}`)
|
|
405
|
-
lines.push(` ${preview.split('\n').join('\n ')}`)
|
|
406
|
-
lines.push(` \`\`\``)
|
|
407
|
-
}
|
|
408
|
-
stepNum++
|
|
409
|
-
} else if (step.type === 'plan') {
|
|
410
|
-
const planSummary = step.entries.map((e) => `${e.content}: ${e.status}`).join(', ')
|
|
411
|
-
const truncated = planSummary.length > 80 ? '...' : ''
|
|
412
|
-
lines.push(`${stepNum}. [PLAN] ${planSummary.slice(0, 80)}${truncated} [→${stepId}]`)
|
|
413
|
-
stepNum++
|
|
414
|
-
} else if (step.type === 'message') {
|
|
415
|
-
const preview = step.content.slice(0, 100)
|
|
416
|
-
const truncated = step.content.length > 100 ? '...' : ''
|
|
417
|
-
lines.push(`${stepNum}. [MESSAGE] ${preview}${truncated} [→${stepId}]`)
|
|
418
|
-
stepNum++
|
|
419
|
-
}
|
|
420
|
-
}
|
|
421
|
-
|
|
422
|
-
lines.push('')
|
|
423
|
-
const outputPreview = result.output.slice(0, 200)
|
|
424
|
-
const outputTruncated = result.output.length > 200 ? '...' : ''
|
|
425
|
-
lines.push(`**Output:** ${outputPreview}${outputTruncated}`)
|
|
426
|
-
lines.push('')
|
|
427
|
-
|
|
428
|
-
const metadataStr = Object.entries(result.metadata)
|
|
429
|
-
.map(([k, v]) => `${k}=${v}`)
|
|
430
|
-
.join(', ')
|
|
431
|
-
lines.push(`**Metadata:** ${metadataStr}`)
|
|
432
|
-
lines.push(`**Status:** ${result.status}`)
|
|
433
|
-
lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)
|
|
434
|
-
lines.push('')
|
|
435
|
-
lines.push('---')
|
|
436
|
-
lines.push('')
|
|
437
|
-
|
|
438
|
-
return lines.join('\n')
|
|
439
|
-
}
|
|
440
|
-
|
|
441
|
-
/** Add step IDs to trajectory for full JSONL output */
|
|
442
|
-
const addStepIds = (result: FullResult): FullResult & { trajectory: IndexedStep[] } => {
|
|
443
|
-
let stepNum = 1
|
|
444
|
-
const indexedTrajectory = result.trajectory.map((step) => ({
|
|
445
|
-
...step,
|
|
446
|
-
stepId: `${result.id}-step-${stepNum++}`,
|
|
447
|
-
}))
|
|
448
|
-
return { ...result, trajectory: indexedTrajectory }
|
|
449
|
-
}
|
|
450
|
-
|
|
451
|
-
/** Format result based on output format (returns markdown for judge, JSONL for summary) */
|
|
452
|
-
const formatResult = (result: FullResult, format: OutputFormat): string => {
|
|
453
|
-
if (format === 'summary') {
|
|
454
|
-
return formatSummary(result)
|
|
455
|
-
}
|
|
456
|
-
// Judge format returns markdown
|
|
457
|
-
return formatJudgeMarkdown(result)
|
|
458
|
-
}
|
|
459
|
-
|
|
460
|
-
/** Format result as full JSONL with step IDs (for judge format's paired file) */
|
|
461
|
-
const formatFullWithStepIds = (result: FullResult): string => {
|
|
462
|
-
return JSON.stringify(addStepIds(result))
|
|
463
|
-
}
|
|
464
|
-
|
|
465
|
-
/** Write output line (to stdout or file) */
|
|
466
|
-
const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
|
|
467
|
-
if (outputPath) {
|
|
468
|
-
if (append) {
|
|
469
|
-
await appendFile(outputPath, `${line}\n`)
|
|
470
|
-
} else {
|
|
471
|
-
await Bun.write(outputPath, `${line}\n`)
|
|
472
|
-
}
|
|
473
|
-
} else {
|
|
474
|
-
// biome-ignore lint/suspicious/noConsole: CLI stdout output
|
|
475
|
-
console.log(line)
|
|
476
|
-
}
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
/** Log progress to stderr (doesn't pollute stdout) */
|
|
480
|
-
const logProgress = (message: string, showProgress: boolean): void => {
|
|
481
|
-
if (showProgress) {
|
|
482
|
-
console.error(message)
|
|
483
|
-
}
|
|
484
|
-
}
|
|
485
|
-
|
|
486
|
-
/** Resolve path relative to process.cwd() */
|
|
487
|
-
const resolvePath = (path: string): string => {
|
|
488
|
-
if (path.startsWith('/')) return path
|
|
489
|
-
return `${process.cwd()}/${path}`
|
|
73
|
+
Documentation: https://github.com/plaited/acp-harness
|
|
74
|
+
`)
|
|
490
75
|
}
|
|
491
76
|
|
|
492
|
-
// ============================================================================
|
|
493
|
-
// Main
|
|
494
|
-
// ============================================================================
|
|
495
|
-
|
|
496
77
|
const main = async () => {
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
logProgress(`MCP Servers: ${mcpServers.map((s) => s.name).join(', ')}`, showProgress)
|
|
547
|
-
}
|
|
548
|
-
|
|
549
|
-
// Create ACP client
|
|
550
|
-
const client = createACPClient({
|
|
551
|
-
command: agentCommand,
|
|
552
|
-
cwd,
|
|
553
|
-
timeout,
|
|
554
|
-
})
|
|
555
|
-
|
|
556
|
-
// Clear output file(s) if not appending
|
|
557
|
-
if (resolvedOutputPath && !appendOutput) {
|
|
558
|
-
if (format === 'judge') {
|
|
559
|
-
await Bun.write(judgeMarkdownPath!, '')
|
|
560
|
-
await Bun.write(judgeFullPath!, '')
|
|
561
|
-
} else {
|
|
562
|
-
await Bun.write(resolvedOutputPath, '')
|
|
78
|
+
switch (command) {
|
|
79
|
+
case 'capture':
|
|
80
|
+
await capture(args)
|
|
81
|
+
break
|
|
82
|
+
|
|
83
|
+
case 'trials':
|
|
84
|
+
await trials(args)
|
|
85
|
+
break
|
|
86
|
+
|
|
87
|
+
case 'summarize':
|
|
88
|
+
await summarize(args)
|
|
89
|
+
break
|
|
90
|
+
|
|
91
|
+
case 'calibrate':
|
|
92
|
+
await calibrate(args)
|
|
93
|
+
break
|
|
94
|
+
|
|
95
|
+
case 'validate-refs':
|
|
96
|
+
await validateRefs(args)
|
|
97
|
+
break
|
|
98
|
+
|
|
99
|
+
case 'balance':
|
|
100
|
+
await balance(args)
|
|
101
|
+
break
|
|
102
|
+
|
|
103
|
+
case 'schemas':
|
|
104
|
+
await schemasCli(args)
|
|
105
|
+
break
|
|
106
|
+
|
|
107
|
+
case 'adapter:scaffold':
|
|
108
|
+
await adapterScaffold(args)
|
|
109
|
+
break
|
|
110
|
+
|
|
111
|
+
case 'adapter:check':
|
|
112
|
+
await adapterCheck(args)
|
|
113
|
+
break
|
|
114
|
+
|
|
115
|
+
case '-h':
|
|
116
|
+
case '--help':
|
|
117
|
+
case undefined:
|
|
118
|
+
printHelp()
|
|
119
|
+
break
|
|
120
|
+
|
|
121
|
+
case '-v':
|
|
122
|
+
case '--version': {
|
|
123
|
+
const { version } = await import('../package.json')
|
|
124
|
+
// biome-ignore lint/suspicious/noConsole: CLI version output
|
|
125
|
+
console.log(version)
|
|
126
|
+
break
|
|
563
127
|
}
|
|
564
|
-
}
|
|
565
128
|
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
129
|
+
default:
|
|
130
|
+
console.error(`Unknown command: ${command}`)
|
|
131
|
+
console.error("Run 'acp-harness --help' for usage")
|
|
132
|
+
process.exit(1)
|
|
570
133
|
}
|
|
571
|
-
|
|
572
|
-
let isFirstOutput = true
|
|
573
|
-
|
|
574
|
-
try {
|
|
575
|
-
logProgress('Connecting to agent...', showProgress)
|
|
576
|
-
await client.connect()
|
|
577
|
-
logProgress('Connected!', showProgress)
|
|
578
|
-
|
|
579
|
-
// Create session with MCP servers
|
|
580
|
-
const session = await client.createSession(sessionParams)
|
|
581
|
-
logProgress(`Session: ${session.id}`, showProgress)
|
|
582
|
-
|
|
583
|
-
// Run evaluations sequentially
|
|
584
|
-
for (let i = 0; i < prompts.length; i++) {
|
|
585
|
-
const promptCase = prompts[i]
|
|
586
|
-
if (!promptCase) continue
|
|
587
|
-
|
|
588
|
-
logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${promptCase.input.slice(0, 50)}...`, showProgress)
|
|
589
|
-
|
|
590
|
-
const startTime = Date.now()
|
|
591
|
-
let result: FullResult
|
|
592
|
-
|
|
593
|
-
try {
|
|
594
|
-
const prompt = createPrompt(promptCase.input)
|
|
595
|
-
const { updates } = await client.promptSync(session.id, prompt)
|
|
596
|
-
|
|
597
|
-
const endTime = Date.now()
|
|
598
|
-
const trajectory = extractTrajectory(updates, startTime)
|
|
599
|
-
const output = extractOutput(trajectory)
|
|
600
|
-
const hasErrors = hasToolErrors(trajectory)
|
|
601
|
-
|
|
602
|
-
result = {
|
|
603
|
-
id: promptCase.id,
|
|
604
|
-
input: promptCase.input,
|
|
605
|
-
output,
|
|
606
|
-
...(promptCase.expected && { expected: promptCase.expected }),
|
|
607
|
-
trajectory,
|
|
608
|
-
metadata: {
|
|
609
|
-
...promptCase.metadata,
|
|
610
|
-
agent: agentCommand.join(' '),
|
|
611
|
-
},
|
|
612
|
-
timing: {
|
|
613
|
-
start: startTime,
|
|
614
|
-
end: endTime,
|
|
615
|
-
firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined,
|
|
616
|
-
},
|
|
617
|
-
status: hasErrors ? 'failed' : 'passed',
|
|
618
|
-
}
|
|
619
|
-
} catch (error) {
|
|
620
|
-
const endTime = Date.now()
|
|
621
|
-
const message = error instanceof Error ? error.message : String(error)
|
|
622
|
-
const isTimeout = message.includes('timeout') || message.includes('timed out')
|
|
623
|
-
|
|
624
|
-
result = {
|
|
625
|
-
id: promptCase.id,
|
|
626
|
-
input: promptCase.input,
|
|
627
|
-
output: '',
|
|
628
|
-
trajectory: [],
|
|
629
|
-
metadata: {
|
|
630
|
-
...promptCase.metadata,
|
|
631
|
-
agent: agentCommand.join(' '),
|
|
632
|
-
},
|
|
633
|
-
timing: {
|
|
634
|
-
start: startTime,
|
|
635
|
-
end: endTime,
|
|
636
|
-
},
|
|
637
|
-
status: isTimeout ? 'timeout' : 'error',
|
|
638
|
-
errors: [message],
|
|
639
|
-
}
|
|
640
|
-
}
|
|
641
|
-
|
|
642
|
-
// Format and output result
|
|
643
|
-
if (format === 'judge') {
|
|
644
|
-
// Judge format: write markdown to .md, full JSONL to .full.jsonl
|
|
645
|
-
const markdown = formatJudgeMarkdown(result)
|
|
646
|
-
const fullJsonl = formatFullWithStepIds(result)
|
|
647
|
-
await writeOutput(markdown, judgeMarkdownPath, !isFirstOutput)
|
|
648
|
-
await writeOutput(fullJsonl, judgeFullPath, !isFirstOutput)
|
|
649
|
-
} else {
|
|
650
|
-
// Summary format: write to single file
|
|
651
|
-
const formatted = formatResult(result, format)
|
|
652
|
-
await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
|
|
653
|
-
}
|
|
654
|
-
isFirstOutput = false
|
|
655
|
-
|
|
656
|
-
const statusIcon = result.status === 'passed' ? '✓' : result.status === 'failed' ? '✗' : '!'
|
|
657
|
-
logProgress(` ${statusIcon} ${result.status} (${result.timing.end - result.timing.start}ms)`, showProgress)
|
|
658
|
-
}
|
|
659
|
-
} finally {
|
|
660
|
-
logProgress('Disconnecting...', showProgress)
|
|
661
|
-
await client.disconnect()
|
|
662
|
-
}
|
|
663
|
-
|
|
664
|
-
logProgress('Done!', showProgress)
|
|
665
134
|
}
|
|
666
135
|
|
|
667
136
|
main().catch((error) => {
|
|
668
|
-
console.error('Error:', error)
|
|
137
|
+
console.error('Error:', error instanceof Error ? error.message : error)
|
|
669
138
|
process.exit(1)
|
|
670
139
|
})
|