@plaited/acp-harness 0.3.2 → 0.4.0
This diff compares the contents of two publicly available versions of the package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- package/README.md +53 -31
- package/bin/cli.ts +15 -0
- package/package.json +5 -7
- package/src/acp-client.ts +7 -4
- package/src/adapter-check.ts +0 -1
- package/src/adapter-scaffold.ts +16 -15
- package/src/calibrate.ts +28 -8
- package/src/capture.ts +114 -33
- package/src/grader-loader.ts +3 -3
- package/src/harness.ts +4 -0
- package/src/headless-cli.ts +433 -0
- package/src/headless-history-builder.ts +141 -0
- package/src/headless-output-parser.ts +251 -0
- package/src/headless-session-manager.ts +389 -0
- package/src/headless.schemas.ts +241 -0
- package/src/headless.ts +71 -0
- package/src/headless.types.ts +19 -0
- package/src/integration_tests/acp-claude.spec.ts +170 -0
- package/src/integration_tests/acp-gemini.spec.ts +174 -0
- package/src/schemas.ts +88 -36
- package/src/summarize.ts +4 -8
- package/src/tests/acp-client.spec.ts +1 -1
- package/src/tests/capture-cli.spec.ts +188 -0
- package/src/tests/capture-helpers.spec.ts +229 -67
- package/src/tests/constants.spec.ts +121 -0
- package/src/tests/fixtures/grader-exec.py +3 -3
- package/src/tests/fixtures/grader-module.ts +2 -2
- package/src/tests/grader-loader.spec.ts +5 -5
- package/src/tests/headless.spec.ts +460 -0
- package/src/tests/schemas-cli.spec.ts +142 -0
- package/src/tests/schemas.spec.ts +657 -0
- package/src/tests/summarize-helpers.spec.ts +3 -3
- package/src/tests/trials-cli.spec.ts +145 -0
- package/src/trials.ts +6 -19
- package/src/validate-refs.ts +1 -1
- package/src/tests/acp-integration.docker.ts +0 -214
package/src/capture.ts
CHANGED
|
@@ -18,8 +18,8 @@ import { createACPClient } from './acp-client.ts'
|
|
|
18
18
|
import { createPrompt } from './acp-helpers.ts'
|
|
19
19
|
import { DEFAULT_HARNESS_TIMEOUT, HEAD_LINES, TAIL_LINES } from './constants.ts'
|
|
20
20
|
import { loadGrader } from './grader-loader.ts'
|
|
21
|
-
import type { CaptureResult, Grader, PromptCase, TrajectoryStep } from './schemas.ts'
|
|
22
|
-
import {
|
|
21
|
+
import type { CaptureResult, Grader, PromptCase, TrajectoryRichness, TrajectoryStep } from './schemas.ts'
|
|
22
|
+
import { PromptCaseSchema, TokenUsageSchema, ToolInputSchema } from './schemas.ts'
|
|
23
23
|
|
|
24
24
|
// ============================================================================
|
|
25
25
|
// Types
|
|
@@ -41,8 +41,6 @@ export type CaptureConfig = {
|
|
|
41
41
|
progress?: boolean
|
|
42
42
|
/** Append to output file instead of overwriting */
|
|
43
43
|
append?: boolean
|
|
44
|
-
/** MCP server configurations */
|
|
45
|
-
mcpServers?: unknown[]
|
|
46
44
|
/** Optional grader function */
|
|
47
45
|
grader?: Grader
|
|
48
46
|
}
|
|
@@ -192,6 +190,73 @@ const resolvePath = (path: string): string => {
|
|
|
192
190
|
return `${process.cwd()}/${path}`
|
|
193
191
|
}
|
|
194
192
|
|
|
193
|
+
/**
|
|
194
|
+
* Detect trajectory richness level from captured steps.
|
|
195
|
+
*
|
|
196
|
+
* @remarks
|
|
197
|
+
* Different adapters provide varying levels of detail:
|
|
198
|
+
* - `full`: Has thoughts, tool calls, or plans (e.g., Claude Code)
|
|
199
|
+
* - `messages-only`: Only message steps present
|
|
200
|
+
* - `minimal`: Empty or unknown content
|
|
201
|
+
*
|
|
202
|
+
* Uses single-pass iteration with early exit for efficiency.
|
|
203
|
+
*/
|
|
204
|
+
export const detectTrajectoryRichness = (trajectory: TrajectoryStep[]): TrajectoryRichness => {
|
|
205
|
+
let hasMessages = false
|
|
206
|
+
|
|
207
|
+
for (const step of trajectory) {
|
|
208
|
+
// Early exit: any of these means 'full' richness
|
|
209
|
+
if (step.type === 'thought' || step.type === 'tool_call' || step.type === 'plan') {
|
|
210
|
+
return 'full'
|
|
211
|
+
}
|
|
212
|
+
if (step.type === 'message') {
|
|
213
|
+
hasMessages = true
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
return hasMessages ? 'messages-only' : 'minimal'
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
/**
|
|
221
|
+
* Extract token counts from session notifications if available.
|
|
222
|
+
*
|
|
223
|
+
* @remarks
|
|
224
|
+
* Token usage is adapter-dependent. If the adapter doesn't expose usage,
|
|
225
|
+
* these fields will be undefined. Uses Zod validation for runtime type safety.
|
|
226
|
+
*/
|
|
227
|
+
export const extractTokenCounts = (updates: SessionNotification[]): { inputTokens?: number; outputTokens?: number } => {
|
|
228
|
+
let inputTokens: number | undefined
|
|
229
|
+
let outputTokens: number | undefined
|
|
230
|
+
|
|
231
|
+
for (const update of updates) {
|
|
232
|
+
// Check for token usage in update (adapter-specific)
|
|
233
|
+
// ACP SDK doesn't declare 'usage' field, but adapters extend it at runtime
|
|
234
|
+
const updateRecord = update as Record<string, unknown>
|
|
235
|
+
const usageData = updateRecord.usage ?? (updateRecord.update as Record<string, unknown> | undefined)?.usage
|
|
236
|
+
const usage = TokenUsageSchema.safeParse(usageData)
|
|
237
|
+
|
|
238
|
+
if (usage.success) {
|
|
239
|
+
if (usage.data.inputTokens !== undefined) {
|
|
240
|
+
inputTokens = (inputTokens ?? 0) + usage.data.inputTokens
|
|
241
|
+
}
|
|
242
|
+
if (usage.data.outputTokens !== undefined) {
|
|
243
|
+
outputTokens = (outputTokens ?? 0) + usage.data.outputTokens
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
return { inputTokens, outputTokens }
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
/** Get preview text for input (handles string or array) */
|
|
252
|
+
const getInputPreview = (input: string | string[]): string => {
|
|
253
|
+
if (Array.isArray(input)) {
|
|
254
|
+
const first = input[0] ?? ''
|
|
255
|
+
return `[${input.length} turns] ${first.slice(0, 40)}...`
|
|
256
|
+
}
|
|
257
|
+
return input.slice(0, 50)
|
|
258
|
+
}
|
|
259
|
+
|
|
195
260
|
// ============================================================================
|
|
196
261
|
// Capture Implementation
|
|
197
262
|
// ============================================================================
|
|
@@ -199,6 +264,10 @@ const resolvePath = (path: string): string => {
|
|
|
199
264
|
/**
|
|
200
265
|
* Execute capture with configuration object.
|
|
201
266
|
*
|
|
267
|
+
* @remarks
|
|
268
|
+
* Creates a fresh session for each JSONL entry to ensure isolation.
|
|
269
|
+
* Supports multi-turn conversations via `input: string[]`.
|
|
270
|
+
*
|
|
202
271
|
* @param config - Capture configuration
|
|
203
272
|
* @returns Array of capture results
|
|
204
273
|
*/
|
|
@@ -211,13 +280,9 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
211
280
|
timeout = DEFAULT_HARNESS_TIMEOUT,
|
|
212
281
|
progress = false,
|
|
213
282
|
append = false,
|
|
214
|
-
mcpServers = [],
|
|
215
283
|
grader,
|
|
216
284
|
} = config
|
|
217
285
|
|
|
218
|
-
// Parse MCP server configurations
|
|
219
|
-
const parsedMcpServers = mcpServers.map((s) => McpServerSchema.parse(s))
|
|
220
|
-
|
|
221
286
|
// Load prompts
|
|
222
287
|
const prompts = await loadPrompts(promptsPath)
|
|
223
288
|
|
|
@@ -230,9 +295,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
230
295
|
if (resolvedOutputPath) {
|
|
231
296
|
logProgress(`Output: ${resolvedOutputPath}`, progress)
|
|
232
297
|
}
|
|
233
|
-
if (parsedMcpServers.length > 0) {
|
|
234
|
-
logProgress(`MCP Servers: ${parsedMcpServers.map((s) => s.name).join(', ')}`, progress)
|
|
235
|
-
}
|
|
236
298
|
|
|
237
299
|
// Create ACP client
|
|
238
300
|
const client = createACPClient({
|
|
@@ -246,10 +308,9 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
246
308
|
await Bun.write(resolvedOutputPath, '')
|
|
247
309
|
}
|
|
248
310
|
|
|
249
|
-
// Session params
|
|
311
|
+
// Session params - agents auto-discover MCP configs from cwd
|
|
250
312
|
const sessionParams = {
|
|
251
313
|
cwd: cwd ?? process.cwd(),
|
|
252
|
-
mcpServers: parsedMcpServers,
|
|
253
314
|
}
|
|
254
315
|
|
|
255
316
|
const results: CaptureResult[] = []
|
|
@@ -260,43 +321,64 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
260
321
|
await client.connect()
|
|
261
322
|
logProgress('Connected!', progress)
|
|
262
323
|
|
|
263
|
-
//
|
|
264
|
-
const session = await client.createSession(sessionParams)
|
|
265
|
-
logProgress(`Session: ${session.id}`, progress)
|
|
266
|
-
|
|
267
|
-
// Run evaluations sequentially
|
|
324
|
+
// Run evaluations sequentially - fresh session per entry
|
|
268
325
|
for (let i = 0; i < prompts.length; i++) {
|
|
269
326
|
const promptCase = prompts[i]
|
|
270
327
|
if (!promptCase) continue
|
|
271
328
|
|
|
272
|
-
logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${promptCase.input
|
|
329
|
+
logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
|
|
273
330
|
|
|
274
331
|
const startTime = Date.now()
|
|
275
332
|
let result: CaptureResult
|
|
276
333
|
|
|
277
334
|
try {
|
|
278
|
-
|
|
279
|
-
const
|
|
335
|
+
// Create fresh session for each entry (ensures isolation)
|
|
336
|
+
const sessionStart = Date.now()
|
|
337
|
+
const session = await client.createSession(sessionParams)
|
|
338
|
+
const sessionCreation = Date.now() - sessionStart
|
|
339
|
+
logProgress(` Session: ${session.id}`, progress)
|
|
340
|
+
|
|
341
|
+
// Handle string or array input
|
|
342
|
+
const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
|
|
343
|
+
const turnCount = inputs.length
|
|
344
|
+
|
|
345
|
+
// Collect all updates from all turns
|
|
346
|
+
const allUpdates: SessionNotification[] = []
|
|
347
|
+
|
|
348
|
+
// Execute each turn sequentially in the same session
|
|
349
|
+
for (const turnInput of inputs) {
|
|
350
|
+
const prompt = createPrompt(turnInput)
|
|
351
|
+
const { updates } = await client.promptSync(session.id, prompt)
|
|
352
|
+
allUpdates.push(...updates)
|
|
353
|
+
}
|
|
280
354
|
|
|
281
355
|
const endTime = Date.now()
|
|
282
|
-
const trajectory = extractTrajectory(
|
|
356
|
+
const trajectory = extractTrajectory(allUpdates, startTime)
|
|
283
357
|
const output = extractOutput(trajectory)
|
|
284
358
|
const toolErrors = hasToolErrors(trajectory)
|
|
359
|
+
const trajectoryRichness = detectTrajectoryRichness(trajectory)
|
|
360
|
+
const tokenCounts = extractTokenCounts(allUpdates)
|
|
285
361
|
|
|
286
362
|
result = {
|
|
287
363
|
id: promptCase.id,
|
|
288
|
-
input: promptCase.input,
|
|
364
|
+
input: promptCase.input, // Preserve original (string or array)
|
|
289
365
|
output,
|
|
290
|
-
...(promptCase.
|
|
366
|
+
...(promptCase.hint && { hint: promptCase.hint }),
|
|
291
367
|
trajectory,
|
|
292
368
|
metadata: {
|
|
293
369
|
...promptCase.metadata,
|
|
294
370
|
agent: agentCommand.join(' '),
|
|
371
|
+
trajectoryRichness,
|
|
372
|
+
turnCount,
|
|
295
373
|
},
|
|
296
374
|
timing: {
|
|
297
375
|
start: startTime,
|
|
298
376
|
end: endTime,
|
|
299
377
|
firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined,
|
|
378
|
+
sessionCreation,
|
|
379
|
+
total: endTime - startTime,
|
|
380
|
+
...(tokenCounts.inputTokens !== undefined && { inputTokens: tokenCounts.inputTokens }),
|
|
381
|
+
...(tokenCounts.outputTokens !== undefined && { outputTokens: tokenCounts.outputTokens }),
|
|
300
382
|
},
|
|
301
383
|
toolErrors,
|
|
302
384
|
}
|
|
@@ -306,13 +388,14 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
306
388
|
result.score = await grader({
|
|
307
389
|
input: promptCase.input,
|
|
308
390
|
output,
|
|
309
|
-
|
|
391
|
+
hint: promptCase.hint,
|
|
310
392
|
trajectory,
|
|
311
393
|
})
|
|
312
394
|
}
|
|
313
395
|
} catch (error) {
|
|
314
396
|
const endTime = Date.now()
|
|
315
397
|
const message = error instanceof Error ? error.message : String(error)
|
|
398
|
+
const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
|
|
316
399
|
|
|
317
400
|
result = {
|
|
318
401
|
id: promptCase.id,
|
|
@@ -322,10 +405,14 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
322
405
|
metadata: {
|
|
323
406
|
...promptCase.metadata,
|
|
324
407
|
agent: agentCommand.join(' '),
|
|
408
|
+
trajectoryRichness: 'minimal' as TrajectoryRichness,
|
|
409
|
+
turnCount: inputs.length,
|
|
325
410
|
},
|
|
326
411
|
timing: {
|
|
327
412
|
start: startTime,
|
|
328
413
|
end: endTime,
|
|
414
|
+
sessionCreation: 0,
|
|
415
|
+
total: endTime - startTime,
|
|
329
416
|
},
|
|
330
417
|
toolErrors: true,
|
|
331
418
|
errors: [message],
|
|
@@ -340,7 +427,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
340
427
|
isFirstOutput = false
|
|
341
428
|
|
|
342
429
|
const statusIcon = result.toolErrors ? '!' : '✓'
|
|
343
|
-
logProgress(` ${statusIcon} (${result.timing.
|
|
430
|
+
logProgress(` ${statusIcon} (${result.timing.total}ms)`, progress)
|
|
344
431
|
}
|
|
345
432
|
} finally {
|
|
346
433
|
logProgress('Disconnecting...', progress)
|
|
@@ -369,7 +456,6 @@ export const capture = async (args: string[]): Promise<void> => {
|
|
|
369
456
|
timeout: { type: 'string', short: 't', default: String(DEFAULT_HARNESS_TIMEOUT) },
|
|
370
457
|
progress: { type: 'boolean', default: false },
|
|
371
458
|
append: { type: 'boolean', default: false },
|
|
372
|
-
'mcp-server': { type: 'string', multiple: true },
|
|
373
459
|
grader: { type: 'string', short: 'g' },
|
|
374
460
|
help: { type: 'boolean', short: 'h' },
|
|
375
461
|
},
|
|
@@ -387,11 +473,10 @@ Arguments:
|
|
|
387
473
|
|
|
388
474
|
Options:
|
|
389
475
|
-o, --output Output file (default: stdout)
|
|
390
|
-
-c, --cwd Working directory for agent
|
|
476
|
+
-c, --cwd Working directory for agent (agents auto-discover MCP configs from here)
|
|
391
477
|
-t, --timeout Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
|
|
392
478
|
--progress Show progress to stderr
|
|
393
479
|
--append Append to output file instead of overwriting
|
|
394
|
-
--mcp-server MCP server config JSON (repeatable)
|
|
395
480
|
-g, --grader Path to grader (.ts/.js module or executable script)
|
|
396
481
|
-h, --help Show this help message
|
|
397
482
|
|
|
@@ -440,9 +525,6 @@ Examples:
|
|
|
440
525
|
}
|
|
441
526
|
}
|
|
442
527
|
|
|
443
|
-
// Parse MCP server configurations
|
|
444
|
-
const mcpServers = (values['mcp-server'] ?? []).map((json) => JSON.parse(json))
|
|
445
|
-
|
|
446
528
|
await runCapture({
|
|
447
529
|
promptsPath,
|
|
448
530
|
agentCommand,
|
|
@@ -451,7 +533,6 @@ Examples:
|
|
|
451
533
|
timeout: Number.parseInt(values.timeout ?? String(DEFAULT_HARNESS_TIMEOUT), 10),
|
|
452
534
|
progress: values.progress ?? false,
|
|
453
535
|
append: values.append ?? false,
|
|
454
|
-
mcpServers,
|
|
455
536
|
grader,
|
|
456
537
|
})
|
|
457
538
|
}
|
package/src/grader-loader.ts
CHANGED
|
@@ -42,9 +42,9 @@ const resolvePath = (path: string): string => {
|
|
|
42
42
|
|
|
43
43
|
/** Input format for executable graders (stdin JSON) */
|
|
44
44
|
type ExecGraderInput = {
|
|
45
|
-
input: string
|
|
45
|
+
input: string | string[]
|
|
46
46
|
output: string
|
|
47
|
-
|
|
47
|
+
hint?: string
|
|
48
48
|
trajectory?: TrajectoryStep[]
|
|
49
49
|
}
|
|
50
50
|
|
|
@@ -63,7 +63,7 @@ const createExecGrader = (execPath: string): Grader => {
|
|
|
63
63
|
const input: ExecGraderInput = {
|
|
64
64
|
input: params.input,
|
|
65
65
|
output: params.output,
|
|
66
|
-
|
|
66
|
+
hint: params.hint,
|
|
67
67
|
trajectory: params.trajectory,
|
|
68
68
|
}
|
|
69
69
|
|
package/src/harness.ts
CHANGED
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
* - `validateRefs` - Check reference solutions
|
|
14
14
|
* - `balance` - Analyze test set coverage
|
|
15
15
|
* - `schemasCli` - Export JSON schemas
|
|
16
|
+
* - `headless` - Schema-driven adapter for headless CLI agents
|
|
16
17
|
*
|
|
17
18
|
* @packageDocumentation
|
|
18
19
|
*/
|
|
@@ -25,6 +26,9 @@ export { calibrate, runCalibrate } from './calibrate.ts'
|
|
|
25
26
|
export type { CaptureConfig } from './capture.ts'
|
|
26
27
|
// Command implementations (for programmatic use)
|
|
27
28
|
export { capture, extractOutput, extractTrajectory, hasToolErrors, loadPrompts, runCapture } from './capture.ts'
|
|
29
|
+
export type { HeadlessAdapterConfig } from './headless.ts'
|
|
30
|
+
// Headless adapter factory
|
|
31
|
+
export { headless } from './headless.ts'
|
|
28
32
|
export type { SchemasConfig } from './schemas-cli.ts'
|
|
29
33
|
export { runSchemas, schemasCli } from './schemas-cli.ts'
|
|
30
34
|
export type { SummarizeConfig } from './summarize.ts'
|