@plaited/acp-harness 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/capture.ts CHANGED
@@ -18,8 +18,8 @@ import { createACPClient } from './acp-client.ts'
18
18
  import { createPrompt } from './acp-helpers.ts'
19
19
  import { DEFAULT_HARNESS_TIMEOUT, HEAD_LINES, TAIL_LINES } from './constants.ts'
20
20
  import { loadGrader } from './grader-loader.ts'
21
- import type { CaptureResult, Grader, PromptCase, TrajectoryStep } from './schemas.ts'
22
- import { McpServerSchema, PromptCaseSchema, ToolInputSchema } from './schemas.ts'
21
+ import type { CaptureResult, Grader, PromptCase, TrajectoryRichness, TrajectoryStep } from './schemas.ts'
22
+ import { PromptCaseSchema, TokenUsageSchema, ToolInputSchema } from './schemas.ts'
23
23
 
24
24
  // ============================================================================
25
25
  // Types
@@ -41,8 +41,6 @@ export type CaptureConfig = {
41
41
  progress?: boolean
42
42
  /** Append to output file instead of overwriting */
43
43
  append?: boolean
44
- /** MCP server configurations */
45
- mcpServers?: unknown[]
46
44
  /** Optional grader function */
47
45
  grader?: Grader
48
46
  }
@@ -192,6 +190,73 @@ const resolvePath = (path: string): string => {
192
190
  return `${process.cwd()}/${path}`
193
191
  }
194
192
 
193
/**
 * Classify how much detail a captured trajectory contains.
 *
 * @remarks
 * The result is one of three levels:
 * - `full`: at least one `thought`, `tool_call`, or `plan` step is present
 * - `messages-only`: no such step, but at least one `message` step is present
 * - `minimal`: the trajectory is empty or holds only other step types
 *
 * The steps are scanned once; the scan stops at the first step that proves
 * `full` richness.
 *
 * @param trajectory - Steps captured for a single prompt case
 * @returns The richness level of the trajectory
 */
export const detectTrajectoryRichness = (trajectory: TrajectoryStep[]): TrajectoryRichness => {
  let sawMessage = false

  for (const { type } of trajectory) {
    switch (type) {
      // Any of these step kinds is enough to call the trajectory 'full'
      case 'thought':
      case 'tool_call':
      case 'plan':
        return 'full'
      case 'message':
        sawMessage = true
        break
    }
  }

  if (sawMessage) {
    return 'messages-only'
  }
  return 'minimal'
}
219
+
220
/**
 * Sum the token usage reported across a list of session notifications.
 *
 * @remarks
 * For each notification the usage payload is read from its top-level `usage`
 * property, falling back to `update.usage` when that is nullish. The payload is
 * validated with `TokenUsageSchema.safeParse`; notifications whose payload does
 * not validate are ignored. A total stays `undefined` unless at least one
 * validated payload supplied that field.
 *
 * @param updates - Session notifications collected while running a prompt case
 * @returns Accumulated `inputTokens` and `outputTokens`, each possibly `undefined`
 */
export const extractTokenCounts = (updates: SessionNotification[]): { inputTokens?: number; outputTokens?: number } => {
  let inputTotal: number | undefined
  let outputTotal: number | undefined

  for (const notification of updates) {
    // The notification type does not declare 'usage', so read it through an untyped view
    const fields = notification as Record<string, unknown>
    const parsed = TokenUsageSchema.safeParse(
      fields.usage ?? (fields.update as Record<string, unknown> | undefined)?.usage,
    )
    if (!parsed.success) continue

    const { inputTokens: reportedInput, outputTokens: reportedOutput } = parsed.data
    if (reportedInput !== undefined) {
      inputTotal = (inputTotal ?? 0) + reportedInput
    }
    if (reportedOutput !== undefined) {
      outputTotal = (outputTotal ?? 0) + reportedOutput
    }
  }

  return { inputTokens: inputTotal, outputTokens: outputTotal }
}
250
+
251
/**
 * Build a short, single-line preview of a prompt input for progress logging.
 *
 * @remarks
 * The preview never carries its own trailing ellipsis: the caller appends
 * `...` after the preview, so adding one here would print it twice for
 * multi-turn inputs while single-string inputs would get only one.
 *
 * @param input - A single prompt string, or an array of turns
 * @returns For a string, its first 50 characters; for an array, the turn count
 * followed by the first 40 characters of the first turn (empty if there is none)
 */
const getInputPreview = (input: string | string[]): string => {
  if (Array.isArray(input)) {
    // An empty array has no first turn; preview it as an empty string
    const first = input[0] ?? ''
    return `[${input.length} turns] ${first.slice(0, 40)}`
  }
  return input.slice(0, 50)
}
259
+
195
260
  // ============================================================================
196
261
  // Capture Implementation
197
262
  // ============================================================================
@@ -199,6 +264,10 @@ const resolvePath = (path: string): string => {
199
264
  /**
200
265
  * Execute capture with configuration object.
201
266
  *
267
+ * @remarks
268
+ * Creates a fresh session for each JSONL entry to ensure isolation.
269
+ * Supports multi-turn conversations via `input: string[]`.
270
+ *
202
271
  * @param config - Capture configuration
203
272
  * @returns Array of capture results
204
273
  */
@@ -211,13 +280,9 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
211
280
  timeout = DEFAULT_HARNESS_TIMEOUT,
212
281
  progress = false,
213
282
  append = false,
214
- mcpServers = [],
215
283
  grader,
216
284
  } = config
217
285
 
218
- // Parse MCP server configurations
219
- const parsedMcpServers = mcpServers.map((s) => McpServerSchema.parse(s))
220
-
221
286
  // Load prompts
222
287
  const prompts = await loadPrompts(promptsPath)
223
288
 
@@ -230,9 +295,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
230
295
  if (resolvedOutputPath) {
231
296
  logProgress(`Output: ${resolvedOutputPath}`, progress)
232
297
  }
233
- if (parsedMcpServers.length > 0) {
234
- logProgress(`MCP Servers: ${parsedMcpServers.map((s) => s.name).join(', ')}`, progress)
235
- }
236
298
 
237
299
  // Create ACP client
238
300
  const client = createACPClient({
@@ -246,10 +308,9 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
246
308
  await Bun.write(resolvedOutputPath, '')
247
309
  }
248
310
 
249
- // Session params with MCP servers
311
+ // Session params - agents auto-discover MCP configs from cwd
250
312
  const sessionParams = {
251
313
  cwd: cwd ?? process.cwd(),
252
- mcpServers: parsedMcpServers,
253
314
  }
254
315
 
255
316
  const results: CaptureResult[] = []
@@ -260,43 +321,64 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
260
321
  await client.connect()
261
322
  logProgress('Connected!', progress)
262
323
 
263
- // Create session with MCP servers
264
- const session = await client.createSession(sessionParams)
265
- logProgress(`Session: ${session.id}`, progress)
266
-
267
- // Run evaluations sequentially
324
+ // Run evaluations sequentially - fresh session per entry
268
325
  for (let i = 0; i < prompts.length; i++) {
269
326
  const promptCase = prompts[i]
270
327
  if (!promptCase) continue
271
328
 
272
- logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${promptCase.input.slice(0, 50)}...`, progress)
329
+ logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
273
330
 
274
331
  const startTime = Date.now()
275
332
  let result: CaptureResult
276
333
 
277
334
  try {
278
- const prompt = createPrompt(promptCase.input)
279
- const { updates } = await client.promptSync(session.id, prompt)
335
+ // Create fresh session for each entry (ensures isolation)
336
+ const sessionStart = Date.now()
337
+ const session = await client.createSession(sessionParams)
338
+ const sessionCreation = Date.now() - sessionStart
339
+ logProgress(` Session: ${session.id}`, progress)
340
+
341
+ // Handle string or array input
342
+ const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
343
+ const turnCount = inputs.length
344
+
345
+ // Collect all updates from all turns
346
+ const allUpdates: SessionNotification[] = []
347
+
348
+ // Execute each turn sequentially in the same session
349
+ for (const turnInput of inputs) {
350
+ const prompt = createPrompt(turnInput)
351
+ const { updates } = await client.promptSync(session.id, prompt)
352
+ allUpdates.push(...updates)
353
+ }
280
354
 
281
355
  const endTime = Date.now()
282
- const trajectory = extractTrajectory(updates, startTime)
356
+ const trajectory = extractTrajectory(allUpdates, startTime)
283
357
  const output = extractOutput(trajectory)
284
358
  const toolErrors = hasToolErrors(trajectory)
359
+ const trajectoryRichness = detectTrajectoryRichness(trajectory)
360
+ const tokenCounts = extractTokenCounts(allUpdates)
285
361
 
286
362
  result = {
287
363
  id: promptCase.id,
288
- input: promptCase.input,
364
+ input: promptCase.input, // Preserve original (string or array)
289
365
  output,
290
- ...(promptCase.expected && { expected: promptCase.expected }),
366
+ ...(promptCase.hint && { hint: promptCase.hint }),
291
367
  trajectory,
292
368
  metadata: {
293
369
  ...promptCase.metadata,
294
370
  agent: agentCommand.join(' '),
371
+ trajectoryRichness,
372
+ turnCount,
295
373
  },
296
374
  timing: {
297
375
  start: startTime,
298
376
  end: endTime,
299
377
  firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined,
378
+ sessionCreation,
379
+ total: endTime - startTime,
380
+ ...(tokenCounts.inputTokens !== undefined && { inputTokens: tokenCounts.inputTokens }),
381
+ ...(tokenCounts.outputTokens !== undefined && { outputTokens: tokenCounts.outputTokens }),
300
382
  },
301
383
  toolErrors,
302
384
  }
@@ -306,13 +388,14 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
306
388
  result.score = await grader({
307
389
  input: promptCase.input,
308
390
  output,
309
- expected: promptCase.expected,
391
+ hint: promptCase.hint,
310
392
  trajectory,
311
393
  })
312
394
  }
313
395
  } catch (error) {
314
396
  const endTime = Date.now()
315
397
  const message = error instanceof Error ? error.message : String(error)
398
+ const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
316
399
 
317
400
  result = {
318
401
  id: promptCase.id,
@@ -322,10 +405,14 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
322
405
  metadata: {
323
406
  ...promptCase.metadata,
324
407
  agent: agentCommand.join(' '),
408
+ trajectoryRichness: 'minimal' as TrajectoryRichness,
409
+ turnCount: inputs.length,
325
410
  },
326
411
  timing: {
327
412
  start: startTime,
328
413
  end: endTime,
414
+ sessionCreation: 0,
415
+ total: endTime - startTime,
329
416
  },
330
417
  toolErrors: true,
331
418
  errors: [message],
@@ -340,7 +427,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
340
427
  isFirstOutput = false
341
428
 
342
429
  const statusIcon = result.toolErrors ? '!' : '✓'
343
- logProgress(` ${statusIcon} (${result.timing.end - result.timing.start}ms)`, progress)
430
+ logProgress(` ${statusIcon} (${result.timing.total}ms)`, progress)
344
431
  }
345
432
  } finally {
346
433
  logProgress('Disconnecting...', progress)
@@ -369,7 +456,6 @@ export const capture = async (args: string[]): Promise<void> => {
369
456
  timeout: { type: 'string', short: 't', default: String(DEFAULT_HARNESS_TIMEOUT) },
370
457
  progress: { type: 'boolean', default: false },
371
458
  append: { type: 'boolean', default: false },
372
- 'mcp-server': { type: 'string', multiple: true },
373
459
  grader: { type: 'string', short: 'g' },
374
460
  help: { type: 'boolean', short: 'h' },
375
461
  },
@@ -387,11 +473,10 @@ Arguments:
387
473
 
388
474
  Options:
389
475
  -o, --output Output file (default: stdout)
390
- -c, --cwd Working directory for agent
476
+ -c, --cwd Working directory for agent (agents auto-discover MCP configs from here)
391
477
  -t, --timeout Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
392
478
  --progress Show progress to stderr
393
479
  --append Append to output file instead of overwriting
394
- --mcp-server MCP server config JSON (repeatable)
395
480
  -g, --grader Path to grader (.ts/.js module or executable script)
396
481
  -h, --help Show this help message
397
482
 
@@ -440,9 +525,6 @@ Examples:
440
525
  }
441
526
  }
442
527
 
443
- // Parse MCP server configurations
444
- const mcpServers = (values['mcp-server'] ?? []).map((json) => JSON.parse(json))
445
-
446
528
  await runCapture({
447
529
  promptsPath,
448
530
  agentCommand,
@@ -451,7 +533,6 @@ Examples:
451
533
  timeout: Number.parseInt(values.timeout ?? String(DEFAULT_HARNESS_TIMEOUT), 10),
452
534
  progress: values.progress ?? false,
453
535
  append: values.append ?? false,
454
- mcpServers,
455
536
  grader,
456
537
  })
457
538
  }
@@ -42,9 +42,9 @@ const resolvePath = (path: string): string => {
42
42
 
43
43
  /** Input format for executable graders (stdin JSON) */
44
44
  type ExecGraderInput = {
45
- input: string
45
+ input: string | string[]
46
46
  output: string
47
- expected?: string
47
+ hint?: string
48
48
  trajectory?: TrajectoryStep[]
49
49
  }
50
50
 
@@ -63,7 +63,7 @@ const createExecGrader = (execPath: string): Grader => {
63
63
  const input: ExecGraderInput = {
64
64
  input: params.input,
65
65
  output: params.output,
66
- expected: params.expected,
66
+ hint: params.hint,
67
67
  trajectory: params.trajectory,
68
68
  }
69
69
 
package/src/harness.ts CHANGED
@@ -13,6 +13,7 @@
13
13
  * - `validateRefs` - Check reference solutions
14
14
  * - `balance` - Analyze test set coverage
15
15
  * - `schemasCli` - Export JSON schemas
16
+ * - `headless` - Schema-driven adapter for headless CLI agents
16
17
  *
17
18
  * @packageDocumentation
18
19
  */
@@ -25,6 +26,9 @@ export { calibrate, runCalibrate } from './calibrate.ts'
25
26
  export type { CaptureConfig } from './capture.ts'
26
27
  // Command implementations (for programmatic use)
27
28
  export { capture, extractOutput, extractTrajectory, hasToolErrors, loadPrompts, runCapture } from './capture.ts'
29
+ export type { HeadlessAdapterConfig } from './headless.ts'
30
+ // Headless adapter factory
31
+ export { headless } from './headless.ts'
28
32
  export type { SchemasConfig } from './schemas-cli.ts'
29
33
  export { runSchemas, schemasCli } from './schemas-cli.ts'
30
34
  export type { SummarizeConfig } from './summarize.ts'