@plaited/acp-harness 0.3.2 → 0.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/src/schemas.ts CHANGED
@@ -222,14 +222,16 @@ export type McpServerConfig = z.infer<typeof McpServerSchema>
222
222
  *
223
223
  * @remarks
224
224
  * Each line in a prompts.jsonl file should match this schema.
225
+ * - Single turn: `input: "Hello"` - one prompt, one session
226
+ * - Multi-turn: `input: ["Hello", "How are you?", "Goodbye"]` - sequential turns in one session
225
227
  */
226
228
  export const PromptCaseSchema = z.object({
227
229
  /** Unique identifier for the test case */
228
230
  id: z.string(),
229
- /** The prompt text to send to the agent */
230
- input: z.string(),
231
- /** Optional expected output for grading */
232
- expected: z.string().optional(),
231
+ /** Prompt text(s) - string for single turn, array for multi-turn conversation */
232
+ input: z.union([z.string(), z.array(z.string())]),
233
+ /** Optional grader context hint (not a strict expected match) */
234
+ hint: z.string().optional(),
233
235
  /** Optional reference solution for validation */
234
236
  reference: z.string().optional(),
235
237
  /** Optional metadata for categorization and analysis */
@@ -268,25 +270,13 @@ export type GraderResult = z.infer<typeof GraderResultSchema>
268
270
  *
269
271
  * @remarks
270
272
  * User-provided graders implement this interface to score agent outputs.
271
- *
272
- * @example
273
- * ```typescript
274
- * import type { Grader } from '@plaited/acp-harness/schemas'
275
- *
276
- * export const grade: Grader = async ({ input, output, expected, trajectory }) => {
277
- * const pass = output.toLowerCase().includes(expected?.toLowerCase() ?? '')
278
- * return {
279
- * pass,
280
- * score: pass ? 1 : 0,
281
- * reasoning: pass ? 'Contains expected answer' : 'Missing expected answer'
282
- * }
283
- * }
284
- * ```
273
+ * - `input` is the original prompt (string or array for multi-turn)
274
+ * - `hint` provides grader context (renamed from `expected`)
285
275
  */
286
276
  export type Grader = (params: {
287
- input: string
277
+ input: string | string[]
288
278
  output: string
289
- expected?: string
279
+ hint?: string
290
280
  trajectory?: TrajectoryStep[]
291
281
  }) => Promise<GraderResult>
292
282
 
@@ -307,6 +297,24 @@ export const ToolInputSchema = z
307
297
  /** Tool input type */
308
298
  export type ToolInput = z.infer<typeof ToolInputSchema>
309
299
 
300
+ /**
301
+ * Token usage schema for adapter-specific usage data.
302
+ *
303
+ * @remarks
304
+ * ACP SDK's SessionNotification doesn't declare a 'usage' field, but adapters
305
+ * like Claude Code extend responses with token counts at runtime. This schema
306
+ * provides runtime validation for that extension.
307
+ */
308
+ export const TokenUsageSchema = z
309
+ .object({
310
+ inputTokens: z.number().optional(),
311
+ outputTokens: z.number().optional(),
312
+ })
313
+ .passthrough()
314
+
315
+ /** Token usage type */
316
+ export type TokenUsage = z.infer<typeof TokenUsageSchema>
317
+
310
318
  /** Thought trajectory step */
311
319
  export const ThoughtStepSchema = z.object({
312
320
  type: z.literal('thought'),
@@ -366,36 +374,80 @@ export type IndexedStep = TrajectoryStep & { stepId: string }
366
374
  // Capture Result Schemas
367
375
  // ============================================================================
368
376
 
369
- /** Timing information for a capture result */
377
+ /**
378
+ * Timing information for a capture result.
379
+ *
380
+ * @remarks
381
+ * Captures both absolute timestamps and derived durations for analysis:
382
+ * - `sessionCreation`: Time to initialize session (agent startup overhead)
383
+ * - `total`: End-to-end duration including all turns
384
+ * - `firstResponse`: Latency to first agent output (optional)
385
+ *
386
+ * Token counts are adapter-dependent and only present if the adapter
387
+ * exposes usage information (e.g., Claude Code includes them, others may not).
388
+ *
389
+ * @public
390
+ */
370
391
  export const TimingSchema = z.object({
392
+ /** Epoch timestamp when capture started */
371
393
  start: z.number(),
394
+ /** Epoch timestamp when capture ended */
372
395
  end: z.number(),
396
+ /** Time to first response (ms from start) */
373
397
  firstResponse: z.number().optional(),
398
+ /** Time to create session (ms) - measures agent initialization overhead */
399
+ sessionCreation: z.number(),
400
+ /** Total duration (end - start) in milliseconds */
401
+ total: z.number(),
402
+ /** Input tokens consumed (if available from ACP adapter) */
403
+ inputTokens: z.number().optional(),
404
+ /** Output tokens generated (if available from ACP adapter) */
405
+ outputTokens: z.number().optional(),
374
406
  })
375
407
 
376
- /** Timing information type */
408
+ /**
409
+ * Timing information type inferred from TimingSchema.
410
+ *
411
+ * @public
412
+ */
377
413
  export type Timing = z.infer<typeof TimingSchema>
378
414
 
415
+ /**
416
+ * Trajectory richness level indicating the depth of captured agent activity.
417
+ *
418
+ * @remarks
419
+ * Different adapters provide varying levels of detail:
420
+ * - `full`: Thoughts, tool calls, plans (e.g., Claude Code adapter)
421
+ * - `minimal`: Basic output only (e.g., Droid adapter)
422
+ * - `messages-only`: Messages without internal reasoning
423
+ */
424
+ export const TrajectoryRichnessSchema = z.enum(['full', 'minimal', 'messages-only'])
425
+
426
+ /** Trajectory richness type */
427
+ export type TrajectoryRichness = z.infer<typeof TrajectoryRichnessSchema>
428
+
379
429
  /**
380
430
  * Capture result schema.
381
431
  *
382
432
  * @remarks
383
433
  * Full trajectory output from the `capture` command.
384
- * The `toolErrors` field replaces the misleading `status: 'passed'|'failed'`.
434
+ * - `input` can be string (single turn) or string[] (multi-turn)
435
+ * - `hint` provides grader context (renamed from `expected`)
436
+ * - `toolErrors` replaces misleading `status: 'passed'|'failed'`
385
437
  * Real pass/fail determination comes from your grader.
386
438
  */
387
439
  export const CaptureResultSchema = z.object({
388
440
  /** Test case identifier */
389
441
  id: z.string(),
390
- /** Original prompt input */
391
- input: z.string(),
442
+ /** Original prompt input (string for single turn, array for multi-turn) */
443
+ input: z.union([z.string(), z.array(z.string())]),
392
444
  /** Final agent output */
393
445
  output: z.string(),
394
- /** Expected output (if provided) */
395
- expected: z.string().optional(),
446
+ /** Grader context hint (renamed from expected) */
447
+ hint: z.string().optional(),
396
448
  /** Full execution trajectory */
397
449
  trajectory: z.array(TrajectoryStepSchema),
398
- /** Metadata including category, agent info, etc. */
450
+ /** Metadata including category, agent info, trajectoryRichness, turnCount */
399
451
  metadata: z.record(z.string(), z.unknown()),
400
452
  /** Timing information */
401
453
  timing: TimingSchema,
@@ -471,10 +523,10 @@ export type TrialEntry = z.infer<typeof TrialEntrySchema>
471
523
  export const TrialResultSchema = z.object({
472
524
  /** Test case identifier */
473
525
  id: z.string(),
474
- /** Original prompt input */
475
- input: z.string(),
476
- /** Expected output (if provided) */
477
- expected: z.string().optional(),
526
+ /** Original prompt input (string for single turn, array for multi-turn) */
527
+ input: z.union([z.string(), z.array(z.string())]),
528
+ /** Grader context hint (renamed from expected) */
529
+ hint: z.string().optional(),
478
530
  /** Number of trials (k) */
479
531
  k: z.number(),
480
532
  /** Simple pass rate: passes / k (with grader only) */
@@ -498,12 +550,12 @@ export type TrialResult = z.infer<typeof TrialResultSchema>
498
550
  export const CalibrationSampleSchema = z.object({
499
551
  /** Test case identifier */
500
552
  id: z.string(),
501
- /** Original prompt input */
502
- input: z.string(),
553
+ /** Original prompt input (string for single turn, array for multi-turn) */
554
+ input: z.union([z.string(), z.array(z.string())]),
503
555
  /** Agent output */
504
556
  output: z.string(),
505
- /** Expected output (if provided) */
506
- expected: z.string().optional(),
557
+ /** Grader context hint (renamed from expected) */
558
+ hint: z.string().optional(),
507
559
  /** Original grader score */
508
560
  originalScore: GraderResultSchema,
509
561
  /** Re-scored result (if different grader provided) */
package/src/summarize.ts CHANGED
@@ -64,9 +64,10 @@ const loadResults = async (path: string): Promise<CaptureResult[]> => {
64
64
  * @public
65
65
  */
66
66
  export const formatSummary = (result: CaptureResult): SummaryResult => {
67
+ const inputText = Array.isArray(result.input) ? result.input.join('\n') : result.input
67
68
  return {
68
69
  id: result.id,
69
- input: result.input,
70
+ input: inputText,
70
71
  output: result.output,
71
72
  toolCalls: result.trajectory.filter((s) => s.type === 'tool_call').map((s) => (s as { name: string }).name),
72
73
  duration: result.timing.end - result.timing.start,
@@ -82,13 +83,8 @@ export const formatSummary = (result: CaptureResult): SummaryResult => {
82
83
  * @public
83
84
  */
84
85
  export const formatMarkdown = (result: CaptureResult): string => {
85
- const lines: string[] = [
86
- `## Evaluation Record: ${result.id}`,
87
- '',
88
- `**Input:** ${result.input}`,
89
- '',
90
- '**Trajectory:**',
91
- ]
86
+ const inputText = Array.isArray(result.input) ? result.input.join('\n') : result.input
87
+ const lines: string[] = [`## Evaluation Record: ${result.id}`, '', `**Input:** ${inputText}`, '', '**Trajectory:**']
92
88
 
93
89
  let stepNum = 1
94
90
  for (const step of result.trajectory) {
@@ -102,7 +102,7 @@ describe('Operations before connection', () => {
102
102
  command: ['echo', 'test'],
103
103
  })
104
104
 
105
- await expect(client.createSession({ cwd: '/tmp', mcpServers: [] })).rejects.toThrow('Not connected')
105
+ await expect(client.createSession({ cwd: '/tmp' })).rejects.toThrow('Not connected')
106
106
  })
107
107
 
108
108
  test('promptSync throws when not connected', async () => {
@@ -0,0 +1,188 @@
1
+ import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
2
+ import type { CaptureConfig } from '../capture.ts'
3
+ import { loadPrompts } from '../capture.ts'
4
+
5
+ // ============================================================================
6
+ // loadPrompts
7
+ // ============================================================================
8
+
9
+ describe('loadPrompts', () => {
10
+ const testPromptFile = '/tmp/acp-harness-test-prompts.jsonl'
11
+
12
+ beforeEach(async () => {
13
+ await Bun.$`rm -f ${testPromptFile}`.nothrow()
14
+ })
15
+
16
+ afterEach(async () => {
17
+ await Bun.$`rm -f ${testPromptFile}`.nothrow()
18
+ })
19
+
20
+ test('loads single-turn prompts', async () => {
21
+ await Bun.write(
22
+ testPromptFile,
23
+ `{"id": "t1", "input": "Hello"}
24
+ {"id": "t2", "input": "World"}`,
25
+ )
26
+
27
+ const prompts = await loadPrompts(testPromptFile)
28
+
29
+ expect(prompts).toHaveLength(2)
30
+ expect(prompts[0]?.id).toBe('t1')
31
+ expect(prompts[0]?.input).toBe('Hello')
32
+ expect(prompts[1]?.id).toBe('t2')
33
+ expect(prompts[1]?.input).toBe('World')
34
+ })
35
+
36
+ test('loads multi-turn prompts', async () => {
37
+ await Bun.write(testPromptFile, `{"id": "conv1", "input": ["Hi", "How are you?", "Bye"]}`)
38
+
39
+ const prompts = await loadPrompts(testPromptFile)
40
+
41
+ expect(prompts).toHaveLength(1)
42
+ expect(prompts[0]?.id).toBe('conv1')
43
+ expect(Array.isArray(prompts[0]?.input)).toBe(true)
44
+ expect(prompts[0]?.input).toEqual(['Hi', 'How are you?', 'Bye'])
45
+ })
46
+
47
+ test('loads prompts with hint field', async () => {
48
+ await Bun.write(testPromptFile, `{"id": "t1", "input": "2+2?", "hint": "4"}`)
49
+
50
+ const prompts = await loadPrompts(testPromptFile)
51
+
52
+ expect(prompts).toHaveLength(1)
53
+ expect(prompts[0]?.hint).toBe('4')
54
+ })
55
+
56
+ test('loads prompts with metadata', async () => {
57
+ await Bun.write(
58
+ testPromptFile,
59
+ `{"id": "t1", "input": "Test", "metadata": {"category": "math", "difficulty": "easy"}}`,
60
+ )
61
+
62
+ const prompts = await loadPrompts(testPromptFile)
63
+
64
+ expect(prompts).toHaveLength(1)
65
+ expect(prompts[0]?.metadata).toEqual({ category: 'math', difficulty: 'easy' })
66
+ })
67
+
68
+ test('loads prompts with timeout override', async () => {
69
+ await Bun.write(testPromptFile, `{"id": "t1", "input": "Slow task", "timeout": 120000}`)
70
+
71
+ const prompts = await loadPrompts(testPromptFile)
72
+
73
+ expect(prompts).toHaveLength(1)
74
+ expect(prompts[0]?.timeout).toBe(120000)
75
+ })
76
+
77
+ test('skips empty lines', async () => {
78
+ await Bun.write(
79
+ testPromptFile,
80
+ `{"id": "t1", "input": "First"}
81
+
82
+ {"id": "t2", "input": "Second"}
83
+ `,
84
+ )
85
+
86
+ const prompts = await loadPrompts(testPromptFile)
87
+
88
+ expect(prompts).toHaveLength(2)
89
+ })
90
+
91
+ test('throws on invalid JSON', async () => {
92
+ await Bun.write(testPromptFile, 'not valid json')
93
+
94
+ await expect(loadPrompts(testPromptFile)).rejects.toThrow()
95
+ })
96
+
97
+ test('throws on missing required fields', async () => {
98
+ await Bun.write(testPromptFile, `{"id": "t1"}`) // missing input
99
+
100
+ await expect(loadPrompts(testPromptFile)).rejects.toThrow()
101
+ })
102
+ })
103
+
104
+ // ============================================================================
105
+ // runCapture configuration
106
+ // ============================================================================
107
+
108
+ describe('runCapture configuration', () => {
109
+ test('CaptureConfig type accepts valid configuration', () => {
110
+ // Type-level test - if this compiles, the types are correct
111
+ const config: CaptureConfig = {
112
+ promptsPath: '/tmp/prompts.jsonl',
113
+ agentCommand: ['bunx', 'test-agent'],
114
+ outputPath: '/tmp/output.jsonl',
115
+ cwd: '/tmp',
116
+ timeout: 30000,
117
+ progress: true,
118
+ append: false,
119
+ }
120
+
121
+ expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
122
+ expect(config.agentCommand).toEqual(['bunx', 'test-agent'])
123
+ })
124
+
125
+ test('CaptureConfig allows minimal configuration', () => {
126
+ const config: CaptureConfig = {
127
+ promptsPath: '/tmp/prompts.jsonl',
128
+ agentCommand: ['echo', 'test'],
129
+ }
130
+
131
+ expect(config.outputPath).toBeUndefined()
132
+ expect(config.cwd).toBeUndefined()
133
+ expect(config.timeout).toBeUndefined()
134
+ expect(config.progress).toBeUndefined()
135
+ expect(config.append).toBeUndefined()
136
+ expect(config.grader).toBeUndefined()
137
+ })
138
+ })
139
+
140
+ // ============================================================================
141
+ // CLI Help Output
142
+ // ============================================================================
143
+
144
+ describe('capture CLI', () => {
145
+ test('displays help with --help flag', async () => {
146
+ const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture', '--help'], {
147
+ stdout: 'pipe',
148
+ stderr: 'pipe',
149
+ })
150
+
151
+ const stdout = await new Response(proc.stdout).text()
152
+ await proc.exited
153
+
154
+ expect(stdout).toContain('Usage: acp-harness capture')
155
+ expect(stdout).toContain('prompts.jsonl')
156
+ expect(stdout).toContain('-o, --output')
157
+ expect(stdout).toContain('-c, --cwd')
158
+ expect(stdout).toContain('-t, --timeout')
159
+ expect(stdout).toContain('--progress')
160
+ expect(stdout).toContain('-g, --grader')
161
+ })
162
+
163
+ test('shows error for missing prompts file argument', async () => {
164
+ const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture'], {
165
+ stdout: 'pipe',
166
+ stderr: 'pipe',
167
+ })
168
+
169
+ const stderr = await new Response(proc.stderr).text()
170
+ const exitCode = await proc.exited
171
+
172
+ expect(exitCode).not.toBe(0)
173
+ expect(stderr).toContain('prompts.jsonl path is required')
174
+ })
175
+
176
+ test('shows error for missing agent command', async () => {
177
+ const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl'], {
178
+ stdout: 'pipe',
179
+ stderr: 'pipe',
180
+ })
181
+
182
+ const stderr = await new Response(proc.stderr).text()
183
+ const exitCode = await proc.exited
184
+
185
+ expect(exitCode).not.toBe(0)
186
+ expect(stderr).toContain('ACP agent command is required')
187
+ })
188
+ })