npm - @plaited/acp-harness - Versions diffs - 0.3.2 → 0.4.0 - Mend

@plaited/acp-harness 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

package/README.md +53 -31
package/bin/cli.ts +15 -0
package/package.json +5 -7
package/src/acp-client.ts +7 -4
package/src/adapter-check.ts +0 -1
package/src/adapter-scaffold.ts +16 -15
package/src/calibrate.ts +28 -8
package/src/capture.ts +114 -33
package/src/grader-loader.ts +3 -3
package/src/harness.ts +4 -0
package/src/headless-cli.ts +433 -0
package/src/headless-history-builder.ts +141 -0
package/src/headless-output-parser.ts +251 -0
package/src/headless-session-manager.ts +389 -0
package/src/headless.schemas.ts +241 -0
package/src/headless.ts +71 -0
package/src/headless.types.ts +19 -0
package/src/integration_tests/acp-claude.spec.ts +170 -0
package/src/integration_tests/acp-gemini.spec.ts +174 -0
package/src/schemas.ts +88 -36
package/src/summarize.ts +4 -8
package/src/tests/acp-client.spec.ts +1 -1
package/src/tests/capture-cli.spec.ts +188 -0
package/src/tests/capture-helpers.spec.ts +229 -67
package/src/tests/constants.spec.ts +121 -0
package/src/tests/fixtures/grader-exec.py +3 -3
package/src/tests/fixtures/grader-module.ts +2 -2
package/src/tests/grader-loader.spec.ts +5 -5
package/src/tests/headless.spec.ts +460 -0
package/src/tests/schemas-cli.spec.ts +142 -0
package/src/tests/schemas.spec.ts +657 -0
package/src/tests/summarize-helpers.spec.ts +3 -3
package/src/tests/trials-cli.spec.ts +145 -0
package/src/trials.ts +6 -19
package/src/validate-refs.ts +1 -1
package/src/tests/acp-integration.docker.ts +0 -214

package/src/schemas.ts CHANGED

@@ -222,14 +222,16 @@ export type McpServerConfig = z.infer<typeof McpServerSchema>
  *
  * @remarks
  * Each line in a prompts.jsonl file should match this schema.
+ * - Single turn: `input: "Hello"` - one prompt, one session
+ * - Multi-turn: `input: ["Hello", "How are you?", "Goodbye"]` - sequential turns in one session
  */
 export const PromptCaseSchema = z.object({
   /** Unique identifier for the test case */
   id: z.string(),
-  /** The prompt text to send to the agent */
-  input: z.string(),
-  /** Optional expected output for grading */
-  expected: z.string().optional(),
+  /** Prompt text(s) - string for single turn, array for multi-turn conversation */
+  input: z.union([z.string(), z.array(z.string())]),
+  /** Optional grader context hint (not a strict expected match) */
+  hint: z.string().optional(),
   /** Optional reference solution for validation */
   reference: z.string().optional(),
   /** Optional metadata for categorization and analysis */
@@ -268,25 +270,13 @@ export type GraderResult = z.infer<typeof GraderResultSchema>
  *
  * @remarks
  * User-provided graders implement this interface to score agent outputs.
- *
- * @example
- * ```typescript
- * import type { Grader } from '@plaited/acp-harness/schemas'
- *
- * export const grade: Grader = async ({ input, output, expected, trajectory }) => {
- *   const pass = output.toLowerCase().includes(expected?.toLowerCase() ?? '')
- *   return {
- *     pass,
- *     score: pass ? 1 : 0,
- *     reasoning: pass ? 'Contains expected answer' : 'Missing expected answer'
- *   }
- * }
- * ```
+ * - `input` is the original prompt (string or array for multi-turn)
+ * - `hint` provides grader context (renamed from `expected`)
  */
 export type Grader = (params: {
-  input: string
+  input: string | string[]
   output: string
-  expected?: string
+  hint?: string
   trajectory?: TrajectoryStep[]
 }) => Promise<GraderResult>
@@ -307,6 +297,24 @@ export const ToolInputSchema = z
 /** Tool input type */
 export type ToolInput = z.infer<typeof ToolInputSchema>
+/**
+ * Token usage schema for adapter-specific usage data.
+ *
+ * @remarks
+ * ACP SDK's SessionNotification doesn't declare a 'usage' field, but adapters
+ * like Claude Code extend responses with token counts at runtime. This schema
+ * provides runtime validation for that extension.
+ */
+export const TokenUsageSchema = z
+  .object({
+    inputTokens: z.number().optional(),
+    outputTokens: z.number().optional(),
+  })
+  .passthrough()
+/** Token usage type */
+export type TokenUsage = z.infer<typeof TokenUsageSchema>
 /** Thought trajectory step */
 export const ThoughtStepSchema = z.object({
   type: z.literal('thought'),
@@ -366,36 +374,80 @@ export type IndexedStep = TrajectoryStep & { stepId: string }
 // Capture Result Schemas
 // ============================================================================
-/** Timing information for a capture result */
+/**
+ * Timing information for a capture result.
+ *
+ * @remarks
+ * Captures both absolute timestamps and derived durations for analysis:
+ * - `sessionCreation`: Time to initialize session (agent startup overhead)
+ * - `total`: End-to-end duration including all turns
+ * - `firstResponse`: Latency to first agent output (optional)
+ *
+ * Token counts are adapter-dependent and only present if the adapter
+ * exposes usage information (e.g., Claude Code includes them, others may not).
+ *
+ * @public
+ */
 export const TimingSchema = z.object({
+  /** Epoch timestamp when capture started */
   start: z.number(),
+  /** Epoch timestamp when capture ended */
   end: z.number(),
+  /** Time to first response (ms from start) */
   firstResponse: z.number().optional(),
+  /** Time to create session (ms) - measures agent initialization overhead */
+  sessionCreation: z.number(),
+  /** Total duration (end - start) in milliseconds */
+  total: z.number(),
+  /** Input tokens consumed (if available from ACP adapter) */
+  inputTokens: z.number().optional(),
+  /** Output tokens generated (if available from ACP adapter) */
+  outputTokens: z.number().optional(),
 })
-/** Timing information type */
+/**
+ * Timing information type inferred from TimingSchema.
+ *
+ * @public
+ */
 export type Timing = z.infer<typeof TimingSchema>
+/**
+ * Trajectory richness level indicating the depth of captured agent activity.
+ *
+ * @remarks
+ * Different adapters provide varying levels of detail:
+ * - `full`: Thoughts, tool calls, plans (e.g., Claude Code adapter)
+ * - `minimal`: Basic output only (e.g., Droid adapter)
+ * - `messages-only`: Messages without internal reasoning
+ */
+export const TrajectoryRichnessSchema = z.enum(['full', 'minimal', 'messages-only'])
+/** Trajectory richness type */
+export type TrajectoryRichness = z.infer<typeof TrajectoryRichnessSchema>
 /**
  * Capture result schema.
  *
  * @remarks
  * Full trajectory output from the `capture` command.
- * The `toolErrors` field replaces the misleading `status: 'passed'|'failed'`.
+ * - `input` can be string (single turn) or string[] (multi-turn)
+ * - `hint` provides grader context (renamed from `expected`)
+ * - `toolErrors` replaces misleading `status: 'passed'|'failed'`
  * Real pass/fail determination comes from your grader.
  */
 export const CaptureResultSchema = z.object({
   /** Test case identifier */
   id: z.string(),
-  /** Original prompt input */
-  input: z.string(),
+  /** Original prompt input (string for single turn, array for multi-turn) */
+  input: z.union([z.string(), z.array(z.string())]),
   /** Final agent output */
   output: z.string(),
-  /** Expected output (if provided) */
-  expected: z.string().optional(),
+  /** Grader context hint (renamed from expected) */
+  hint: z.string().optional(),
   /** Full execution trajectory */
   trajectory: z.array(TrajectoryStepSchema),
-  /** Metadata including category, agent info, etc. */
+  /** Metadata including category, agent info, trajectoryRichness, turnCount */
   metadata: z.record(z.string(), z.unknown()),
   /** Timing information */
   timing: TimingSchema,
@@ -471,10 +523,10 @@ export type TrialEntry = z.infer<typeof TrialEntrySchema>
 export const TrialResultSchema = z.object({
   /** Test case identifier */
   id: z.string(),
-  /** Original prompt input */
-  input: z.string(),
-  /** Expected output (if provided) */
-  expected: z.string().optional(),
+  /** Original prompt input (string for single turn, array for multi-turn) */
+  input: z.union([z.string(), z.array(z.string())]),
+  /** Grader context hint (renamed from expected) */
+  hint: z.string().optional(),
   /** Number of trials (k) */
   k: z.number(),
   /** Simple pass rate: passes / k (with grader only) */
@@ -498,12 +550,12 @@ export type TrialResult = z.infer<typeof TrialResultSchema>
 export const CalibrationSampleSchema = z.object({
   /** Test case identifier */
   id: z.string(),
-  /** Original prompt input */
-  input: z.string(),
+  /** Original prompt input (string for single turn, array for multi-turn) */
+  input: z.union([z.string(), z.array(z.string())]),
   /** Agent output */
   output: z.string(),
-  /** Expected output (if provided) */
-  expected: z.string().optional(),
+  /** Grader context hint (renamed from expected) */
+  hint: z.string().optional(),
   /** Original grader score */
   originalScore: GraderResultSchema,
   /** Re-scored result (if different grader provided) */

package/src/summarize.ts CHANGED

@@ -64,9 +64,10 @@ const loadResults = async (path: string): Promise<CaptureResult[]> => {
  * @public
  */
 export const formatSummary = (result: CaptureResult): SummaryResult => {
+  const inputText = Array.isArray(result.input) ? result.input.join('\n') : result.input
   return {
     id: result.id,
-    input: result.input,
+    input: inputText,
     output: result.output,
     toolCalls: result.trajectory.filter((s) => s.type === 'tool_call').map((s) => (s as { name: string }).name),
     duration: result.timing.end - result.timing.start,
@@ -82,13 +83,8 @@ export const formatSummary = (result: CaptureResult): SummaryResult => {
  * @public
  */
 export const formatMarkdown = (result: CaptureResult): string => {
-  const lines: string[] = [
-    `## Evaluation Record: ${result.id}`,
-    '',
-    `**Input:** ${result.input}`,
-    '',
-    '**Trajectory:**',
-  ]
+  const inputText = Array.isArray(result.input) ? result.input.join('\n') : result.input
+  const lines: string[] = [`## Evaluation Record: ${result.id}`, '', `**Input:** ${inputText}`, '', '**Trajectory:**']
   let stepNum = 1
   for (const step of result.trajectory) {

package/src/tests/acp-client.spec.ts CHANGED

@@ -102,7 +102,7 @@ describe('Operations before connection', () => {
       command: ['echo', 'test'],
     })
-    await expect(client.createSession({ cwd: '/tmp', mcpServers: [] })).rejects.toThrow('Not connected')
+    await expect(client.createSession({ cwd: '/tmp' })).rejects.toThrow('Not connected')
   })
   test('promptSync throws when not connected', async () => {

package/src/tests/capture-cli.spec.ts ADDED

@@ -0,0 +1,188 @@
+import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
+import type { CaptureConfig } from '../capture.ts'
+import { loadPrompts } from '../capture.ts'
+// ============================================================================
+// loadPrompts
+// ============================================================================
+describe('loadPrompts', () => {
+  const testPromptFile = '/tmp/acp-harness-test-prompts.jsonl'
+  beforeEach(async () => {
+    await Bun.$`rm -f ${testPromptFile}`.nothrow()
+  })
+  afterEach(async () => {
+    await Bun.$`rm -f ${testPromptFile}`.nothrow()
+  })
+  test('loads single-turn prompts', async () => {
+    await Bun.write(
+      testPromptFile,
+      `{"id": "t1", "input": "Hello"}
+{"id": "t2", "input": "World"}`,
+    )
+    const prompts = await loadPrompts(testPromptFile)
+    expect(prompts).toHaveLength(2)
+    expect(prompts[0]?.id).toBe('t1')
+    expect(prompts[0]?.input).toBe('Hello')
+    expect(prompts[1]?.id).toBe('t2')
+    expect(prompts[1]?.input).toBe('World')
+  })
+  test('loads multi-turn prompts', async () => {
+    await Bun.write(testPromptFile, `{"id": "conv1", "input": ["Hi", "How are you?", "Bye"]}`)
+    const prompts = await loadPrompts(testPromptFile)
+    expect(prompts).toHaveLength(1)
+    expect(prompts[0]?.id).toBe('conv1')
+    expect(Array.isArray(prompts[0]?.input)).toBe(true)
+    expect(prompts[0]?.input).toEqual(['Hi', 'How are you?', 'Bye'])
+  })
+  test('loads prompts with hint field', async () => {
+    await Bun.write(testPromptFile, `{"id": "t1", "input": "2+2?", "hint": "4"}`)
+    const prompts = await loadPrompts(testPromptFile)
+    expect(prompts).toHaveLength(1)
+    expect(prompts[0]?.hint).toBe('4')
+  })
+  test('loads prompts with metadata', async () => {
+    await Bun.write(
+      testPromptFile,
+      `{"id": "t1", "input": "Test", "metadata": {"category": "math", "difficulty": "easy"}}`,
+    )
+    const prompts = await loadPrompts(testPromptFile)
+    expect(prompts).toHaveLength(1)
+    expect(prompts[0]?.metadata).toEqual({ category: 'math', difficulty: 'easy' })
+  })
+  test('loads prompts with timeout override', async () => {
+    await Bun.write(testPromptFile, `{"id": "t1", "input": "Slow task", "timeout": 120000}`)
+    const prompts = await loadPrompts(testPromptFile)
+    expect(prompts).toHaveLength(1)
+    expect(prompts[0]?.timeout).toBe(120000)
+  })
+  test('skips empty lines', async () => {
+    await Bun.write(
+      testPromptFile,
+      `{"id": "t1", "input": "First"}
+{"id": "t2", "input": "Second"}
+`,
+    )
+    const prompts = await loadPrompts(testPromptFile)
+    expect(prompts).toHaveLength(2)
+  })
+  test('throws on invalid JSON', async () => {
+    await Bun.write(testPromptFile, 'not valid json')
+    await expect(loadPrompts(testPromptFile)).rejects.toThrow()
+  })
+  test('throws on missing required fields', async () => {
+    await Bun.write(testPromptFile, `{"id": "t1"}`) // missing input
+    await expect(loadPrompts(testPromptFile)).rejects.toThrow()
+  })
+})
+// ============================================================================
+// runCapture configuration
+// ============================================================================
+describe('runCapture configuration', () => {
+  test('CaptureConfig type accepts valid configuration', () => {
+    // Type-level test - if this compiles, the types are correct
+    const config: CaptureConfig = {
+      promptsPath: '/tmp/prompts.jsonl',
+      agentCommand: ['bunx', 'test-agent'],
+      outputPath: '/tmp/output.jsonl',
+      cwd: '/tmp',
+      timeout: 30000,
+      progress: true,
+      append: false,
+    }
+    expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
+    expect(config.agentCommand).toEqual(['bunx', 'test-agent'])
+  })
+  test('CaptureConfig allows minimal configuration', () => {
+    const config: CaptureConfig = {
+      promptsPath: '/tmp/prompts.jsonl',
+      agentCommand: ['echo', 'test'],
+    }
+    expect(config.outputPath).toBeUndefined()
+    expect(config.cwd).toBeUndefined()
+    expect(config.timeout).toBeUndefined()
+    expect(config.progress).toBeUndefined()
+    expect(config.append).toBeUndefined()
+    expect(config.grader).toBeUndefined()
+  })
+})
+// ============================================================================
+// CLI Help Output
+// ============================================================================
+describe('capture CLI', () => {
+  test('displays help with --help flag', async () => {
+    const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture', '--help'], {
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    const stdout = await new Response(proc.stdout).text()
+    await proc.exited
+    expect(stdout).toContain('Usage: acp-harness capture')
+    expect(stdout).toContain('prompts.jsonl')
+    expect(stdout).toContain('-o, --output')
+    expect(stdout).toContain('-c, --cwd')
+    expect(stdout).toContain('-t, --timeout')
+    expect(stdout).toContain('--progress')
+    expect(stdout).toContain('-g, --grader')
+  })
+  test('shows error for missing prompts file argument', async () => {
+    const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture'], {
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('prompts.jsonl path is required')
+  })
+  test('shows error for missing agent command', async () => {
+    const proc = Bun.spawn(['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl'], {
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('ACP agent command is required')
+  })
+})