npm - @plaited/acp-harness - Versions diffs - 0.3.2 → 0.4.0 - Mend

@plaited/acp-harness 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

package/README.md +53 -31
package/bin/cli.ts +15 -0
package/package.json +5 -7
package/src/acp-client.ts +7 -4
package/src/adapter-check.ts +0 -1
package/src/adapter-scaffold.ts +16 -15
package/src/calibrate.ts +28 -8
package/src/capture.ts +114 -33
package/src/grader-loader.ts +3 -3
package/src/harness.ts +4 -0
package/src/headless-cli.ts +433 -0
package/src/headless-history-builder.ts +141 -0
package/src/headless-output-parser.ts +251 -0
package/src/headless-session-manager.ts +389 -0
package/src/headless.schemas.ts +241 -0
package/src/headless.ts +71 -0
package/src/headless.types.ts +19 -0
package/src/integration_tests/acp-claude.spec.ts +170 -0
package/src/integration_tests/acp-gemini.spec.ts +174 -0
package/src/schemas.ts +88 -36
package/src/summarize.ts +4 -8
package/src/tests/acp-client.spec.ts +1 -1
package/src/tests/capture-cli.spec.ts +188 -0
package/src/tests/capture-helpers.spec.ts +229 -67
package/src/tests/constants.spec.ts +121 -0
package/src/tests/fixtures/grader-exec.py +3 -3
package/src/tests/fixtures/grader-module.ts +2 -2
package/src/tests/grader-loader.spec.ts +5 -5
package/src/tests/headless.spec.ts +460 -0
package/src/tests/schemas-cli.spec.ts +142 -0
package/src/tests/schemas.spec.ts +657 -0
package/src/tests/summarize-helpers.spec.ts +3 -3
package/src/tests/trials-cli.spec.ts +145 -0
package/src/trials.ts +6 -19
package/src/validate-refs.ts +1 -1
package/src/tests/acp-integration.docker.ts +0 -214

package/src/tests/capture-helpers.spec.ts CHANGED Viewed

@@ -1,9 +1,11 @@
 import { describe, expect, test } from 'bun:test'
 import type { SessionNotification } from '@agentclientprotocol/sdk'
 import {
+  detectTrajectoryRichness,
   extractContent,
   extractFilePath,
   extractOutput,
+  extractTokenCounts,
   extractTrajectory,
   hasToolErrors,
   headTailPreview,
@@ -16,13 +18,13 @@ import type { TrajectoryStep } from '../schemas.ts'
 // ============================================================================
 describe('loadPrompts', () => {
-  test('parses valid JSONL file', async () => {
+  test('parses valid JSONL file with string input', async () => {
     // Create a temporary test file
     const testPath = '/tmp/test-prompts-valid.jsonl'
     await Bun.write(
       testPath,
       `{"id": "test-1", "input": "What is 2+2?"}
-{"id": "test-2", "input": "Hello world", "expected": "greeting"}`,
+{"id": "test-2", "input": "Hello world", "hint": "greeting"}`,
     )
     const prompts = await loadPrompts(testPath)
@@ -31,7 +33,20 @@ describe('loadPrompts', () => {
     expect(prompts[0]?.id).toBe('test-1')
     expect(prompts[0]?.input).toBe('What is 2+2?')
     expect(prompts[1]?.id).toBe('test-2')
-    expect(prompts[1]?.expected).toBe('greeting')
+    expect(prompts[1]?.hint).toBe('greeting')
+  })
+  test('parses multi-turn input (string array)', async () => {
+    const testPath = '/tmp/test-prompts-multiturn.jsonl'
+    await Bun.write(testPath, `{"id": "test-1", "input": ["Hello", "How are you?", "Goodbye"], "hint": "farewell"}`)
+    const prompts = await loadPrompts(testPath)
+    expect(prompts).toHaveLength(1)
+    expect(prompts[0]?.id).toBe('test-1')
+    expect(Array.isArray(prompts[0]?.input)).toBe(true)
+    expect(prompts[0]?.input).toEqual(['Hello', 'How are you?', 'Goodbye'])
+    expect(prompts[0]?.hint).toBe('farewell')
   })
   test('parses prompts with metadata', async () => {
@@ -104,9 +119,9 @@ describe('extractTrajectory', () => {
     expect(trajectory).toHaveLength(1)
     expect(trajectory[0]?.type).toBe('thought')
-    if (trajectory[0]?.type === 'thought') {
-      expect(trajectory[0].content).toBe('Let me think about this...')
-    }
+    // Type narrowing after explicit assertion
+    const step = trajectory[0]!
+    expect(step.type === 'thought' && step.content).toBe('Let me think about this...')
   })
   test('extracts messages from agent_message_chunk notifications', () => {
@@ -124,9 +139,9 @@ describe('extractTrajectory', () => {
     expect(trajectory).toHaveLength(1)
     expect(trajectory[0]?.type).toBe('message')
-    if (trajectory[0]?.type === 'message') {
-      expect(trajectory[0].content).toBe('Here is my answer.')
-    }
+    // Type narrowing after explicit assertion
+    const step = trajectory[0]!
+    expect(step.type === 'message' && step.content).toBe('Here is my answer.')
   })
   test('extracts tool calls with initial pending status', () => {
@@ -147,11 +162,11 @@ describe('extractTrajectory', () => {
     expect(trajectory).toHaveLength(1)
     expect(trajectory[0]?.type).toBe('tool_call')
-    if (trajectory[0]?.type === 'tool_call') {
-      expect(trajectory[0].name).toBe('Read')
-      expect(trajectory[0].status).toBe('pending')
-      expect(trajectory[0].input).toBe('{"file_path": "/test.ts"}')
-    }
+    // Type narrowing after explicit assertion
+    const step = trajectory[0]!
+    expect(step.type === 'tool_call' && step.name).toBe('Read')
+    expect(step.type === 'tool_call' && step.status).toBe('pending')
+    expect(step.type === 'tool_call' && step.input).toBe('{"file_path": "/test.ts"}')
   })
   test('updates tool call status on subsequent notifications', () => {
@@ -181,10 +196,11 @@ describe('extractTrajectory', () => {
     // Should still be 1 entry, just updated
     expect(trajectory).toHaveLength(1)
-    if (trajectory[0]?.type === 'tool_call') {
-      expect(trajectory[0].status).toBe('completed')
-      expect(trajectory[0].output).toBe('file contents here')
-    }
+    expect(trajectory[0]?.type).toBe('tool_call')
+    // Type narrowing after explicit assertion
+    const step = trajectory[0]!
+    expect(step.type === 'tool_call' && step.status).toBe('completed')
+    expect(step.type === 'tool_call' && step.output).toBe('file contents here')
   })
   test('tracks multiple independent tool calls', () => {
@@ -202,8 +218,13 @@ describe('extractTrajectory', () => {
     const trajectory = extractTrajectory(notifications, baseTime)
     expect(trajectory).toHaveLength(2)
-    expect(trajectory[0]?.type === 'tool_call' && trajectory[0].name).toBe('Read')
-    expect(trajectory[1]?.type === 'tool_call' && trajectory[1].name).toBe('Write')
+    expect(trajectory[0]?.type).toBe('tool_call')
+    expect(trajectory[1]?.type).toBe('tool_call')
+    // Type narrowing after explicit assertions
+    const step0 = trajectory[0]!
+    const step1 = trajectory[1]!
+    expect(step0.type === 'tool_call' && step0.name).toBe('Read')
+    expect(step1.type === 'tool_call' && step1.name).toBe('Write')
   })
   test('extracts plan entries', () => {
@@ -224,9 +245,9 @@ describe('extractTrajectory', () => {
     expect(trajectory).toHaveLength(1)
     expect(trajectory[0]?.type).toBe('plan')
-    if (trajectory[0]?.type === 'plan') {
-      expect(trajectory[0].entries).toHaveLength(2)
-    }
+    // Type narrowing after explicit assertion
+    const step = trajectory[0]!
+    expect(step.type === 'plan' && step.entries).toHaveLength(2)
   })
   test('handles empty notifications', () => {
@@ -237,69 +258,72 @@ describe('extractTrajectory', () => {
   test('assigns timestamps relative to start time', () => {
     // Mock Date.now to control timestamps
     const originalNow = Date.now
-    let currentTime = 1000
+    try {
+      let currentTime = 1000
-    Date.now = () => currentTime
+      Date.now = () => currentTime
-    const notifications: SessionNotification[] = [
-      {
-        sessionId: 's1',
-        update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'First' } },
-      },
-    ]
-    const startTime = 1000
-    currentTime = 1500 // 500ms later
+      const notifications: SessionNotification[] = [
+        {
+          sessionId: 's1',
+          update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'First' } },
+        },
+      ]
-    const trajectory = extractTrajectory(notifications, startTime)
+      const startTime = 1000
+      currentTime = 1500 // 500ms later
-    expect(trajectory[0]?.timestamp).toBe(500)
+      const trajectory = extractTrajectory(notifications, startTime)
-    // Restore
-    Date.now = originalNow
+      expect(trajectory[0]?.timestamp).toBe(500)
+    } finally {
+      Date.now = originalNow
+    }
   })
   test('calculates tool call duration correctly', () => {
     const originalNow = Date.now
-    let currentTime = 1000
+    try {
+      let currentTime = 1000
-    Date.now = () => currentTime
+      Date.now = () => currentTime
-    const startTime = 1000
+      const startTime = 1000
-    // Simulate time passing between notifications
-    // First notification at t=100 (currentTime = 1100)
-    // Second notification at t=600 (currentTime = 1600)
-    const notifications: SessionNotification[] = []
+      // Simulate time passing between notifications
+      // First notification at t=100 (currentTime = 1100)
+      // Second notification at t=600 (currentTime = 1600)
+      const notifications: SessionNotification[] = []
-    currentTime = 1100 // First call at 100ms relative to start
-    notifications.push({
-      sessionId: 's1',
-      update: { sessionUpdate: 'tool_call', toolCallId: 't1', title: 'Bash', status: 'pending' },
-    })
+      currentTime = 1100 // First call at 100ms relative to start
+      notifications.push({
+        sessionId: 's1',
+        update: { sessionUpdate: 'tool_call', toolCallId: 't1', title: 'Bash', status: 'pending' },
+      })
-    currentTime = 1600 // Second call at 600ms relative to start
-    notifications.push({
-      sessionId: 's1',
-      update: { sessionUpdate: 'tool_call', toolCallId: 't1', title: 'Bash', status: 'completed' },
-    })
+      currentTime = 1600 // Second call at 600ms relative to start
+      notifications.push({
+        sessionId: 's1',
+        update: { sessionUpdate: 'tool_call', toolCallId: 't1', title: 'Bash', status: 'completed' },
+      })
-    // Now process all notifications in one call
-    // But the issue is extractTrajectory calls Date.now() for each notification
-    // so we need to mock it to return different values for each call
+      // Now process all notifications in one call
+      // But the issue is extractTrajectory calls Date.now() for each notification
+      // so we need to mock it to return different values for each call
-    let callCount = 0
-    const times = [1100, 1600]
-    Date.now = () => times[callCount++] ?? 1600
+      let callCount = 0
+      const times = [1100, 1600]
+      Date.now = () => times[callCount++] ?? 1600
-    const trajectory = extractTrajectory(notifications, startTime)
+      const trajectory = extractTrajectory(notifications, startTime)
-    if (trajectory[0]?.type === 'tool_call') {
-      // Duration should be 500ms (600 - 100)
-      expect(trajectory[0].duration).toBe(500)
+      expect(trajectory[0]?.type).toBe('tool_call')
+      // Type narrowing after explicit assertion - Duration should be 500ms (600 - 100)
+      const step = trajectory[0]!
+      expect(step.type === 'tool_call' && step.duration).toBe(500)
+    } finally {
+      Date.now = originalNow
     }
-    Date.now = originalNow
   })
   test('ignores non-text content in thought chunks', () => {
@@ -551,3 +575,141 @@ describe('extractContent', () => {
     expect(extractContent(input)).toBe('line1\nline2\nline3')
   })
 })
+// ============================================================================
+// detectTrajectoryRichness
+// ============================================================================
+describe('detectTrajectoryRichness', () => {
+  test('returns "full" when trajectory has thoughts', () => {
+    const trajectory: TrajectoryStep[] = [
+      { type: 'thought', content: 'Let me think...', timestamp: 0 },
+      { type: 'message', content: 'Answer', timestamp: 100 },
+    ]
+    expect(detectTrajectoryRichness(trajectory)).toBe('full')
+  })
+  test('returns "full" when trajectory has tool calls', () => {
+    const trajectory: TrajectoryStep[] = [
+      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 0 },
+      { type: 'message', content: 'Answer', timestamp: 100 },
+    ]
+    expect(detectTrajectoryRichness(trajectory)).toBe('full')
+  })
+  test('returns "full" when trajectory has plans', () => {
+    const trajectory: TrajectoryStep[] = [
+      { type: 'plan', entries: [{ content: 'Step 1', status: 'completed' }], timestamp: 0 },
+      { type: 'message', content: 'Answer', timestamp: 100 },
+    ]
+    expect(detectTrajectoryRichness(trajectory)).toBe('full')
+  })
+  test('returns "messages-only" when trajectory only has messages', () => {
+    const trajectory: TrajectoryStep[] = [
+      { type: 'message', content: 'First', timestamp: 0 },
+      { type: 'message', content: 'Second', timestamp: 100 },
+    ]
+    expect(detectTrajectoryRichness(trajectory)).toBe('messages-only')
+  })
+  test('returns "minimal" when trajectory is empty', () => {
+    expect(detectTrajectoryRichness([])).toBe('minimal')
+  })
+  test('returns "full" when trajectory has mixed rich content', () => {
+    const trajectory: TrajectoryStep[] = [
+      { type: 'thought', content: 'Thinking...', timestamp: 0 },
+      { type: 'tool_call', name: 'Read', status: 'completed', timestamp: 50 },
+      { type: 'plan', entries: [], timestamp: 100 },
+      { type: 'message', content: 'Done', timestamp: 150 },
+    ]
+    expect(detectTrajectoryRichness(trajectory)).toBe('full')
+  })
+})
+// ============================================================================
+// extractTokenCounts
+// ============================================================================
+describe('extractTokenCounts', () => {
+  test('returns undefined when no usage data present', () => {
+    const updates: SessionNotification[] = [
+      {
+        sessionId: 's1',
+        update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'Hello' } },
+      },
+    ]
+    const result = extractTokenCounts(updates)
+    expect(result.inputTokens).toBeUndefined()
+    expect(result.outputTokens).toBeUndefined()
+  })
+  test('extracts token counts from usage field when present', () => {
+    const updates: SessionNotification[] = [
+      {
+        sessionId: 's1',
+        update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'Hello' } },
+        // @ts-expect-error - SessionNotification type doesn't include 'usage' field, but adapters like Claude Code add it at runtime
+        usage: { inputTokens: 50, outputTokens: 30 },
+      },
+    ]
+    const result = extractTokenCounts(updates)
+    expect(result.inputTokens).toBe(50)
+    expect(result.outputTokens).toBe(30)
+  })
+  test('accumulates token counts across multiple updates', () => {
+    const updates: SessionNotification[] = [
+      {
+        sessionId: 's1',
+        update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'First' } },
+        // @ts-expect-error - SessionNotification type doesn't include 'usage' field, but adapters like Claude Code add it at runtime
+        usage: { inputTokens: 50, outputTokens: 30 },
+      },
+      {
+        sessionId: 's1',
+        update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'Second' } },
+        // @ts-expect-error - SessionNotification type doesn't include 'usage' field, but adapters like Claude Code add it at runtime
+        usage: { inputTokens: 25, outputTokens: 45 },
+      },
+    ]
+    const result = extractTokenCounts(updates)
+    expect(result.inputTokens).toBe(75) // 50 + 25
+    expect(result.outputTokens).toBe(75) // 30 + 45
+  })
+  test('handles empty updates array', () => {
+    const result = extractTokenCounts([])
+    expect(result.inputTokens).toBeUndefined()
+    expect(result.outputTokens).toBeUndefined()
+  })
+  test('handles partial token counts (only input or output)', () => {
+    const updates: SessionNotification[] = [
+      {
+        sessionId: 's1',
+        update: { sessionUpdate: 'agent_message_chunk', content: { type: 'text', text: 'Hello' } },
+        // @ts-expect-error - SessionNotification type doesn't include 'usage' field, but adapters like Claude Code add it at runtime
+        usage: { inputTokens: 100 },
+      },
+    ]
+    const result = extractTokenCounts(updates)
+    expect(result.inputTokens).toBe(100)
+    expect(result.outputTokens).toBeUndefined()
+  })
+})

package/src/tests/constants.spec.ts ADDED Viewed

@@ -0,0 +1,121 @@
+import { describe, expect, test } from 'bun:test'
+import {
+  ACP_METHODS,
+  ACP_PROTOCOL_VERSION,
+  DEFAULT_ACP_CLIENT_NAME,
+  DEFAULT_ACP_TIMEOUT,
+  DEFAULT_CALIBRATION_SAMPLE_SIZE,
+  DEFAULT_HARNESS_TIMEOUT,
+  DEFAULT_POLLING_INTERVAL,
+  DEFAULT_TRIAL_COUNT,
+  HEAD_LINES,
+  JSON_RPC_ERRORS,
+  MAX_CONTENT_LENGTH,
+  TAIL_LINES,
+} from '../constants.ts'
+// ============================================================================
+// ACP Protocol Constants
+// ============================================================================
+describe('ACP_METHODS', () => {
+  test('contains all required lifecycle methods', () => {
+    expect(ACP_METHODS.INITIALIZE).toBe('initialize')
+    expect(ACP_METHODS.SHUTDOWN).toBe('shutdown')
+  })
+  test('contains all required session methods', () => {
+    expect(ACP_METHODS.CREATE_SESSION).toBe('session/new')
+    expect(ACP_METHODS.LOAD_SESSION).toBe('session/load')
+    expect(ACP_METHODS.PROMPT).toBe('session/prompt')
+    expect(ACP_METHODS.CANCEL).toBe('session/cancel')
+    expect(ACP_METHODS.UPDATE).toBe('session/update')
+    expect(ACP_METHODS.REQUEST_PERMISSION).toBe('session/request_permission')
+    expect(ACP_METHODS.SET_MODEL).toBe('session/set_model')
+  })
+  test('contains protocol-level methods', () => {
+    expect(ACP_METHODS.CANCEL_REQUEST).toBe('$/cancel_request')
+  })
+})
+describe('ACP_PROTOCOL_VERSION', () => {
+  test('is version 1', () => {
+    expect(ACP_PROTOCOL_VERSION).toBe(1)
+  })
+})
+// ============================================================================
+// JSON-RPC Error Codes
+// ============================================================================
+describe('JSON_RPC_ERRORS', () => {
+  test('contains standard JSON-RPC error codes', () => {
+    expect(JSON_RPC_ERRORS.PARSE_ERROR).toBe(-32700)
+    expect(JSON_RPC_ERRORS.INVALID_REQUEST).toBe(-32600)
+    expect(JSON_RPC_ERRORS.METHOD_NOT_FOUND).toBe(-32601)
+    expect(JSON_RPC_ERRORS.INVALID_PARAMS).toBe(-32602)
+    expect(JSON_RPC_ERRORS.INTERNAL_ERROR).toBe(-32603)
+  })
+  test('contains ACP extension error codes', () => {
+    expect(JSON_RPC_ERRORS.REQUEST_CANCELLED).toBe(-32800)
+  })
+})
+// ============================================================================
+// ACP Client Defaults
+// ============================================================================
+describe('ACP Client defaults', () => {
+  test('DEFAULT_ACP_CLIENT_NAME is set', () => {
+    expect(DEFAULT_ACP_CLIENT_NAME).toBe('plaited-acp-client')
+  })
+  test('DEFAULT_ACP_TIMEOUT is 30 seconds', () => {
+    expect(DEFAULT_ACP_TIMEOUT).toBe(30000)
+  })
+  test('DEFAULT_POLLING_INTERVAL is 50ms', () => {
+    expect(DEFAULT_POLLING_INTERVAL).toBe(50)
+  })
+})
+// ============================================================================
+// Harness Preview Configuration
+// ============================================================================
+describe('Preview configuration', () => {
+  test('HEAD_LINES is positive', () => {
+    expect(HEAD_LINES).toBeGreaterThan(0)
+    expect(HEAD_LINES).toBe(8)
+  })
+  test('TAIL_LINES is positive', () => {
+    expect(TAIL_LINES).toBeGreaterThan(0)
+    expect(TAIL_LINES).toBe(4)
+  })
+  test('MAX_CONTENT_LENGTH is reasonable', () => {
+    expect(MAX_CONTENT_LENGTH).toBeGreaterThan(0)
+    expect(MAX_CONTENT_LENGTH).toBe(500)
+  })
+})
+// ============================================================================
+// Harness Defaults
+// ============================================================================
+describe('Harness defaults', () => {
+  test('DEFAULT_HARNESS_TIMEOUT is 60 seconds', () => {
+    expect(DEFAULT_HARNESS_TIMEOUT).toBe(60000)
+  })
+  test('DEFAULT_TRIAL_COUNT is 5', () => {
+    expect(DEFAULT_TRIAL_COUNT).toBe(5)
+  })
+  test('DEFAULT_CALIBRATION_SAMPLE_SIZE is 10', () => {
+    expect(DEFAULT_CALIBRATION_SAMPLE_SIZE).toBe(10)
+  })
+})

package/src/tests/fixtures/grader-exec.py CHANGED Viewed

@@ -10,10 +10,10 @@ def main():
     data = json.load(sys.stdin)
     output = data.get("output", "").lower()
-    expected = (data.get("expected") or "").lower()
+    hint = (data.get("hint") or "").lower()
-    if expected:
-        pass_result = expected in output
+    if hint:
+        pass_result = hint in output
     else:
         pass_result = True

package/src/tests/fixtures/grader-module.ts CHANGED Viewed

@@ -4,8 +4,8 @@
 import type { Grader } from '../../schemas.ts'
-export const grade: Grader = async ({ input: _input, output, expected }) => {
-  const pass = expected ? output.toLowerCase().includes(expected.toLowerCase()) : true
+export const grade: Grader = async ({ input: _input, output, hint }) => {
+  const pass = hint ? output.toLowerCase().includes(hint.toLowerCase()) : true
   return {
     pass,
     score: pass ? 1.0 : 0.0,

package/src/tests/grader-loader.spec.ts CHANGED Viewed

@@ -15,7 +15,7 @@ describe('loadGrader - module graders', () => {
     const result = await grader({
       input: 'What is 2+2?',
       output: 'The answer is 4',
-      expected: '4',
+      hint: '4',
     })
     expect(result.pass).toBe(true)
@@ -45,7 +45,7 @@ describe('loadGrader - executable graders', () => {
     const result = await grader({
       input: 'What is 2+2?',
       output: 'The answer is 4',
-      expected: '4',
+      hint: '4',
     })
     expect(result.pass).toBe(true)
@@ -59,7 +59,7 @@ describe('loadGrader - executable graders', () => {
     const result = await grader({
       input: 'What is 2+2?',
       output: 'I do not know',
-      expected: '4',
+      hint: '4',
     })
     expect(result.pass).toBe(false)
@@ -126,7 +126,7 @@ describe('loadGrader - trajectory support', () => {
     const result = await grader({
       input: 'test',
       output: 'The answer is 4',
-      expected: '4',
+      hint: '4',
       trajectory,
     })
@@ -144,7 +144,7 @@ describe('loadGrader - trajectory support', () => {
     const result = await grader({
       input: 'test',
       output: 'The answer is 4',
-      expected: '4',
+      hint: '4',
       trajectory,
     })