@plaited/acp-harness 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -4,9 +4,11 @@
4
4
  [![CI](https://github.com/plaited/acp-harness/actions/workflows/ci.yml/badge.svg)](https://github.com/plaited/acp-harness/actions/workflows/ci.yml)
5
5
  [![License: ISC](https://img.shields.io/badge/License-ISC-blue.svg)](https://opensource.org/licenses/ISC)
6
6
 
7
- CLI tool for capturing agent trajectories from ACP-compatible agents. Execute prompts, capture full trajectories (tools, thoughts, plans), and output structured JSONL for downstream scoring.
7
+ CLI tool for capturing agent trajectories from ACP-compatible agents. Execute prompts, capture full trajectories (tools, thoughts, plans), and output structured JSONL for downstream scoring. Available as both a CLI tool and as installable skills for AI coding agents.
8
8
 
9
- ## Quick Start
9
+ ## CLI Tool
10
+
11
+ Run the commands directly with `bunx`; no installation is required:
10
12
 
11
13
  ```bash
12
14
  # Run without installing
@@ -24,49 +26,98 @@ npm install -g @anthropic-ai/claude-code-acp
24
26
  export ANTHROPIC_API_KEY=sk-...
25
27
  ```
26
28
 
27
- ## Commands
28
-
29
- | Command | Purpose |
30
- |---------|---------|
31
- | `capture` | Trajectory capture (full JSONL) |
32
- | `trials` | Multi-run with pass@k metrics |
33
- | `summarize` | Derive compact views from results |
34
- | `calibrate` | Sample failures for review |
35
- | `validate-refs` | Check reference solutions |
36
- | `balance` | Analyze test set coverage |
37
- | `schemas` | Export JSON schemas |
29
+ ### Commands
38
30
 
39
- ### capture
31
+ | Command | Description |
32
+ |---------|-------------|
33
+ | `capture <prompts> <cmd>` | Trajectory capture (full JSONL) |
34
+ | `trials <prompts> <cmd>` | Multi-run with pass@k metrics |
35
+ | `summarize <results>` | Derive compact views from results |
36
+ | `calibrate <results>` | Sample failures for review |
37
+ | `validate-refs <prompts>` | Check reference solutions |
38
+ | `balance <prompts>` | Analyze test set coverage |
39
+ | `schemas [name]` | Export JSON schemas |
40
+ | `adapter:scaffold [name]` | Scaffold new ACP adapter project |
41
+ | `adapter:check <cmd>` | Validate adapter ACP compliance |
40
42
 
41
- Capture full trajectories from an ACP agent:
43
+ ### Examples
42
44
 
43
45
  ```bash
44
- acp-harness capture <prompts.jsonl> <command> [args...] [options]
45
-
46
- Options:
47
- -o, --output Output file (default: stdout)
48
- -c, --cwd Working directory for agent
49
- -t, --timeout Request timeout in ms (default: 60000)
50
- -g, --grader Path to grader (.ts/.js module or executable script)
51
- --progress Show progress to stderr
52
- --append Append to output file
53
- --mcp-server MCP server config JSON (repeatable)
54
- -h, --help Show help
46
+ # Capture trajectories
47
+ bunx @plaited/acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
48
+
49
+ # Run trials for pass@k analysis
50
+ bunx @plaited/acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 --grader ./grader.ts
51
+
52
+ # Summarize results
53
+ bunx @plaited/acp-harness summarize results.jsonl -o summary.jsonl
54
+
55
+ # Export schemas
56
+ bunx @plaited/acp-harness schemas CaptureResult --json
57
+
58
+ # Scaffold a new adapter
59
+ bunx @plaited/acp-harness adapter:scaffold my-agent -o ./my-agent-acp
60
+
61
+ # Validate adapter compliance
62
+ bunx @plaited/acp-harness adapter:check bun ./my-adapter/src/main.ts
55
63
  ```
56
64
 
57
- ### trials
65
+ ## Skills for AI Agents
58
66
 
59
- Run multiple trials per prompt for pass@k analysis:
67
+ **Install skills** for use with AI coding agents:
60
68
 
61
69
  ```bash
62
- acp-harness trials <prompts.jsonl> <command> [args...] [options]
70
+ curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- --agent <agent-name> --project acp-harness
71
+ ```
72
+
73
+ Replace `<agent-name>` with your agent: `claude`, `cursor`, `copilot`, `opencode`, `amp`, `goose`, `factory`
63
74
 
64
- Options:
65
- -k Number of trials per prompt (default: 3)
66
- -g, --grader Path to grader (computes pass@k/pass^k metrics)
67
- ... (same as capture)
75
+ **Update skills:**
76
+
77
+ ```bash
78
+ curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- update --agent <agent-name> --project acp-harness
68
79
  ```
69
80
 
81
+ ### Available Skills
82
+
83
+ #### ACP Harness
84
+
85
+ CLI tool for capturing agent trajectories, optimized for TypeScript/JavaScript projects using Bun.
86
+
87
+ **Commands:**
88
+
89
+ | Command | Description |
90
+ |---------|-------------|
91
+ | `capture` | Execute prompts and capture full trajectories |
92
+ | `trials` | Multi-run trials with pass@k/pass^k metrics |
93
+ | `summarize` | Derive compact views from trajectory results |
94
+ | `calibrate` | Sample failures for grader calibration |
95
+ | `validate-refs` | Validate reference solutions against graders |
96
+ | `balance` | Analyze test set coverage distribution |
97
+ | `schemas` | Export Zod schemas as JSON Schema |
98
+
99
+ **Use cases:**
100
+ - Capturing trajectories for downstream evaluation (Braintrust, custom scorers)
101
+ - Generating training data (SFT/DPO) with full context
102
+ - Building regression test fixtures for agent behavior
103
+ - Comparing agent responses across configurations
104
+
105
+ #### ACP Adapters
106
+
107
+ Discover, create, and validate ACP adapters for agent integration.
108
+
109
+ **Commands:**
110
+
111
+ | Command | Description |
112
+ |---------|-------------|
113
+ | `adapter:scaffold` | Generate new adapter project with handlers |
114
+ | `adapter:check` | Validate ACP protocol compliance |
115
+
116
+ **Use cases:**
117
+ - Finding existing adapters for your agent
118
+ - Building custom ACP adapters from scratch
119
+ - Validating adapter implementations
120
+
70
121
  ## Input Format
71
122
 
72
123
  ```jsonl
@@ -165,20 +216,6 @@ cat results.jsonl | jq '.trajectory[] | select(.type == "tool_call") | .name'
165
216
  cat results.jsonl | your-scoring-script.ts
166
217
  ```
167
218
 
168
- ## Plugin
169
-
170
- This package includes an **acp-harness skill** for AI coding agents with complete documentation:
171
-
172
- - CLI usage and examples
173
- - Output format schemas
174
- - Integration patterns (Braintrust, jq, custom scorers)
175
-
176
- **Other AI coding agents:**
177
-
178
- ```bash
179
- curl -fsSL https://raw.githubusercontent.com/plaited/marketplace/main/install.sh | bash -s -- --agent <agent-name> --plugin development-skills
180
- ```
181
-
182
219
  ## Development
183
220
 
184
221
  ```bash
package/bin/cli.ts CHANGED
@@ -68,7 +68,7 @@ Examples:
68
68
  acp-harness adapter:scaffold my-agent -o ./adapters/my-agent
69
69
 
70
70
  # Validate adapter compliance
71
- acp-harness adapter:check bun ./my-adapter/src/index.ts
71
+ acp-harness adapter:check bun ./my-adapter/src/main.ts
72
72
 
73
73
  Documentation: https://github.com/plaited/acp-harness
74
74
  `)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/acp-harness",
3
- "version": "0.3.1",
3
+ "version": "0.3.2",
4
4
  "description": "CLI tool for capturing agent trajectories from ACP-compatible agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -56,7 +56,7 @@
56
56
  },
57
57
  "dependencies": {
58
58
  "zod": "^4.3.5",
59
- "@plaited/development-skills": "0.4.1"
59
+ "@plaited/development-skills": "0.6.2"
60
60
  },
61
61
  "peerDependencies": {
62
62
  "typescript-language-server": "^5.1.3",
@@ -65,7 +65,7 @@
65
65
  "devDependencies": {
66
66
  "@biomejs/biome": "2.3.11",
67
67
  "@types/bun": "1.3.6",
68
- "@zed-industries/claude-code-acp": "0.13.0",
68
+ "@zed-industries/claude-code-acp": "0.13.1",
69
69
  "format-package": "7.0.0",
70
70
  "lint-staged": "16.2.7",
71
71
  "typescript": "5.9.3"
@@ -25,7 +25,7 @@ import { ACP_METHODS, ACP_PROTOCOL_VERSION, DEFAULT_ACP_CLIENT_NAME } from './co
25
25
 
26
26
  /** Configuration for compliance check */
27
27
  export type CheckConfig = {
28
- /** Command to spawn adapter (e.g., ['bun', './src/index.ts']) */
28
+ /** Command to spawn adapter (e.g., ['bun', './src/main.ts']) */
29
29
  command: string[]
30
30
  /** Timeout for each check in milliseconds */
31
31
  timeout: number
@@ -488,7 +488,7 @@ Checks Performed:
488
488
 
489
489
  Examples:
490
490
  # Check local TypeScript adapter
491
- acp-harness adapter:check bun ./my-adapter/src/index.ts
491
+ acp-harness adapter:check bun ./my-adapter/src/main.ts
492
492
 
493
493
  # Check with verbose output
494
494
  acp-harness adapter:check bunx my-adapter --verbose
@@ -501,7 +501,7 @@ Examples:
501
501
 
502
502
  if (positionals.length === 0) {
503
503
  console.error('Error: adapter command is required')
504
- console.error('Example: acp-harness adapter:check bun ./src/index.ts')
504
+ console.error('Example: acp-harness adapter:check bun ./src/main.ts')
505
505
  process.exit(1)
506
506
  }
507
507
 
@@ -49,11 +49,11 @@ const tsPackageJson = (name: string): string => `{
49
49
  "version": "1.0.0",
50
50
  "type": "module",
51
51
  "bin": {
52
- "${name}-acp": "./src/index.ts"
52
+ "${name}-acp": "./src/main.ts"
53
53
  },
54
54
  "scripts": {
55
- "start": "bun run src/index.ts",
56
- "check": "bunx @plaited/acp-harness adapter:check bun ./src/index.ts"
55
+ "start": "bun run src/main.ts",
56
+ "check": "bunx @plaited/acp-harness adapter:check bun ./src/main.ts"
57
57
  },
58
58
  "dependencies": {
59
59
  "@agentclientprotocol/sdk": "^0.0.1"
@@ -335,7 +335,7 @@ const tsSessionPromptHandler = (): string => `/**
335
335
  */
336
336
 
337
337
  import { sessionManager } from '../session-manager.ts'
338
- import { sendSessionUpdate } from '../index.ts'
338
+ import { sendSessionUpdate } from '../main.ts'
339
339
  import type { ContentBlock } from '../types.ts'
340
340
 
341
341
  type PromptParams = {
@@ -486,7 +486,7 @@ bun install
486
486
  bun run start
487
487
 
488
488
  # Or run directly
489
- bun run src/index.ts
489
+ bun run src/main.ts
490
490
  \`\`\`
491
491
 
492
492
  ## Verify Compliance
@@ -496,7 +496,7 @@ bun run src/index.ts
496
496
  bun run check
497
497
 
498
498
  # Or manually
499
- bunx @plaited/acp-harness adapter:check bun ./src/index.ts
499
+ bunx @plaited/acp-harness adapter:check bun ./src/main.ts
500
500
  \`\`\`
501
501
 
502
502
  ## Test with Harness
@@ -506,7 +506,7 @@ bunx @plaited/acp-harness adapter:check bun ./src/index.ts
506
506
  echo '{"id":"test-1","input":"Hello"}' > prompts.jsonl
507
507
 
508
508
  # Run capture
509
- bunx @plaited/acp-harness capture prompts.jsonl bun ./src/index.ts -o results.jsonl
509
+ bunx @plaited/acp-harness capture prompts.jsonl bun ./src/main.ts -o results.jsonl
510
510
 
511
511
  # View results
512
512
  cat results.jsonl | jq .
@@ -774,8 +774,8 @@ const scaffoldTypeScript = async (config: ScaffoldConfig): Promise<string[]> =>
774
774
  await Bun.write(join(outputDir, 'tsconfig.json'), tsTsConfig())
775
775
  files.push('tsconfig.json')
776
776
 
777
- await Bun.write(join(outputDir, 'src', 'index.ts'), tsIndexFile(name))
778
- files.push('src/index.ts')
777
+ await Bun.write(join(outputDir, 'src', 'main.ts'), tsIndexFile(name))
778
+ files.push('src/main.ts')
779
779
 
780
780
  await Bun.write(join(outputDir, 'src', 'types.ts'), tsTypesFile())
781
781
  files.push('src/types.ts')
@@ -929,6 +929,6 @@ ${result.lang === 'ts' ? ' bun install' : ' chmod +x adapter.py'}
929
929
  ${result.lang === 'ts' ? ' bun run start' : ' python adapter.py'}
930
930
 
931
931
  Verify compliance:
932
- acp-harness adapter:check ${result.lang === 'ts' ? 'bun ./src/index.ts' : 'python ./adapter.py'}
932
+ acp-harness adapter:check ${result.lang === 'ts' ? 'bun ./src/main.ts' : 'python ./adapter.py'}
933
933
  `)
934
934
  }
package/src/balance.ts CHANGED
@@ -38,8 +38,16 @@ const resolvePath = (path: string): string => {
38
38
  return `${process.cwd()}/${path}`
39
39
  }
40
40
 
41
- /** Analyze category distribution */
42
- const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribution[] => {
41
+ /**
42
+ * Analyze category distribution across prompts.
43
+ *
44
+ * @param prompts - Array of prompt cases
45
+ * @param key - Metadata key to analyze
46
+ * @returns Array of category distributions sorted by count descending
47
+ *
48
+ * @public
49
+ */
50
+ export const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribution[] => {
43
51
  const counts = new Map<string, number>()
44
52
 
45
53
  for (const prompt of prompts) {
@@ -65,16 +73,33 @@ const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribu
65
73
  return distributions
66
74
  }
67
75
 
68
- /** Identify underrepresented categories */
69
- const findUnderrepresented = (distributions: CategoryDistribution[], threshold: number): string[] => {
76
+ /**
77
+ * Identify underrepresented categories.
78
+ *
79
+ * @param distributions - Array of category distributions
80
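+ * @param threshold - Percentage of the even-distribution share (100 / number of categories); categories whose percentage falls below this fraction of the even share are reported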
81
+ * @returns Array of underrepresented category names
82
+ *
83
+ * @public
84
+ */
85
+ export const findUnderrepresented = (distributions: CategoryDistribution[], threshold: number): string[] => {
70
86
  // Expected percentage if evenly distributed
71
87
  const evenPercentage = 100 / distributions.length
72
88
 
73
89
  return distributions.filter((d) => d.percentage < evenPercentage * (threshold / 100)).map((d) => d.name)
74
90
  }
75
91
 
76
- /** Generate suggestions for improvement */
77
- const generateSuggestions = (
92
+ /**
93
+ * Generate suggestions for improving test set balance.
94
+ *
95
+ * @param distributions - Array of category distributions
96
+ * @param underrepresented - Array of underrepresented category names
97
+ * @param total - Total number of test cases
98
+ * @returns Array of suggestion strings
99
+ *
100
+ * @public
101
+ */
102
+ export const generateSuggestions = (
78
103
  distributions: CategoryDistribution[],
79
104
  underrepresented: string[],
80
105
  total: number,
package/src/calibrate.ts CHANGED
@@ -56,14 +56,33 @@ const loadResults = async (path: string): Promise<CaptureResult[]> => {
56
56
  })
57
57
  }
58
58
 
59
- /** Random sample from array */
60
- const sampleArray = <T>(arr: T[], n: number): T[] => {
59
+ /**
60
+ * Randomly sample up to `n` elements from an array without replacement; the input array is copied, not mutated.
61
+ *
62
+ * @param arr - Array to sample from
63
+ * @param n - Number of samples to take
64
+ * @returns Array of sampled elements
65
+ *
66
+ * @public
67
+ */
68
+ export const sampleArray = <T>(arr: T[], n: number): T[] => {
61
69
  const shuffled = [...arr].sort(() => 0.5 - Math.random())
62
70
  return shuffled.slice(0, n)
63
71
  }
64
72
 
65
- /** Get snippet of trajectory for review */
66
- const getTrajectorySnippet = (trajectory: TrajectoryStep[], maxSteps = 5): TrajectoryStep[] => {
73
+ /**
74
+ * Get snippet of trajectory for review.
75
+ *
76
+ * @remarks
77
+ * Includes first 2 steps, middle step, and last 2 steps.
78
+ *
79
+ * @param trajectory - Full trajectory
80
+ * @param maxSteps - Maximum number of steps to include
81
+ * @returns Trajectory snippet
82
+ *
83
+ * @public
84
+ */
85
+ export const getTrajectorySnippet = (trajectory: TrajectoryStep[], maxSteps = 5): TrajectoryStep[] => {
67
86
  // Include first and last steps, plus some from the middle
68
87
  if (trajectory.length <= maxSteps) return trajectory
69
88
 
package/src/summarize.ts CHANGED
@@ -55,8 +55,15 @@ const loadResults = async (path: string): Promise<CaptureResult[]> => {
55
55
  })
56
56
  }
57
57
 
58
- /** Format result as summary JSONL */
59
- const formatSummary = (result: CaptureResult): SummaryResult => {
58
+ /**
59
+ * Format capture result as compact summary.
60
+ *
61
+ * @param result - Full capture result
62
+ * @returns Compact summary result
63
+ *
64
+ * @public
65
+ */
66
+ export const formatSummary = (result: CaptureResult): SummaryResult => {
60
67
  return {
61
68
  id: result.id,
62
69
  input: result.input,
@@ -66,8 +73,15 @@ const formatSummary = (result: CaptureResult): SummaryResult => {
66
73
  }
67
74
  }
68
75
 
69
- /** Format result as markdown with step IDs */
70
- const formatMarkdown = (result: CaptureResult): string => {
76
+ /**
77
+ * Format capture result as markdown with step IDs.
78
+ *
79
+ * @param result - Full capture result
80
+ * @returns Markdown formatted string
81
+ *
82
+ * @public
83
+ */
84
+ export const formatMarkdown = (result: CaptureResult): string => {
71
85
  const lines: string[] = [
72
86
  `## Evaluation Record: ${result.id}`,
73
87
  '',
@@ -29,7 +29,7 @@ describe('runScaffold', () => {
29
29
  expect(result.lang).toBe('ts')
30
30
  expect(result.files).toContain('package.json')
31
31
  expect(result.files).toContain('tsconfig.json')
32
- expect(result.files).toContain('src/index.ts')
32
+ expect(result.files).toContain('src/main.ts')
33
33
  expect(result.files).toContain('src/types.ts')
34
34
  expect(result.files).toContain('src/session-manager.ts')
35
35
  expect(result.files).toContain('src/handlers/initialize.ts')
@@ -42,9 +42,9 @@ describe('runScaffold', () => {
42
42
  const packageJson = await Bun.file(join(testDir, 'package.json')).text()
43
43
  expect(packageJson).toContain('"test-agent-acp"')
44
44
 
45
- const indexTs = await Bun.file(join(testDir, 'src', 'index.ts')).text()
46
- expect(indexTs).toContain('#!/usr/bin/env bun')
47
- expect(indexTs).toContain('handleInitialize')
45
+ const mainTs = await Bun.file(join(testDir, 'src', 'main.ts')).text()
46
+ expect(mainTs).toContain('#!/usr/bin/env bun')
47
+ expect(mainTs).toContain('handleInitialize')
48
48
  })
49
49
 
50
50
  test('generates minimal TypeScript structure without README', async () => {
@@ -59,7 +59,7 @@ describe('runScaffold', () => {
59
59
 
60
60
  expect(result.files).not.toContain('README.md')
61
61
  expect(result.files).toContain('package.json')
62
- expect(result.files).toContain('src/index.ts')
62
+ expect(result.files).toContain('src/main.ts')
63
63
  })
64
64
 
65
65
  test('generates Python adapter structure', async () => {