npm - @plaited/acp-harness - Versions diffs - 0.3.1 → 0.3.2 - Mend

@plaited/acp-harness 0.3.1 → 0.3.2

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.

Files changed (15)

package/README.md +84 -47
package/bin/cli.ts +1 -1
package/package.json +3 -3
package/src/adapter-check.ts +3 -3
package/src/adapter-scaffold.ts +10 -10
package/src/balance.ts +31 -6
package/src/calibrate.ts +23 -4
package/src/summarize.ts +18 -4
package/src/tests/adapter-scaffold.spec.ts +5 -5
package/src/tests/balance-helpers.spec.ts +279 -0
package/src/tests/calibrate-helpers.spec.ts +226 -0
package/src/tests/capture-helpers.spec.ts +553 -0
package/src/tests/summarize-helpers.spec.ts +339 -0
package/src/tests/trials-calculations.spec.ts +209 -0
package/src/trials.ts +14 -2

package/README.md CHANGED Viewed

@@ -4,9 +4,11 @@
 [![CI](https://github.com/plaited/acp-harness/actions/workflows/ci.yml/badge.svg)](https://github.com/plaited/acp-harness/actions/workflows/ci.yml)
 [![License: ISC](https://img.shields.io/badge/License-ISC-blue.svg)](https://opensource.org/licenses/ISC)
-CLI tool for capturing agent trajectories from ACP-compatible agents. Execute prompts, capture full trajectories (tools, thoughts, plans), and output structured JSONL for downstream scoring.
+CLI tool for capturing agent trajectories from ACP-compatible agents. Execute prompts, capture full trajectories (tools, thoughts, plans), and output structured JSONL for downstream scoring. Available as both a CLI tool and as installable skills for AI coding agents.
-## Quick Start
+## CLI Tool
+Use these tools directly via the CLI without installation:
 ```bash
 # Run without installing
@@ -24,49 +26,98 @@ npm install -g @anthropic-ai/claude-code-acp
 export ANTHROPIC_API_KEY=sk-...
 ```
-## Commands
-| Command | Purpose |
-|---------|---------|
-| `capture` | Trajectory capture (full JSONL) |
-| `trials` | Multi-run with pass@k metrics |
-| `summarize` | Derive compact views from results |
-| `calibrate` | Sample failures for review |
-| `validate-refs` | Check reference solutions |
-| `balance` | Analyze test set coverage |
-| `schemas` | Export JSON schemas |
+### Commands
-### capture
+| Command | Description |
+|---------|-------------|
+| `capture <prompts> <cmd>` | Trajectory capture (full JSONL) |
+| `trials <prompts> <cmd>` | Multi-run with pass@k metrics |
+| `summarize <results>` | Derive compact views from results |
+| `calibrate <results>` | Sample failures for review |
+| `validate-refs <prompts>` | Check reference solutions |
+| `balance <prompts>` | Analyze test set coverage |
+| `schemas [name]` | Export JSON schemas |
+| `adapter:scaffold [name]` | Scaffold new ACP adapter project |
+| `adapter:check <cmd>` | Validate adapter ACP compliance |
-Capture full trajectories from an ACP agent:
+### Examples
 ```bash
-acp-harness capture <prompts.jsonl> <command> [args...] [options]
-Options:
-  -o, --output      Output file (default: stdout)
-  -c, --cwd         Working directory for agent
-  -t, --timeout     Request timeout in ms (default: 60000)
-  -g, --grader      Path to grader (.ts/.js module or executable script)
-  --progress        Show progress to stderr
-  --append          Append to output file
-  --mcp-server      MCP server config JSON (repeatable)
-  -h, --help        Show help
+# Capture trajectories
+bunx @plaited/acp-harness capture prompts.jsonl bunx claude-code-acp -o results.jsonl
+# Run trials for pass@k analysis
+bunx @plaited/acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 --grader ./grader.ts
+# Summarize results
+bunx @plaited/acp-harness summarize results.jsonl -o summary.jsonl
+# Export schemas
+bunx @plaited/acp-harness schemas CaptureResult --json
+# Scaffold a new adapter
+bunx @plaited/acp-harness adapter:scaffold my-agent -o ./my-agent-acp
+# Validate adapter compliance
+bunx @plaited/acp-harness adapter:check bun ./my-adapter/src/main.ts
 ```
-### trials
+## Skills for AI Agents
-Run multiple trials per prompt for pass@k analysis:
+**Install skills** for use with AI coding agents:
 ```bash
-acp-harness trials <prompts.jsonl> <command> [args...] [options]
+curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- --agent <agent-name> --project acp-harness
+```
+Replace `<agent-name>` with your agent: `claude`, `cursor`, `copilot`, `opencode`, `amp`, `goose`, `factory`
-Options:
-  -k                Number of trials per prompt (default: 3)
-  -g, --grader      Path to grader (computes pass@k/pass^k metrics)
-  ...               (same as capture)
+**Update skills:**
+```bash
+curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- update --agent <agent-name> --project acp-harness
 ```
+### Available Skills
+#### ACP Harness
+CLI tool for capturing agent trajectories, optimized for TypeScript/JavaScript projects using Bun.
+**Commands:**
+| Command | Description |
+|---------|-------------|
+| `capture` | Execute prompts and capture full trajectories |
+| `trials` | Multi-run trials with pass@k/pass^k metrics |
+| `summarize` | Derive compact views from trajectory results |
+| `calibrate` | Sample failures for grader calibration |
+| `validate-refs` | Validate reference solutions against graders |
+| `balance` | Analyze test set coverage distribution |
+| `schemas` | Export Zod schemas as JSON Schema |
+**Use cases:**
+- Capturing trajectories for downstream evaluation (Braintrust, custom scorers)
+- Generating training data (SFT/DPO) with full context
+- Building regression test fixtures for agent behavior
+- Comparing agent responses across configurations
+#### ACP Adapters
+Discover, create, and validate ACP adapters for agent integration.
+**Commands:**
+| Command | Description |
+|---------|-------------|
+| `adapter:scaffold` | Generate new adapter project with handlers |
+| `adapter:check` | Validate ACP protocol compliance |
+**Use cases:**
+- Finding existing adapters for your agent
+- Building custom ACP adapters from scratch
+- Validating adapter implementations
 ## Input Format
 ```jsonl
@@ -165,20 +216,6 @@ cat results.jsonl | jq '.trajectory[] | select(.type == "tool_call") | .name'
 cat results.jsonl | your-scoring-script.ts
 ```
-## Plugin
-This package includes an **acp-harness skill** for AI coding agents with complete documentation:
-- CLI usage and examples
-- Output format schemas
-- Integration patterns (Braintrust, jq, custom scorers)
-**Other AI coding agents:**
-```bash
-curl -fsSL https://raw.githubusercontent.com/plaited/marketplace/main/install.sh | bash -s -- --agent <agent-name> --plugin development-skills
-```
 ## Development
 ```bash

package/bin/cli.ts CHANGED Viewed

@@ -68,7 +68,7 @@ Examples:
   acp-harness adapter:scaffold my-agent -o ./adapters/my-agent
   # Validate adapter compliance
-  acp-harness adapter:check bun ./my-adapter/src/index.ts
+  acp-harness adapter:check bun ./my-adapter/src/main.ts
 Documentation: https://github.com/plaited/acp-harness
 `)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@plaited/acp-harness",
-  "version": "0.3.1",
+  "version": "0.3.2",
   "description": "CLI tool for capturing agent trajectories from ACP-compatible agents",
   "license": "ISC",
   "engines": {
@@ -56,7 +56,7 @@
   },
   "dependencies": {
     "zod": "^4.3.5",
-    "@plaited/development-skills": "0.4.1"
+    "@plaited/development-skills": "0.6.2"
   },
   "peerDependencies": {
     "typescript-language-server": "^5.1.3",
@@ -65,7 +65,7 @@
   "devDependencies": {
     "@biomejs/biome": "2.3.11",
     "@types/bun": "1.3.6",
-    "@zed-industries/claude-code-acp": "0.13.0",
+    "@zed-industries/claude-code-acp": "0.13.1",
     "format-package": "7.0.0",
     "lint-staged": "16.2.7",
     "typescript": "5.9.3"

package/src/adapter-check.ts CHANGED Viewed

@@ -25,7 +25,7 @@ import { ACP_METHODS, ACP_PROTOCOL_VERSION, DEFAULT_ACP_CLIENT_NAME } from './co
 /** Configuration for compliance check */
 export type CheckConfig = {
-  /** Command to spawn adapter (e.g., ['bun', './src/index.ts']) */
+  /** Command to spawn adapter (e.g., ['bun', './src/main.ts']) */
   command: string[]
   /** Timeout for each check in milliseconds */
   timeout: number
@@ -488,7 +488,7 @@ Checks Performed:
 Examples:
   # Check local TypeScript adapter
-  acp-harness adapter:check bun ./my-adapter/src/index.ts
+  acp-harness adapter:check bun ./my-adapter/src/main.ts
   # Check with verbose output
   acp-harness adapter:check bunx my-adapter --verbose
@@ -501,7 +501,7 @@ Examples:
   if (positionals.length === 0) {
     console.error('Error: adapter command is required')
-    console.error('Example: acp-harness adapter:check bun ./src/index.ts')
+    console.error('Example: acp-harness adapter:check bun ./src/main.ts')
     process.exit(1)
   }

package/src/adapter-scaffold.ts CHANGED Viewed

@@ -49,11 +49,11 @@ const tsPackageJson = (name: string): string => `{
   "version": "1.0.0",
   "type": "module",
   "bin": {
-    "${name}-acp": "./src/index.ts"
+    "${name}-acp": "./src/main.ts"
   },
   "scripts": {
-    "start": "bun run src/index.ts",
-    "check": "bunx @plaited/acp-harness adapter:check bun ./src/index.ts"
+    "start": "bun run src/main.ts",
+    "check": "bunx @plaited/acp-harness adapter:check bun ./src/main.ts"
   },
   "dependencies": {
     "@agentclientprotocol/sdk": "^0.0.1"
@@ -335,7 +335,7 @@ const tsSessionPromptHandler = (): string => `/**
  */
 import { sessionManager } from '../session-manager.ts'
-import { sendSessionUpdate } from '../index.ts'
+import { sendSessionUpdate } from '../main.ts'
 import type { ContentBlock } from '../types.ts'
 type PromptParams = {
@@ -486,7 +486,7 @@ bun install
 bun run start
 # Or run directly
-bun run src/index.ts
+bun run src/main.ts
 \`\`\`
 ## Verify Compliance
@@ -496,7 +496,7 @@ bun run src/index.ts
 bun run check
 # Or manually
-bunx @plaited/acp-harness adapter:check bun ./src/index.ts
+bunx @plaited/acp-harness adapter:check bun ./src/main.ts
 \`\`\`
 ## Test with Harness
@@ -506,7 +506,7 @@ bunx @plaited/acp-harness adapter:check bun ./src/index.ts
 echo '{"id":"test-1","input":"Hello"}' > prompts.jsonl
 # Run capture
-bunx @plaited/acp-harness capture prompts.jsonl bun ./src/index.ts -o results.jsonl
+bunx @plaited/acp-harness capture prompts.jsonl bun ./src/main.ts -o results.jsonl
 # View results
 cat results.jsonl | jq .
@@ -774,8 +774,8 @@ const scaffoldTypeScript = async (config: ScaffoldConfig): Promise<string[]> =>
   await Bun.write(join(outputDir, 'tsconfig.json'), tsTsConfig())
   files.push('tsconfig.json')
-  await Bun.write(join(outputDir, 'src', 'index.ts'), tsIndexFile(name))
-  files.push('src/index.ts')
+  await Bun.write(join(outputDir, 'src', 'main.ts'), tsIndexFile(name))
+  files.push('src/main.ts')
   await Bun.write(join(outputDir, 'src', 'types.ts'), tsTypesFile())
   files.push('src/types.ts')
@@ -929,6 +929,6 @@ ${result.lang === 'ts' ? '  bun install' : '  chmod +x adapter.py'}
 ${result.lang === 'ts' ? '  bun run start' : '  python adapter.py'}
 Verify compliance:
-  acp-harness adapter:check ${result.lang === 'ts' ? 'bun ./src/index.ts' : 'python ./adapter.py'}
+  acp-harness adapter:check ${result.lang === 'ts' ? 'bun ./src/main.ts' : 'python ./adapter.py'}
 `)
 }

package/src/balance.ts CHANGED Viewed

@@ -38,8 +38,16 @@ const resolvePath = (path: string): string => {
   return `${process.cwd()}/${path}`
 }
-/** Analyze category distribution */
-const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribution[] => {
+/**
+ * Analyze category distribution across prompts.
+ *
+ * @param prompts - Array of prompt cases
+ * @param key - Metadata key to analyze
+ * @returns Array of category distributions sorted by count descending
+ *
+ * @public
+ */
+export const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribution[] => {
   const counts = new Map<string, number>()
   for (const prompt of prompts) {
@@ -65,16 +73,33 @@ const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribu
   return distributions
 }
-/** Identify underrepresented categories */
-const findUnderrepresented = (distributions: CategoryDistribution[], threshold: number): string[] => {
+/**
+ * Identify underrepresented categories.
+ *
+ * @param distributions - Array of category distributions
+ * @param threshold - Percentage threshold relative to even distribution
+ * @returns Array of underrepresented category names
+ *
+ * @public
+ */
+export const findUnderrepresented = (distributions: CategoryDistribution[], threshold: number): string[] => {
   // Expected percentage if evenly distributed
   const evenPercentage = 100 / distributions.length
   return distributions.filter((d) => d.percentage < evenPercentage * (threshold / 100)).map((d) => d.name)
 }
-/** Generate suggestions for improvement */
-const generateSuggestions = (
+/**
+ * Generate suggestions for improving test set balance.
+ *
+ * @param distributions - Array of category distributions
+ * @param underrepresented - Array of underrepresented category names
+ * @param total - Total number of test cases
+ * @returns Array of suggestion strings
+ *
+ * @public
+ */
+export const generateSuggestions = (
   distributions: CategoryDistribution[],
   underrepresented: string[],
   total: number,

package/src/calibrate.ts CHANGED Viewed

@@ -56,14 +56,33 @@ const loadResults = async (path: string): Promise<CaptureResult[]> => {
     })
 }
-/** Random sample from array */
-const sampleArray = <T>(arr: T[], n: number): T[] => {
+/**
+ * Random sample from array.
+ *
+ * @param arr - Array to sample from
+ * @param n - Number of samples to take
+ * @returns Array of sampled elements
+ *
+ * @public
+ */
+export const sampleArray = <T>(arr: T[], n: number): T[] => {
   const shuffled = [...arr].sort(() => 0.5 - Math.random())
   return shuffled.slice(0, n)
 }
-/** Get snippet of trajectory for review */
-const getTrajectorySnippet = (trajectory: TrajectoryStep[], maxSteps = 5): TrajectoryStep[] => {
+/**
+ * Get snippet of trajectory for review.
+ *
+ * @remarks
+ * Includes first 2 steps, middle step, and last 2 steps.
+ *
+ * @param trajectory - Full trajectory
+ * @param maxSteps - Maximum number of steps to include
+ * @returns Trajectory snippet
+ *
+ * @public
+ */
+export const getTrajectorySnippet = (trajectory: TrajectoryStep[], maxSteps = 5): TrajectoryStep[] => {
   // Include first and last steps, plus some from the middle
   if (trajectory.length <= maxSteps) return trajectory

package/src/summarize.ts CHANGED Viewed

@@ -55,8 +55,15 @@ const loadResults = async (path: string): Promise<CaptureResult[]> => {
     })
 }
-/** Format result as summary JSONL */
-const formatSummary = (result: CaptureResult): SummaryResult => {
+/**
+ * Format capture result as compact summary.
+ *
+ * @param result - Full capture result
+ * @returns Compact summary result
+ *
+ * @public
+ */
+export const formatSummary = (result: CaptureResult): SummaryResult => {
   return {
     id: result.id,
     input: result.input,
@@ -66,8 +73,15 @@ const formatSummary = (result: CaptureResult): SummaryResult => {
   }
 }
-/** Format result as markdown with step IDs */
-const formatMarkdown = (result: CaptureResult): string => {
+/**
+ * Format capture result as markdown with step IDs.
+ *
+ * @param result - Full capture result
+ * @returns Markdown formatted string
+ *
+ * @public
+ */
+export const formatMarkdown = (result: CaptureResult): string => {
   const lines: string[] = [
     `## Evaluation Record: ${result.id}`,
     '',

package/src/tests/adapter-scaffold.spec.ts CHANGED Viewed

@@ -29,7 +29,7 @@ describe('runScaffold', () => {
     expect(result.lang).toBe('ts')
     expect(result.files).toContain('package.json')
     expect(result.files).toContain('tsconfig.json')
-    expect(result.files).toContain('src/index.ts')
+    expect(result.files).toContain('src/main.ts')
     expect(result.files).toContain('src/types.ts')
     expect(result.files).toContain('src/session-manager.ts')
     expect(result.files).toContain('src/handlers/initialize.ts')
@@ -42,9 +42,9 @@ describe('runScaffold', () => {
     const packageJson = await Bun.file(join(testDir, 'package.json')).text()
     expect(packageJson).toContain('"test-agent-acp"')
-    const indexTs = await Bun.file(join(testDir, 'src', 'index.ts')).text()
-    expect(indexTs).toContain('#!/usr/bin/env bun')
-    expect(indexTs).toContain('handleInitialize')
+    const mainTs = await Bun.file(join(testDir, 'src', 'main.ts')).text()
+    expect(mainTs).toContain('#!/usr/bin/env bun')
+    expect(mainTs).toContain('handleInitialize')
   })
   test('generates minimal TypeScript structure without README', async () => {
@@ -59,7 +59,7 @@ describe('runScaffold', () => {
     expect(result.files).not.toContain('README.md')
     expect(result.files).toContain('package.json')
-    expect(result.files).toContain('src/index.ts')
+    expect(result.files).toContain('src/main.ts')
   })
   test('generates Python adapter structure', async () => {