npm - @plaited/acp-harness - Versions diffs - 0.2.6 → 0.3.2 - Mend

@plaited/acp-harness 0.2.6 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

package/LICENSE +1 -1
package/README.md +175 -34
package/bin/cli.ts +105 -636
package/bin/tests/cli.spec.ts +218 -51
package/package.json +21 -5
package/src/acp-client.ts +5 -4
package/src/acp-transport.ts +14 -7
package/src/adapter-check.ts +542 -0
package/src/adapter-scaffold.ts +934 -0
package/src/balance.ts +257 -0
package/src/calibrate.ts +319 -0
package/src/capture.ts +457 -0
package/src/constants.ts +94 -0
package/src/grader-loader.ts +174 -0
package/src/harness.ts +35 -0
package/src/schemas-cli.ts +239 -0
package/src/schemas.ts +567 -0
package/src/summarize.ts +259 -0
package/src/tests/adapter-check.spec.ts +70 -0
package/src/tests/adapter-scaffold.spec.ts +112 -0
package/src/tests/balance-helpers.spec.ts +279 -0
package/src/tests/calibrate-helpers.spec.ts +226 -0
package/src/tests/capture-helpers.spec.ts +553 -0
package/src/tests/fixtures/grader-bad-module.ts +5 -0
package/src/tests/fixtures/grader-exec-fail.py +9 -0
package/src/tests/fixtures/grader-exec-invalid.py +6 -0
package/src/tests/fixtures/grader-exec.py +29 -0
package/src/tests/fixtures/grader-module.ts +14 -0
package/src/tests/grader-loader.spec.ts +153 -0
package/src/tests/summarize-helpers.spec.ts +339 -0
package/src/tests/trials-calculations.spec.ts +209 -0
package/src/trials.ts +407 -0
package/src/validate-refs.ts +188 -0
package/.claude/rules/accuracy.md +0 -43
package/.claude/rules/bun-apis.md +0 -80
package/.claude/rules/code-review.md +0 -254
package/.claude/rules/git-workflow.md +0 -37
package/.claude/rules/github.md +0 -154
package/.claude/rules/testing.md +0 -172
package/.claude/skills/acp-harness/SKILL.md +0 -310
package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
package/.claude/skills/acp-harness/references/downstream.md +0 -288
package/.claude/skills/acp-harness/references/output-formats.md +0 -221
package/.claude-plugin/marketplace.json +0 -15
package/.claude-plugin/plugin.json +0 -16
package/.github/CODEOWNERS +0 -6
package/.github/workflows/ci.yml +0 -63
package/.github/workflows/publish.yml +0 -146
package/.mcp.json +0 -20
package/CLAUDE.md +0 -92
package/Dockerfile.test +0 -23
package/biome.json +0 -96
package/bun.lock +0 -513
package/docker-compose.test.yml +0 -21
package/scripts/bun-test-wrapper.sh +0 -46
package/src/acp.constants.ts +0 -56
package/src/acp.schemas.ts +0 -161
package/src/acp.types.ts +0 -28
package/src/tests/fixtures/.claude/settings.local.json +0 -8
package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
package/tsconfig.json +0 -32

package/src/summarize.ts ADDED Viewed

@@ -0,0 +1,259 @@
+/**
+ * Summarize command - derive compact views from full trajectory results.
+ *
+ * @remarks
+ * Transforms full trajectory JSONL into:
+ * - Summary JSONL: Compact format for jq analysis
+ * - Markdown: Human-readable format for LLM-as-judge workflows
+ *
+ * @packageDocumentation
+ */
+import { isAbsolute, resolve } from 'node:path'
+import { parseArgs } from 'node:util'
+import { extractContent, extractFilePath, headTailPreview } from './capture.ts'
+import { HEAD_LINES, MAX_CONTENT_LENGTH, TAIL_LINES } from './constants.ts'
+import type { CaptureResult, SummaryResult } from './schemas.ts'
+import { CaptureResultSchema } from './schemas.ts'
+// ============================================================================
+// Types
+// ============================================================================
+/**
+ * Options accepted by {@link runSummarize}.
+ */
+export type SummarizeConfig = {
+  /** Location of the JSONL file holding full capture results */
+  resultsPath: string
+  /** Destination file; when omitted the formatted output is printed to stdout */
+  outputPath?: string
+  /** Emit markdown rather than summary JSONL (treated as false when omitted) */
+  markdown?: boolean
+}
+// ============================================================================
+// Helpers
+// ============================================================================
+/**
+ * Resolve a path against the current working directory.
+ *
+ * @remarks
+ * Uses `node:path` rather than a leading-slash check so absolute paths are
+ * recognised on every platform (e.g. Windows drive-letter paths) and relative
+ * segments such as `./` are normalized instead of being concatenated verbatim.
+ *
+ * @param path - Absolute or cwd-relative file path
+ * @returns `path` unchanged when it is already absolute, otherwise an absolute path under `process.cwd()`
+ */
+const resolvePath = (path: string): string => {
+  if (isAbsolute(path)) return path
+  return resolve(process.cwd(), path)
+}
+/**
+ * Load capture results from a JSONL file.
+ *
+ * @remarks
+ * Blank (or whitespace-only) lines are skipped, but the line number reported
+ * in an error refers to the position in the file as written, so it is taken
+ * from the unfiltered split rather than from the list of non-empty lines.
+ *
+ * @param path - Path to the results JSONL file
+ * @returns Parsed and schema-validated capture results, in file order
+ * @throws Error naming the 1-based file line that failed JSON parsing or schema validation
+ */
+const loadResults = async (path: string): Promise<CaptureResult[]> => {
+  const content = await Bun.file(path).text()
+  const results: CaptureResult[] = []
+  // Accept both LF and CRLF line endings
+  for (const [index, rawLine] of content.split(/\r?\n/).entries()) {
+    const line = rawLine.trim()
+    if (!line) continue
+    try {
+      results.push(CaptureResultSchema.parse(JSON.parse(line)))
+    } catch (error) {
+      throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
+    }
+  }
+  return results
+}
+/**
+ * Reduce a full capture result to its compact summary form.
+ *
+ * @param result - Capture result including the full trajectory
+ * @returns Summary carrying id, input, output, the names of tool calls in
+ * trajectory order, and the elapsed time (`timing.end - timing.start`)
+ *
+ * @public
+ */
+export const formatSummary = (result: CaptureResult): SummaryResult => {
+  const { id, input, output, trajectory, timing } = result
+  // Narrowing on the `type` discriminant yields the tool name without a cast
+  const toolCalls = trajectory.flatMap((step) => (step.type === 'tool_call' ? [step.name] : []))
+  const duration = timing.end - timing.start
+  return { id, input, output, toolCalls, duration }
+}
+/** Cut `text` to at most `limit` characters, appending `...` when anything was dropped. */
+const truncateWithEllipsis = (text: string, limit: number): string =>
+  text.length > limit ? `${text.slice(0, limit)}...` : text
+/**
+ * Pick the code-fence language tag for a tool call's content.
+ *
+ * @remarks
+ * Only the final path segment is inspected, so dots in directory names are
+ * ignored and extension-less files (e.g. `Makefile`) get an empty tag rather
+ * than their whole path. Falls back to `typescript` when there is no file path.
+ */
+const fenceLanguage = (filePath: string | null | undefined): string => {
+  if (typeof filePath !== 'string') return 'typescript'
+  const fileName = filePath.split(/[\\/]/).pop() ?? ''
+  const dotIndex = fileName.lastIndexOf('.')
+  return dotIndex >= 0 ? fileName.slice(dotIndex + 1) : ''
+}
+/** Render a metadata value for display; objects and arrays are JSON-encoded instead of printing `[object Object]`. */
+const formatMetadataValue = (value: unknown): string =>
+  typeof value === 'object' && value !== null ? JSON.stringify(value) : String(value)
+/**
+ * Format capture result as markdown with step IDs.
+ *
+ * @remarks
+ * Each rendered trajectory step is numbered and tagged with a
+ * `<result id>-step-<n>` reference. Thoughts and messages are cut to 100
+ * characters, plan summaries to 80 and the final output to 200. Tool calls
+ * additionally list the file path and a fenced preview of their content when
+ * those can be extracted from the tool input.
+ *
+ * @param result - Full capture result
+ * @returns Markdown formatted string ending with a `---` separator
+ *
+ * @public
+ */
+export const formatMarkdown = (result: CaptureResult): string => {
+  const lines: string[] = [
+    `## Evaluation Record: ${result.id}`,
+    '',
+    `**Input:** ${result.input}`,
+    '',
+    '**Trajectory:**',
+  ]
+  let stepNum = 1
+  for (const step of result.trajectory) {
+    const stepId = `${result.id}-step-${stepNum}`
+    if (step.type === 'thought') {
+      lines.push(`${stepNum}. [THOUGHT] ${truncateWithEllipsis(step.content, 100)} [→${stepId}]`)
+    } else if (step.type === 'tool_call') {
+      // Compare by type so a measured duration of 0ms is still reported
+      const duration = typeof step.duration === 'number' ? ` (${step.duration}ms)` : ''
+      const filePath = extractFilePath(step.input)
+      const content = extractContent(step.input)
+      lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)
+      // Add file path if present
+      if (filePath) {
+        const charCount = content?.length ?? 0
+        lines.push(`   File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
+      }
+      // Add head/tail preview for content-producing tools
+      if (content && content.length > 0) {
+        const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content, HEAD_LINES, TAIL_LINES) : content
+        lines.push(`   \`\`\`${fenceLanguage(filePath)}`)
+        lines.push(`   ${preview.split('\n').join('\n   ')}`)
+        lines.push('   ```')
+      }
+    } else if (step.type === 'plan') {
+      const entries = step.entries as Array<{ content: string; status: string }>
+      const planSummary = entries.map((e) => `${e.content}: ${e.status}`).join(', ')
+      lines.push(`${stepNum}. [PLAN] ${truncateWithEllipsis(planSummary, 80)} [→${stepId}]`)
+    } else if (step.type === 'message') {
+      lines.push(`${stepNum}. [MESSAGE] ${truncateWithEllipsis(step.content, 100)} [→${stepId}]`)
+    } else {
+      // Any other step kind is not rendered and does not consume a step number
+      continue
+    }
+    stepNum++
+  }
+  lines.push('')
+  lines.push(`**Output:** ${truncateWithEllipsis(result.output, 200)}`)
+  lines.push('')
+  const metadataStr = Object.entries(result.metadata)
+    .map(([k, v]) => `${k}=${formatMetadataValue(v)}`)
+    .join(', ')
+  lines.push(`**Metadata:** ${metadataStr}`)
+  lines.push(`**Tool Errors:** ${result.toolErrors}`)
+  lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)
+  if (result.score) {
+    lines.push(`**Score:** ${result.score.pass ? 'PASS' : 'FAIL'} (${result.score.score})`)
+    if (result.score.reasoning) {
+      lines.push(`**Reasoning:** ${result.score.reasoning}`)
+    }
+  }
+  lines.push('')
+  lines.push('---')
+  lines.push('')
+  return lines.join('\n')
+}
+// ============================================================================
+// Summarize Implementation
+// ============================================================================
+/**
+ * Run the summarize pipeline for an explicit configuration.
+ *
+ * @remarks
+ * The formatted text is written to `outputPath` when one is given and printed
+ * to stdout otherwise; in both cases it is also returned to the caller.
+ *
+ * @param config - Summarize configuration
+ * @returns The formatted output (markdown or summary JSONL)
+ */
+export const runSummarize = async (config: SummarizeConfig): Promise<string> => {
+  const wantsMarkdown = config.markdown ?? false
+  const records = await loadResults(config.resultsPath)
+  // One rendered chunk per record: a markdown section or a single JSON line
+  const chunks = records.map((record) =>
+    wantsMarkdown ? formatMarkdown(record) : JSON.stringify(formatSummary(record)),
+  )
+  const output = chunks.join('\n')
+  if (config.outputPath) {
+    await Bun.write(resolvePath(config.outputPath), output)
+    return output
+  }
+  // biome-ignore lint/suspicious/noConsole: CLI stdout output
+  console.log(output)
+  return output
+}
+// ============================================================================
+// CLI Entry Point
+// ============================================================================
+/**
+ * CLI handler for the `summarize` subcommand.
+ *
+ * @remarks
+ * Prints usage and returns when `--help` is set; exits the process with
+ * code 1 when no results path is supplied.
+ *
+ * @param args - Command line arguments (after 'summarize')
+ */
+export const summarize = async (args: string[]): Promise<void> => {
+  const parsed = parseArgs({
+    args,
+    options: {
+      output: { type: 'string', short: 'o' },
+      markdown: { type: 'boolean', short: 'm', default: false },
+      help: { type: 'boolean', short: 'h' },
+    },
+    allowPositionals: true,
+  })
+  const { output, markdown, help } = parsed.values
+  if (help) {
+    // biome-ignore lint/suspicious/noConsole: CLI help output
+    console.log(`
+Usage: acp-harness summarize <results.jsonl> [options]
+Arguments:
+  results.jsonl     Input file with capture results
+Options:
+  -o, --output      Output file (default: stdout)
+  -m, --markdown    Output as markdown instead of JSONL
+  -h, --help        Show this help message
+Output Formats:
+  JSONL (default): Compact summary with id, input, output, toolCalls, duration
+  Markdown (-m):   Human-readable format with step IDs for LLM-as-judge
+Examples:
+  # Summary JSONL for jq analysis
+  acp-harness summarize results.jsonl -o summary.jsonl
+  # Markdown for LLM evaluation
+  acp-harness summarize results.jsonl --markdown -o results.md
+`)
+    return
+  }
+  const [resultsPath] = parsed.positionals
+  if (!resultsPath) {
+    // biome-ignore lint/suspicious/noConsole: CLI error output
+    console.error('Error: results.jsonl path is required')
+    process.exit(1)
+  }
+  await runSummarize({ resultsPath, outputPath: output, markdown: markdown ?? false })
+}

package/src/tests/adapter-check.spec.ts ADDED Viewed

@@ -0,0 +1,70 @@
+/**
+ * Tests for adapter compliance checking functionality.
+ */
+import { describe, expect, test } from 'bun:test'
+import { type CheckConfig, runCheck } from '../adapter-check.ts'
+describe('runCheck', () => {
+  /** Build a config for `command` using the suite-wide 1000ms timeout. */
+  const configFor = (command: string[], verbose = false): CheckConfig => ({ command, timeout: 1000, verbose })
+  test('fails spawn check for non-existent command', async () => {
+    const { passed, checks } = await runCheck(configFor(['nonexistent-command-xyz']))
+    expect(passed).toBe(false)
+    expect(checks.length).toBeGreaterThanOrEqual(1)
+    const [first] = checks
+    expect(first?.name).toBe('spawn')
+    expect(first?.passed).toBe(false)
+  })
+  test('fails spawn check for command that exits immediately', async () => {
+    // `false` is a Unix command that exits with code 1
+    const outcome = await runCheck(configFor(['false']))
+    expect(outcome.passed).toBe(false)
+    expect(outcome.summary.failed).toBeGreaterThanOrEqual(1)
+  })
+  test('returns structured result with summary', async () => {
+    const outcome = await runCheck(configFor(['echo', 'test']))
+    for (const key of ['passed', 'checks', 'summary']) {
+      expect(outcome).toHaveProperty(key)
+    }
+    for (const key of ['total', 'passed', 'failed']) {
+      expect(outcome.summary).toHaveProperty(key)
+    }
+    expect(typeof outcome.passed).toBe('boolean')
+    expect(Array.isArray(outcome.checks)).toBe(true)
+  })
+  test('includes verbose details when enabled', async () => {
+    const outcome = await runCheck(configFor(['echo', 'test'], true))
+    // Only the presence of the spawn check is asserted here;
+    // details may or may not be present depending on check outcome
+    const spawnCheck = outcome.checks.find((check) => check.name === 'spawn')
+    expect(spawnCheck).toBeDefined()
+  })
+})

package/src/tests/adapter-scaffold.spec.ts ADDED Viewed

@@ -0,0 +1,112 @@
+/**
+ * Tests for adapter scaffolding functionality.
+ */
+import { afterEach, describe, expect, test } from 'bun:test'
+import { rm } from 'node:fs/promises'
+import { join } from 'node:path'
+import { runScaffold, type ScaffoldConfig } from '../adapter-scaffold.ts'
+/** Scratch directory the scaffolder writes into; removed after every test. */
+const testDir = join(import.meta.dir, 'fixtures', 'scaffold-output')
+describe('runScaffold', () => {
+  /** Run the scaffolder into `testDir` with the given name, language and minimal flag. */
+  const scaffold = (name: string, lang: ScaffoldConfig['lang'], minimal: boolean) => {
+    const config: ScaffoldConfig = { name, outputDir: testDir, lang, minimal }
+    return runScaffold(config)
+  }
+  /** Read a generated file (path segments relative to `testDir`) as text. */
+  const readGenerated = (...segments: string[]): Promise<string> => Bun.file(join(testDir, ...segments)).text()
+  afterEach(async () => {
+    // Clean up test output
+    await rm(testDir, { recursive: true, force: true })
+  })
+  test('generates TypeScript adapter structure', async () => {
+    const result = await scaffold('test-agent', 'ts', false)
+    expect(result.outputDir).toBe(testDir)
+    expect(result.lang).toBe('ts')
+    const expectedFiles = [
+      'package.json',
+      'tsconfig.json',
+      'src/main.ts',
+      'src/types.ts',
+      'src/session-manager.ts',
+      'src/handlers/initialize.ts',
+      'src/handlers/session-new.ts',
+      'src/handlers/session-prompt.ts',
+      'src/handlers/session-cancel.ts',
+      'README.md',
+    ]
+    for (const file of expectedFiles) {
+      expect(result.files).toContain(file)
+    }
+    // Verify files actually exist
+    const packageJson = await readGenerated('package.json')
+    expect(packageJson).toContain('"test-agent-acp"')
+    const mainTs = await readGenerated('src', 'main.ts')
+    expect(mainTs).toContain('#!/usr/bin/env bun')
+    expect(mainTs).toContain('handleInitialize')
+  })
+  test('generates minimal TypeScript structure without README', async () => {
+    const { files } = await scaffold('minimal-agent', 'ts', true)
+    expect(files).not.toContain('README.md')
+    expect(files).toContain('package.json')
+    expect(files).toContain('src/main.ts')
+  })
+  test('generates Python adapter structure', async () => {
+    const result = await scaffold('python-agent', 'python', false)
+    expect(result.lang).toBe('python')
+    expect(result.files).toContain('adapter.py')
+    expect(result.files).toContain('README.md')
+    const adapterPy = await readGenerated('adapter.py')
+    for (const snippet of ['#!/usr/bin/env python3', 'python-agent', 'def handle_initialize']) {
+      expect(adapterPy).toContain(snippet)
+    }
+  })
+  test('generates minimal Python structure without README', async () => {
+    const { files } = await scaffold('minimal-python', 'python', true)
+    expect(files).toContain('adapter.py')
+    expect(files).not.toContain('README.md')
+  })
+  test('package.json contains correct name', async () => {
+    await scaffold('my-special-agent', 'ts', true)
+    const packageJson = JSON.parse(await readGenerated('package.json'))
+    expect(packageJson.name).toBe('my-special-agent-acp')
+  })
+})