npm - @plaited/acp-harness - Versions diffs - 0.2.6 → 0.3.2 - Mend

@plaited/acp-harness 0.2.6 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

package/LICENSE +1 -1
package/README.md +175 -34
package/bin/cli.ts +105 -636
package/bin/tests/cli.spec.ts +218 -51
package/package.json +21 -5
package/src/acp-client.ts +5 -4
package/src/acp-transport.ts +14 -7
package/src/adapter-check.ts +542 -0
package/src/adapter-scaffold.ts +934 -0
package/src/balance.ts +257 -0
package/src/calibrate.ts +319 -0
package/src/capture.ts +457 -0
package/src/constants.ts +94 -0
package/src/grader-loader.ts +174 -0
package/src/harness.ts +35 -0
package/src/schemas-cli.ts +239 -0
package/src/schemas.ts +567 -0
package/src/summarize.ts +259 -0
package/src/tests/adapter-check.spec.ts +70 -0
package/src/tests/adapter-scaffold.spec.ts +112 -0
package/src/tests/balance-helpers.spec.ts +279 -0
package/src/tests/calibrate-helpers.spec.ts +226 -0
package/src/tests/capture-helpers.spec.ts +553 -0
package/src/tests/fixtures/grader-bad-module.ts +5 -0
package/src/tests/fixtures/grader-exec-fail.py +9 -0
package/src/tests/fixtures/grader-exec-invalid.py +6 -0
package/src/tests/fixtures/grader-exec.py +29 -0
package/src/tests/fixtures/grader-module.ts +14 -0
package/src/tests/grader-loader.spec.ts +153 -0
package/src/tests/summarize-helpers.spec.ts +339 -0
package/src/tests/trials-calculations.spec.ts +209 -0
package/src/trials.ts +407 -0
package/src/validate-refs.ts +188 -0
package/.claude/rules/accuracy.md +0 -43
package/.claude/rules/bun-apis.md +0 -80
package/.claude/rules/code-review.md +0 -254
package/.claude/rules/git-workflow.md +0 -37
package/.claude/rules/github.md +0 -154
package/.claude/rules/testing.md +0 -172
package/.claude/skills/acp-harness/SKILL.md +0 -310
package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
package/.claude/skills/acp-harness/references/downstream.md +0 -288
package/.claude/skills/acp-harness/references/output-formats.md +0 -221
package/.claude-plugin/marketplace.json +0 -15
package/.claude-plugin/plugin.json +0 -16
package/.github/CODEOWNERS +0 -6
package/.github/workflows/ci.yml +0 -63
package/.github/workflows/publish.yml +0 -146
package/.mcp.json +0 -20
package/CLAUDE.md +0 -92
package/Dockerfile.test +0 -23
package/biome.json +0 -96
package/bun.lock +0 -513
package/docker-compose.test.yml +0 -21
package/scripts/bun-test-wrapper.sh +0 -46
package/src/acp.constants.ts +0 -56
package/src/acp.schemas.ts +0 -161
package/src/acp.types.ts +0 -28
package/src/tests/fixtures/.claude/settings.local.json +0 -8
package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
package/tsconfig.json +0 -32

package/src/balance.ts ADDED Viewed

@@ -0,0 +1,257 @@
+/**
+ * Balance command - analyze test set coverage.
+ *
+ * @remarks
+ * Analyzes the distribution of test cases by metadata categories.
+ * Identifies underrepresented categories and suggests improvements.
+ *
+ * @packageDocumentation
+ */
+import { parseArgs } from 'node:util'
+import { loadPrompts } from './capture.ts'
+import type { BalanceAnalysis, CategoryDistribution, PromptCase } from './schemas.ts'
+// ============================================================================
+// Types
+// ============================================================================
+/**
+ * Configuration for balance command.
+ *
+ * @remarks
+ * Consumed by `runBalance`, which fills in the defaults noted on each field.
+ */
+export type BalanceConfig = {
+  /** Path to prompts.jsonl file */
+  promptsPath: string
+  /** Output file path; when omitted the JSON analysis is printed to stdout */
+  outputPath?: string
+  /** Metadata key to analyze (default: 'category') */
+  key?: string
+  /**
+   * Threshold for underrepresentation, as a percentage of the share each
+   * category would have under a perfectly even split (default: 50)
+   */
+  threshold?: number
+}
+// ============================================================================
+// Helpers
+// ============================================================================
+/**
+ * Anchor a path at process.cwd().
+ *
+ * Paths that already start with '/' are treated as absolute and returned untouched.
+ */
+const resolvePath = (path: string): string => (path.startsWith('/') ? path : `${process.cwd()}/${path}`)
+/**
+ * Tally prompts by the value stored under a metadata key.
+ *
+ * @remarks
+ * Values are stringified to form the category name; prompts whose metadata
+ * lacks the key (or that have no metadata) are grouped as '(uncategorized)'.
+ * Percentages are rounded to the nearest whole number.
+ *
+ * @param prompts - Array of prompt cases
+ * @param key - Metadata key to analyze
+ * @returns Category distributions, largest count first
+ *
+ * @public
+ */
+export const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribution[] => {
+  const total = prompts.length
+  // A Map keeps first-seen order, which the stable sort preserves for ties
+  const tally = prompts.reduce((acc, { metadata }) => {
+    const raw = metadata?.[key]
+    const label = raw === undefined ? '(uncategorized)' : String(raw)
+    return acc.set(label, (acc.get(label) ?? 0) + 1)
+  }, new Map<string, number>())
+  return Array.from(tally, ([name, count]) => ({
+    name,
+    count,
+    percentage: Math.round((count / total) * 100),
+  })).sort((left, right) => right.count - left.count)
+}
+/**
+ * Pick out the categories whose share falls short of the expected one.
+ *
+ * @remarks
+ * The cutoff is `threshold` percent of the share every category would hold
+ * if cases were spread evenly; anything strictly below it is reported.
+ *
+ * @param distributions - Array of category distributions
+ * @param threshold - Percentage threshold relative to even distribution
+ * @returns Array of underrepresented category names
+ *
+ * @public
+ */
+export const findUnderrepresented = (distributions: CategoryDistribution[], threshold: number): string[] => {
+  const cutoff = (100 / distributions.length) * (threshold / 100)
+  const names: string[] = []
+  for (const { name, percentage } of distributions) {
+    if (percentage < cutoff) names.push(name)
+  }
+  return names
+}
+/**
+ * Build human-readable advice for evening out a test set.
+ *
+ * @remarks
+ * Checks, in order: underrepresented categories, a single category holding
+ * more than half the cases, categories with fewer than 3 cases, and a total
+ * below 20 cases. When none apply, a single "well-balanced" note is returned.
+ *
+ * @param distributions - Array of category distributions
+ * @param underrepresented - Array of underrepresented category names
+ * @param total - Total number of test cases
+ * @returns Array of suggestion strings
+ *
+ * @public
+ */
+export const generateSuggestions = (
+  distributions: CategoryDistribution[],
+  underrepresented: string[],
+  total: number,
+): string[] => {
+  const majority = distributions.find(({ percentage }) => percentage > 50)
+  const sparseNames = distributions.filter(({ count }) => count < 3).map(({ name }) => name)
+  const notes: string[] = []
+  if (underrepresented.length > 0) {
+    notes.push(`Consider adding more test cases for: ${underrepresented.join(', ')}`)
+  }
+  if (majority) {
+    notes.push(`Category '${majority.name}' has ${majority.percentage}% of cases - consider diversifying`)
+  }
+  if (sparseNames.length > 0) {
+    notes.push(`Categories with < 3 cases may not be reliable: ${sparseNames.join(', ')}`)
+  }
+  if (total < 20) {
+    notes.push(`Consider expanding test set (currently ${total} cases) for more statistical significance`)
+  }
+  return notes.length > 0 ? notes : ['Test set appears well-balanced']
+}
+// ============================================================================
+// Balance Implementation
+// ============================================================================
+/**
+ * Run the balance analysis described by a configuration object.
+ *
+ * @remarks
+ * The JSON analysis goes to `outputPath` when given, otherwise to stdout.
+ * Progress and the human-readable summary are always written to stderr so
+ * they never mix with the JSON on stdout.
+ *
+ * @param config - Balance configuration
+ * @returns Balance analysis result
+ */
+export const runBalance = async (config: BalanceConfig): Promise<BalanceAnalysis> => {
+  const metadataKey = config.key ?? 'category'
+  const cutoffPercent = config.threshold ?? 50
+  const prompts = await loadPrompts(config.promptsPath)
+  console.error(`Analyzing ${prompts.length} prompts by '${metadataKey}' metadata...`)
+  // Compute the distribution and everything derived from it
+  const categories = analyzeCategories(prompts, metadataKey)
+  const underrepresented = findUnderrepresented(categories, cutoffPercent)
+  const suggestions = generateSuggestions(categories, underrepresented, prompts.length)
+  const analysis: BalanceAnalysis = {
+    totalCases: prompts.length,
+    categories,
+    underrepresented,
+    suggestions,
+  }
+  // Machine-readable result: file if requested, stdout otherwise
+  const serialized = JSON.stringify(analysis, null, 2)
+  if (config.outputPath) {
+    await Bun.write(resolvePath(config.outputPath), serialized)
+  } else {
+    // biome-ignore lint/suspicious/noConsole: CLI stdout output
+    console.log(serialized)
+  }
+  // Human-readable summary on stderr; one bar segment per 5 percentage points
+  console.error('\nCategory Distribution:')
+  for (const { name, count, percentage } of categories) {
+    const bar = '█'.repeat(Math.round(percentage / 5))
+    console.error(`  ${name}: ${count} (${percentage}%) ${bar}`)
+  }
+  if (underrepresented.length > 0) {
+    console.error(`\nUnderrepresented: ${underrepresented.join(', ')}`)
+  }
+  console.error('\nSuggestions:')
+  for (const note of suggestions) {
+    console.error(`  - ${note}`)
+  }
+  return analysis
+}
+// ============================================================================
+// CLI Entry Point
+// ============================================================================
+/**
+ * Balance command CLI handler.
+ *
+ * @remarks
+ * Parses the arguments, prints usage for `--help`, and otherwise delegates to
+ * `runBalance`. Exits the process with code 1 when the prompts path is missing
+ * or when `--threshold` is not a non-negative number; without that check an
+ * unparsable threshold would become NaN and silently report no
+ * underrepresented categories.
+ *
+ * @param args - Command line arguments (after 'balance')
+ */
+export const balance = async (args: string[]): Promise<void> => {
+  const { values, positionals } = parseArgs({
+    args,
+    options: {
+      output: { type: 'string', short: 'o' },
+      key: { type: 'string', short: 'k', default: 'category' },
+      threshold: { type: 'string', short: 't', default: '50' },
+      help: { type: 'boolean', short: 'h' },
+    },
+    allowPositionals: true,
+  })
+  if (values.help) {
+    // biome-ignore lint/suspicious/noConsole: CLI help output
+    console.log(`
+Usage: acp-harness balance <prompts.jsonl> [options]
+Arguments:
+  prompts.jsonl     Input file with prompts
+Options:
+  -o, --output      Output file (default: stdout)
+  -k, --key         Metadata key to analyze (default: 'category')
+  -t, --threshold   Underrepresentation threshold % (default: 50)
+  -h, --help        Show this help message
+Output:
+  JSON with category distribution, underrepresented categories, and suggestions.
+Examples:
+  # Analyze by default 'category' key
+  acp-harness balance prompts.jsonl -o balance.json
+  # Analyze by custom metadata key
+  acp-harness balance prompts.jsonl --key difficulty -o balance.json
+`)
+    return
+  }
+  const promptsPath = positionals[0]
+  if (!promptsPath) {
+    console.error('Error: prompts.jsonl path is required')
+    process.exit(1)
+  }
+  // Reject garbage early: a NaN threshold makes every comparison false downstream
+  const threshold = Number.parseInt(values.threshold ?? '50', 10)
+  if (Number.isNaN(threshold) || threshold < 0) {
+    console.error(`Error: --threshold must be a non-negative number (received '${values.threshold}')`)
+    process.exit(1)
+  }
+  await runBalance({
+    promptsPath,
+    outputPath: values.output,
+    key: values.key ?? 'category',
+    threshold,
+  })
+}

package/src/calibrate.ts ADDED Viewed

@@ -0,0 +1,319 @@
+/**
+ * Calibrate command - sample failures for grader review.
+ *
+ * @remarks
+ * Helps identify grader bugs by sampling failures for human review.
+ * Can optionally re-score with a different grader for comparison.
+ *
+ * @packageDocumentation
+ */
+import { parseArgs } from 'node:util'
+import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from './constants.ts'
+import { loadGrader } from './grader-loader.ts'
+import type { CalibrationSample, CaptureResult, Grader, GraderResult, TrajectoryStep } from './schemas.ts'
+import { CaptureResultSchema } from './schemas.ts'
+// ============================================================================
+// Types
+// ============================================================================
+/**
+ * Configuration for calibrate command.
+ *
+ * @remarks
+ * Consumed by `runCalibrate`, which samples failed results and renders them
+ * as a markdown review report.
+ */
+export type CalibrateConfig = {
+  /** Path to results.jsonl file */
+  resultsPath: string
+  /** Output file path; when omitted the markdown report is printed to stdout */
+  outputPath?: string
+  /** Number of samples to include (default: DEFAULT_CALIBRATION_SAMPLE_SIZE) */
+  sample?: number
+  /** Optional grader for re-scoring each sampled failure alongside its original score */
+  grader?: Grader
+}
+// ============================================================================
+// Helpers
+// ============================================================================
+/**
+ * Anchor a path at process.cwd().
+ *
+ * Paths that already start with '/' are treated as absolute and returned untouched.
+ */
+const resolvePath = (path: string): string => {
+  const alreadyAbsolute = path.startsWith('/')
+  return alreadyAbsolute ? path : `${process.cwd()}/${path}`
+}
+/**
+ * Load capture results from JSONL file.
+ *
+ * @remarks
+ * Blank and whitespace-only lines are skipped. Line numbers in error messages
+ * refer to the physical line in the file, so they stay accurate even when the
+ * file contains blank lines.
+ *
+ * @param path - Path to the results JSONL file
+ * @returns Parsed and schema-validated capture results, in file order
+ * @throws Error naming the offending line when a line is not valid JSON or
+ *   does not satisfy CaptureResultSchema
+ */
+const loadResults = async (path: string): Promise<CaptureResult[]> => {
+  const content = await Bun.file(path).text()
+  const results: CaptureResult[] = []
+  // Index against the unfiltered lines so reported numbers match the file
+  for (const [index, rawLine] of content.split('\n').entries()) {
+    const line = rawLine.trim()
+    if (!line) continue
+    try {
+      results.push(CaptureResultSchema.parse(JSON.parse(line)))
+    } catch (error) {
+      throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
+    }
+  }
+  return results
+}
+/**
+ * Random sample from array.
+ *
+ * @remarks
+ * Uses a partial Fisher-Yates shuffle on a copy, so every subset is equally
+ * likely and the input array is never mutated. (Sorting with a random
+ * comparator is biased and relies on unspecified engine behavior.)
+ * The requested size is truncated to an integer and clamped to
+ * `[0, arr.length]`; NaN and negative values yield an empty sample.
+ *
+ * @param arr - Array to sample from
+ * @param n - Number of samples to take
+ * @returns Array of sampled elements
+ *
+ * @public
+ */
+export const sampleArray = <T>(arr: T[], n: number): T[] => {
+  const pool = [...arr]
+  const count = Math.max(0, Math.min(Math.trunc(n) || 0, pool.length))
+  // Only the first `count` positions need to be settled
+  for (let i = 0; i < count; i++) {
+    const j = i + Math.floor(Math.random() * (pool.length - i))
+    const picked = pool[j] as T
+    pool[j] = pool[i] as T
+    pool[i] = picked
+  }
+  return pool.slice(0, count)
+}
+/**
+ * Get snippet of trajectory for review.
+ *
+ * @remarks
+ * A trajectory no longer than `maxSteps` is returned as-is. Otherwise the
+ * snippet is taken from the edges plus the middle and never exceeds
+ * `maxSteps`: with the default of 5 (or more) it is the first 2 steps, the
+ * middle step, and the last 2 steps; smaller limits shrink the edges
+ * (4 → 2 + 2, 3 → 1 + middle + 1, 2 → 1 + 1, 1 → middle only). A limit
+ * below 1 yields an empty snippet.
+ *
+ * @param trajectory - Full trajectory
+ * @param maxSteps - Maximum number of steps to include
+ * @returns Trajectory snippet
+ *
+ * @public
+ */
+export const getTrajectorySnippet = (trajectory: TrajectoryStep[], maxSteps = 5): TrajectoryStep[] => {
+  if (trajectory.length <= maxSteps) return trajectory
+  // The snippet shape tops out at 2 + 1 + 2 steps
+  const budget = Math.min(Math.floor(maxSteps), 5)
+  if (!(budget > 0)) return []
+  const edge = Math.floor(budget / 2)
+  const snippet: TrajectoryStep[] = trajectory.slice(0, edge)
+  // An odd budget leaves room for the middle step
+  if (budget % 2 === 1) {
+    snippet.push(trajectory[Math.floor(trajectory.length / 2)] as TrajectoryStep)
+  }
+  // Avoid slice(-0), which would copy the whole array
+  snippet.push(...trajectory.slice(trajectory.length - edge))
+  return snippet
+}
+/** Shorten text to at most `limit` characters, appending '...' only when something was cut off */
+const clipText = (text: string, limit: number): string => (text.length > limit ? `${text.slice(0, limit)}...` : text)
+/**
+ * Format calibration samples as markdown.
+ *
+ * @remarks
+ * Produces a report header with review instructions followed by one section
+ * per sample: input, expected (when present), output (clipped to 500
+ * characters), original and optional re-scored grades, a trajectory snippet,
+ * and review checkboxes. Message and thought steps are clipped to 100
+ * characters; the ellipsis is added only when text was actually truncated, so
+ * a short message is not presented as if it had been cut.
+ *
+ * @param samples - Calibration samples to render
+ * @returns Markdown document as a single string
+ */
+const formatCalibrationMarkdown = (samples: CalibrationSample[]): string => {
+  const lines: string[] = [
+    '# Grader Calibration Report',
+    '',
+    `Generated: ${new Date().toISOString()}`,
+    `Samples: ${samples.length}`,
+    '',
+    '## Instructions',
+    '',
+    'Review each failure below and mark whether:',
+    '- [ ] **Valid failure** - Grader correctly identified a problem',
+    '- [ ] **Grader bug** - Output was actually correct, grader was wrong',
+    '- [ ] **Ambiguous** - Unclear if the output is correct or not',
+    '',
+    '---',
+    '',
+  ]
+  for (let i = 0; i < samples.length; i++) {
+    const sample = samples[i]
+    if (!sample) continue
+    lines.push(`## Sample ${i + 1}: ${sample.id}`)
+    lines.push('')
+    lines.push(`**Input:** ${sample.input}`)
+    lines.push('')
+    if (sample.expected) {
+      lines.push(`**Expected:** ${sample.expected}`)
+      lines.push('')
+    }
+    lines.push(`**Output:** ${clipText(sample.output, 500)}`)
+    lines.push('')
+    lines.push(`**Original Score:** ${sample.originalScore.pass ? 'PASS' : 'FAIL'} (${sample.originalScore.score})`)
+    if (sample.originalScore.reasoning) {
+      lines.push(`**Reasoning:** ${sample.originalScore.reasoning}`)
+    }
+    lines.push('')
+    if (sample.rescoredResult) {
+      lines.push(`**Re-scored:** ${sample.rescoredResult.pass ? 'PASS' : 'FAIL'} (${sample.rescoredResult.score})`)
+      if (sample.rescoredResult.reasoning) {
+        lines.push(`**Re-score Reasoning:** ${sample.rescoredResult.reasoning}`)
+      }
+      lines.push('')
+    }
+    lines.push('**Trajectory Snippet:**')
+    lines.push('```')
+    for (const step of sample.trajectorySnippet) {
+      if (step.type === 'tool_call') {
+        lines.push(`[${step.type}] ${step.name}: ${step.status}`)
+      } else if (step.type === 'message' || step.type === 'thought') {
+        lines.push(`[${step.type}] ${clipText(step.content, 100)}`)
+      } else if (step.type === 'plan') {
+        lines.push(`[${step.type}] ${(step.entries as Array<{ content: string }>).length} entries`)
+      }
+    }
+    lines.push('```')
+    lines.push('')
+    lines.push('**Review:**')
+    lines.push('- [ ] Valid failure')
+    lines.push('- [ ] Grader bug')
+    lines.push('- [ ] Ambiguous')
+    lines.push('')
+    lines.push('---')
+    lines.push('')
+  }
+  return lines.join('\n')
+}
+// ============================================================================
+// Calibrate Implementation
+// ============================================================================
+/**
+ * Run calibration as described by a configuration object.
+ *
+ * @remarks
+ * Only results that carry a score and did not pass are considered; unscored
+ * results are ignored. A random subset of those failures is turned into
+ * calibration samples (optionally re-scored, one at a time, with the supplied
+ * grader) and rendered as a markdown report written to `outputPath` or stdout.
+ * When there are no failures, a notice goes to stderr and nothing is written.
+ *
+ * @param config - Calibrate configuration
+ * @returns Calibration samples
+ */
+export const runCalibrate = async (config: CalibrateConfig): Promise<CalibrationSample[]> => {
+  const { resultsPath, outputPath, sample = DEFAULT_CALIBRATION_SAMPLE_SIZE, grader } = config
+  const allResults = await loadResults(resultsPath)
+  // Keep scored results whose grade is a failure
+  const failed = allResults.filter((entry) => entry.score && !entry.score.pass)
+  if (failed.length === 0) {
+    console.error('No failures found in results')
+    return []
+  }
+  const picked = sampleArray(failed, Math.min(sample, failed.length))
+  const samples: CalibrationSample[] = []
+  for (const { id, input, output, expected, score, trajectory } of picked) {
+    const entry: CalibrationSample = {
+      id,
+      input,
+      output,
+      expected,
+      // The filter above guarantees a score is present
+      originalScore: score as GraderResult,
+      trajectorySnippet: getTrajectorySnippet(trajectory),
+    }
+    // Second opinion from the alternative grader, when one was supplied
+    if (grader) {
+      entry.rescoredResult = await grader({ input, output, expected, trajectory })
+    }
+    samples.push(entry)
+  }
+  const report = formatCalibrationMarkdown(samples)
+  if (outputPath) {
+    await Bun.write(resolvePath(outputPath), report)
+  } else {
+    // biome-ignore lint/suspicious/noConsole: CLI stdout output
+    console.log(report)
+  }
+  return samples
+}
+// ============================================================================
+// CLI Entry Point
+// ============================================================================
+/**
+ * Calibrate command CLI handler.
+ *
+ * @remarks
+ * Parses the arguments, prints usage for `--help`, optionally loads an
+ * alternative grader, and delegates to `runCalibrate`. Exits the process with
+ * code 1 when the results path is missing, when `--sample` is not a positive
+ * integer, or when the grader cannot be loaded. Without the `--sample` check
+ * an unparsable value would become NaN and silently produce an empty report.
+ *
+ * @param args - Command line arguments (after 'calibrate')
+ */
+export const calibrate = async (args: string[]): Promise<void> => {
+  const { values, positionals } = parseArgs({
+    args,
+    options: {
+      output: { type: 'string', short: 'o' },
+      sample: { type: 'string', short: 's', default: String(DEFAULT_CALIBRATION_SAMPLE_SIZE) },
+      grader: { type: 'string', short: 'g' },
+      help: { type: 'boolean', short: 'h' },
+    },
+    allowPositionals: true,
+  })
+  if (values.help) {
+    // biome-ignore lint/suspicious/noConsole: CLI help output
+    console.log(`
+Usage: acp-harness calibrate <results.jsonl> [options]
+Arguments:
+  results.jsonl     Input file with scored capture results
+Options:
+  -o, --output      Output file (default: stdout)
+  -s, --sample      Number of failures to sample (default: ${DEFAULT_CALIBRATION_SAMPLE_SIZE})
+  -g, --grader      Path to alternative grader (.ts/.js module or executable script)
+  -h, --help        Show this help message
+Output:
+  Markdown report with sampled failures for human review.
+  Includes checkboxes for labeling (valid failure / grader bug / ambiguous).
+Examples:
+  # Sample failures for review
+  acp-harness calibrate results.jsonl --sample 10 -o calibration.md
+  # Re-score with different grader to compare
+  acp-harness calibrate results.jsonl --grader ./loose-grader.ts -o comparison.md
+`)
+    return
+  }
+  const resultsPath = positionals[0]
+  if (!resultsPath) {
+    console.error('Error: results.jsonl path is required')
+    process.exit(1)
+  }
+  // Reject garbage early: NaN or a value below 1 would yield an empty or odd sample
+  const sample = Number.parseInt(values.sample ?? String(DEFAULT_CALIBRATION_SAMPLE_SIZE), 10)
+  if (Number.isNaN(sample) || sample < 1) {
+    console.error(`Error: --sample must be a positive integer (received '${values.sample}')`)
+    process.exit(1)
+  }
+  // Load grader if specified
+  let grader: Grader | undefined
+  if (values.grader) {
+    try {
+      grader = await loadGrader(values.grader)
+    } catch (error) {
+      console.error(`Error: ${error instanceof Error ? error.message : error}`)
+      process.exit(1)
+    }
+  }
+  await runCalibrate({
+    resultsPath,
+    outputPath: values.output,
+    sample,
+    grader,
+  })
+}