@plaited/acp-harness 0.2.6 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +120 -16
  3. package/bin/cli.ts +105 -636
  4. package/bin/tests/cli.spec.ts +218 -51
  5. package/package.json +20 -4
  6. package/src/acp-client.ts +5 -4
  7. package/src/acp-transport.ts +14 -7
  8. package/src/adapter-check.ts +542 -0
  9. package/src/adapter-scaffold.ts +934 -0
  10. package/src/balance.ts +232 -0
  11. package/src/calibrate.ts +300 -0
  12. package/src/capture.ts +457 -0
  13. package/src/constants.ts +94 -0
  14. package/src/grader-loader.ts +174 -0
  15. package/src/harness.ts +35 -0
  16. package/src/schemas-cli.ts +239 -0
  17. package/src/schemas.ts +567 -0
  18. package/src/summarize.ts +245 -0
  19. package/src/tests/adapter-check.spec.ts +70 -0
  20. package/src/tests/adapter-scaffold.spec.ts +112 -0
  21. package/src/tests/fixtures/grader-bad-module.ts +5 -0
  22. package/src/tests/fixtures/grader-exec-fail.py +9 -0
  23. package/src/tests/fixtures/grader-exec-invalid.py +6 -0
  24. package/src/tests/fixtures/grader-exec.py +29 -0
  25. package/src/tests/fixtures/grader-module.ts +14 -0
  26. package/src/tests/grader-loader.spec.ts +153 -0
  27. package/src/trials.ts +395 -0
  28. package/src/validate-refs.ts +188 -0
  29. package/.claude/rules/accuracy.md +0 -43
  30. package/.claude/rules/bun-apis.md +0 -80
  31. package/.claude/rules/code-review.md +0 -254
  32. package/.claude/rules/git-workflow.md +0 -37
  33. package/.claude/rules/github.md +0 -154
  34. package/.claude/rules/testing.md +0 -172
  35. package/.claude/skills/acp-harness/SKILL.md +0 -310
  36. package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
  37. package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
  38. package/.claude/skills/acp-harness/references/downstream.md +0 -288
  39. package/.claude/skills/acp-harness/references/output-formats.md +0 -221
  40. package/.claude-plugin/marketplace.json +0 -15
  41. package/.claude-plugin/plugin.json +0 -16
  42. package/.github/CODEOWNERS +0 -6
  43. package/.github/workflows/ci.yml +0 -63
  44. package/.github/workflows/publish.yml +0 -146
  45. package/.mcp.json +0 -20
  46. package/CLAUDE.md +0 -92
  47. package/Dockerfile.test +0 -23
  48. package/biome.json +0 -96
  49. package/bun.lock +0 -513
  50. package/docker-compose.test.yml +0 -21
  51. package/scripts/bun-test-wrapper.sh +0 -46
  52. package/src/acp.constants.ts +0 -56
  53. package/src/acp.schemas.ts +0 -161
  54. package/src/acp.types.ts +0 -28
  55. package/src/tests/fixtures/.claude/settings.local.json +0 -8
  56. package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
  57. package/tsconfig.json +0 -32
package/src/balance.ts ADDED
@@ -0,0 +1,232 @@
1
+ /**
2
+ * Balance command - analyze test set coverage.
3
+ *
4
+ * @remarks
5
+ * Analyzes the distribution of test cases by metadata categories.
6
+ * Identifies underrepresented categories and suggests improvements.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import { parseArgs } from 'node:util'
12
+ import { loadPrompts } from './capture.ts'
13
+ import type { BalanceAnalysis, CategoryDistribution, PromptCase } from './schemas.ts'
14
+
15
+ // ============================================================================
16
+ // Types
17
+ // ============================================================================
18
+
19
/**
 * Options accepted by the balance command.
 */
export type BalanceConfig = {
  /** Location of the prompts.jsonl file to analyze */
  promptsPath: string
  /** Destination for the JSON analysis; when omitted the JSON is printed to stdout */
  outputPath?: string
  /** Metadata key whose values name the categories (default: 'category') */
  key?: string
  /** Underrepresentation threshold, as a percentage of an even share (default: 50) */
  threshold?: number
}
30
+
31
+ // ============================================================================
32
+ // Helpers
33
+ // ============================================================================
34
+
35
/** Anchor a relative path at process.cwd(); paths starting with '/' pass through unchanged */
const resolvePath = (path: string): string => {
  const isRooted = path.startsWith('/')
  return isRooted ? path : `${process.cwd()}/${path}`
}
40
+
41
/**
 * Count prompts per category and compute each category's share of the set.
 *
 * @remarks
 * A prompt whose metadata lacks the key, or carries `null` for it (the only
 * way a JSON line can spell "no value"), is grouped under '(uncategorized)'
 * instead of producing a category literally named "null".
 *
 * @param prompts - Prompt cases to analyze
 * @param key - Metadata key whose value names the category
 * @returns One entry per category, sorted by count descending
 */
const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribution[] => {
  const counts = new Map<string, number>()

  for (const prompt of prompts) {
    const value = prompt.metadata?.[key]
    const isMissing = value === undefined || value === null
    const category = isMissing ? '(uncategorized)' : String(value)
    counts.set(category, (counts.get(category) ?? 0) + 1)
  }

  const total = prompts.length

  // counts only has entries when total > 0, so the division below is safe
  return [...counts]
    .map(([name, count]) => ({
      name,
      count,
      percentage: Math.round((count / total) * 100),
    }))
    .sort((a, b) => b.count - a.count)
}
67
+
68
/**
 * List the categories whose share falls short of the threshold.
 *
 * @param distributions - Per-category counts and percentages
 * @param threshold - Percentage of an even share below which a category is flagged
 * @returns Names of the flagged categories, in input order
 */
const findUnderrepresented = (distributions: CategoryDistribution[], threshold: number): string[] => {
  // Share every category would hold if the set were split evenly
  const evenPercentage = 100 / distributions.length
  const cutoff = evenPercentage * (threshold / 100)

  const flagged: string[] = []
  for (const { name, percentage } of distributions) {
    if (percentage < cutoff) flagged.push(name)
  }
  return flagged
}
75
+
76
/**
 * Turn the distribution into human-readable advice.
 *
 * @param distributions - Per-category counts and percentages
 * @param underrepresented - Names of categories already flagged as underrepresented
 * @param total - Total number of test cases
 * @returns At least one suggestion; a single "well-balanced" note when nothing is flagged
 */
const generateSuggestions = (
  distributions: CategoryDistribution[],
  underrepresented: string[],
  total: number,
): string[] => {
  const advice: string[] = []

  if (underrepresented.length > 0) {
    advice.push(`Consider adding more test cases for: ${underrepresented.join(', ')}`)
  }

  // First category holding more than half of all cases, if any
  const majority = distributions.find(({ percentage }) => percentage > 50)
  if (majority !== undefined) {
    advice.push(`Category '${majority.name}' has ${majority.percentage}% of cases - consider diversifying`)
  }

  // Categories with fewer than three cases
  const sparseNames = distributions.filter(({ count }) => count < 3).map(({ name }) => name)
  if (sparseNames.length > 0) {
    advice.push(`Categories with < 3 cases may not be reliable: ${sparseNames.join(', ')}`)
  }

  // Overall size of the test set
  if (total < 20) {
    advice.push(`Consider expanding test set (currently ${total} cases) for more statistical significance`)
  }

  return advice.length > 0 ? advice : ['Test set appears well-balanced']
}
111
+
112
+ // ============================================================================
113
+ // Balance Implementation
114
+ // ============================================================================
115
+
116
/**
 * Run the balance analysis described by a configuration object.
 *
 * @remarks
 * The JSON analysis is written to `outputPath` when one is given, otherwise
 * to stdout. Progress and a human-readable summary always go to stderr.
 *
 * @param config - Balance configuration
 * @returns Balance analysis result
 */
export const runBalance = async (config: BalanceConfig): Promise<BalanceAnalysis> => {
  const { promptsPath, outputPath, key = 'category', threshold = 50 } = config

  const prompts = await loadPrompts(promptsPath)
  const totalCases = prompts.length

  console.error(`Analyzing ${totalCases} prompts by '${key}' metadata...`)

  // Distribution first, then the checks derived from it
  const categories = analyzeCategories(prompts, key)
  const underrepresented = findUnderrepresented(categories, threshold)
  const suggestions = generateSuggestions(categories, underrepresented, totalCases)

  const analysis: BalanceAnalysis = { totalCases, categories, underrepresented, suggestions }
  const output = JSON.stringify(analysis, null, 2)

  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(output)
  } else {
    await Bun.write(resolvePath(outputPath), output)
  }

  // Human-readable summary on stderr
  console.error('\nCategory Distribution:')
  for (const { name, count, percentage } of categories) {
    // One block character per five percentage points
    const bar = '█'.repeat(Math.round(percentage / 5))
    console.error(` ${name}: ${count} (${percentage}%) ${bar}`)
  }

  if (underrepresented.length > 0) {
    console.error(`\nUnderrepresented: ${underrepresented.join(', ')}`)
  }

  console.error('\nSuggestions:')
  for (const item of suggestions) {
    console.error(` - ${item}`)
  }

  return analysis
}
171
+
172
+ // ============================================================================
173
+ // CLI Entry Point
174
+ // ============================================================================
175
+
176
/**
 * Balance command CLI handler.
 *
 * @remarks
 * Parses the arguments, prints usage for `--help`, and otherwise delegates to
 * `runBalance`. Exits with status 1 when the prompts path is missing or when
 * `--threshold` does not parse to a non-negative number.
 *
 * @param args - Command line arguments (after 'balance')
 */
export const balance = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      key: { type: 'string', short: 'k', default: 'category' },
      threshold: { type: 'string', short: 't', default: '50' },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: acp-harness balance <prompts.jsonl> [options]

Arguments:
prompts.jsonl Input file with prompts

Options:
-o, --output Output file (default: stdout)
-k, --key Metadata key to analyze (default: 'category')
-t, --threshold Underrepresentation threshold % (default: 50)
-h, --help Show this help message

Output:
JSON with category distribution, underrepresented categories, and suggestions.

Examples:
# Analyze by default 'category' key
acp-harness balance prompts.jsonl -o balance.json

# Analyze by custom metadata key
acp-harness balance prompts.jsonl --key difficulty -o balance.json
`)
    return
  }

  const promptsPath = positionals[0]
  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  // parseInt yields NaN for non-numeric input. A NaN threshold makes every
  // "percentage < cutoff" comparison false, so the analysis would silently
  // report no underrepresented categories; fail loudly instead.
  const threshold = Number.parseInt(values.threshold ?? '50', 10)
  if (Number.isNaN(threshold) || threshold < 0) {
    console.error(`Error: --threshold must be a non-negative number (got '${values.threshold}')`)
    process.exit(1)
  }

  await runBalance({
    promptsPath,
    outputPath: values.output,
    key: values.key ?? 'category',
    threshold,
  })
}
package/src/calibrate.ts ADDED
@@ -0,0 +1,300 @@
1
+ /**
2
+ * Calibrate command - sample failures for grader review.
3
+ *
4
+ * @remarks
5
+ * Helps identify grader bugs by sampling failures for human review.
6
+ * Can optionally re-score with a different grader for comparison.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import { parseArgs } from 'node:util'
12
+ import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from './constants.ts'
13
+ import { loadGrader } from './grader-loader.ts'
14
+ import type { CalibrationSample, CaptureResult, Grader, GraderResult, TrajectoryStep } from './schemas.ts'
15
+ import { CaptureResultSchema } from './schemas.ts'
16
+
17
+ // ============================================================================
18
+ // Types
19
+ // ============================================================================
20
+
21
/**
 * Options accepted by the calibrate command.
 */
export type CalibrateConfig = {
  /** Location of the results.jsonl file to read */
  resultsPath: string
  /** Destination for the markdown report; when omitted the report is printed to stdout */
  outputPath?: string
  /** How many failures to sample (default: DEFAULT_CALIBRATION_SAMPLE_SIZE) */
  sample?: number
  /** Grader used to re-score each sampled failure, if any */
  grader?: Grader
}
32
+
33
+ // ============================================================================
34
+ // Helpers
35
+ // ============================================================================
36
+
37
/** Anchor a relative path at process.cwd(); paths starting with '/' pass through unchanged */
const resolvePath = (path: string): string => {
  if (!path.startsWith('/')) {
    return `${process.cwd()}/${path}`
  }
  return path
}
42
+
43
/**
 * Load capture results from a JSONL file.
 *
 * @remarks
 * Blank and whitespace-only lines are skipped. Line numbers in error
 * messages are 1-based positions in the file itself, so they stay correct
 * even when blank lines precede the offending record.
 *
 * @param path - Path to the JSONL file
 * @returns Parsed and schema-validated results, one per non-blank line
 * @throws Error naming the file line of the first record that fails to parse or validate
 */
const loadResults = async (path: string): Promise<CaptureResult[]> => {
  const content = await Bun.file(path).text()
  const results: CaptureResult[] = []

  // Iterate the raw lines (not a pre-filtered list) so `index` is the real line position
  for (const [index, rawLine] of content.split('\n').entries()) {
    const line = rawLine.trim()
    if (!line) continue

    try {
      results.push(CaptureResultSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
    }
  }

  return results
}
58
+
59
/**
 * Draw a uniform random sample without replacement.
 *
 * @remarks
 * Uses a partial Fisher–Yates shuffle. Sorting with a random comparator
 * (`sort(() => 0.5 - Math.random())`) hands the sort an inconsistent
 * comparator, which produces a biased, engine-dependent ordering.
 *
 * @param arr - Source array (not mutated)
 * @param n - Number of items wanted; clamped to the range [0, arr.length]
 * @returns A new array holding the sampled items in random order
 */
const sampleArray = <T>(arr: T[], n: number): T[] => {
  const pool = [...arr]
  // `|| 0` maps NaN to an empty sample; negative requests clamp to 0
  const size = Math.max(0, Math.min(Math.trunc(n) || 0, pool.length))

  // Only the first `size` slots need to be settled
  for (let i = 0; i < size; i++) {
    const j = i + Math.floor(Math.random() * (pool.length - i))
    const picked = pool[j] as T
    pool[j] = pool[i] as T
    pool[i] = picked
  }

  return pool.slice(0, size)
}
64
+
65
/**
 * Pick a short, representative slice of a trajectory for review.
 *
 * @remarks
 * A trajectory that already fits within `maxSteps` is returned as-is.
 * Otherwise the snippet holds at most `maxSteps` distinct steps: leading
 * steps, one step from the middle, and trailing steps. With the default of
 * 5 that is the first two, the middle one, and the last two. When fewer than
 * three slots are available the middle step is dropped.
 *
 * @param trajectory - Full list of trajectory steps
 * @param maxSteps - Upper bound on the number of steps returned (default: 5)
 * @returns The selected steps, in their original order
 */
const getTrajectorySnippet = (trajectory: TrajectoryStep[], maxSteps = 5): TrajectoryStep[] => {
  if (trajectory.length <= maxSteps) return trajectory

  // Whole, non-negative slot count; NaN collapses to 0
  const limit = Math.max(0, Math.floor(maxSteps)) || 0
  if (limit === 0) return []
  if (limit === 1) return trajectory.slice(0, 1)
  if (limit === 2) return [...trajectory.slice(0, 1), ...trajectory.slice(-1)]

  // One slot is reserved for the middle step; the rest is split between the
  // two ends, with the head taking the extra slot when the split is uneven.
  const headCount = Math.ceil((limit - 1) / 2)
  const tailCount = limit - 1 - headCount
  const mid = Math.floor(trajectory.length / 2)

  // trajectory.length > limit guarantees headCount <= mid < length - tailCount,
  // so the three groups never overlap.
  return [
    ...trajectory.slice(0, headCount),
    trajectory[mid] as TrajectoryStep,
    ...trajectory.slice(-tailCount),
  ]
}
84
+
85
/**
 * Render calibration samples as a markdown review report.
 *
 * @remarks
 * The report opens with a header (generation timestamp, sample count,
 * reviewer instructions), followed by one section per sample: input,
 * optional expected value, output (cut to 500 characters), original score,
 * optional re-score, a fenced trajectory snippet, and review checkboxes.
 * Text is marked with '...' only when it was actually cut.
 *
 * @param samples - Calibration samples to render
 * @returns The full markdown document
 */
const formatCalibrationMarkdown = (samples: CalibrationSample[]): string => {
  /** Cut text to `max` characters, appending '...' only when something was removed */
  const truncate = (text: string, max: number): string => (text.length > max ? `${text.slice(0, max)}...` : text)

  /** Label for a pass/fail flag */
  const verdict = (pass: boolean): string => (pass ? 'PASS' : 'FAIL')

  const lines: string[] = [
    '# Grader Calibration Report',
    '',
    `Generated: ${new Date().toISOString()}`,
    `Samples: ${samples.length}`,
    '',
    '## Instructions',
    '',
    'Review each failure below and mark whether:',
    '- [ ] **Valid failure** - Grader correctly identified a problem',
    '- [ ] **Grader bug** - Output was actually correct, grader was wrong',
    '- [ ] **Ambiguous** - Unclear if the output is correct or not',
    '',
    '---',
    '',
  ]

  for (const [i, sample] of samples.entries()) {
    if (!sample) continue

    lines.push(`## Sample ${i + 1}: ${sample.id}`)
    lines.push('')
    lines.push(`**Input:** ${sample.input}`)
    lines.push('')

    if (sample.expected) {
      lines.push(`**Expected:** ${sample.expected}`)
      lines.push('')
    }

    lines.push(`**Output:** ${truncate(sample.output, 500)}`)
    lines.push('')

    lines.push(`**Original Score:** ${verdict(sample.originalScore.pass)} (${sample.originalScore.score})`)
    if (sample.originalScore.reasoning) {
      lines.push(`**Reasoning:** ${sample.originalScore.reasoning}`)
    }
    lines.push('')

    if (sample.rescoredResult) {
      lines.push(`**Re-scored:** ${verdict(sample.rescoredResult.pass)} (${sample.rescoredResult.score})`)
      if (sample.rescoredResult.reasoning) {
        lines.push(`**Re-score Reasoning:** ${sample.rescoredResult.reasoning}`)
      }
      lines.push('')
    }

    lines.push('**Trajectory Snippet:**')
    lines.push('```')
    for (const step of sample.trajectorySnippet) {
      if (step.type === 'tool_call') {
        lines.push(`[${step.type}] ${step.name}: ${step.status}`)
      } else if (step.type === 'message' || step.type === 'thought') {
        // Previously '...' was appended unconditionally, which made short,
        // complete messages look truncated
        lines.push(`[${step.type}] ${truncate(step.content, 100)}`)
      } else if (step.type === 'plan') {
        lines.push(`[${step.type}] ${(step.entries as Array<{ content: string }>).length} entries`)
      }
    }
    lines.push('```')
    lines.push('')

    lines.push('**Review:**')
    lines.push('- [ ] Valid failure')
    lines.push('- [ ] Grader bug')
    lines.push('- [ ] Ambiguous')
    lines.push('')
    lines.push('---')
    lines.push('')
  }

  return lines.join('\n')
}
160
+
161
+ // ============================================================================
162
+ // Calibrate Implementation
163
+ // ============================================================================
164
+
165
/**
 * Run calibration as described by a configuration object.
 *
 * @remarks
 * Loads the results, keeps the ones that were scored and did not pass,
 * samples from them, optionally re-scores each sample with `grader`, and
 * emits a markdown report to `outputPath` or, when none is given, to stdout.
 *
 * @param config - Calibrate configuration
 * @returns Calibration samples (empty when the results contain no failures)
 */
export const runCalibrate = async (config: CalibrateConfig): Promise<CalibrationSample[]> => {
  const { resultsPath, outputPath, sample = DEFAULT_CALIBRATION_SAMPLE_SIZE, grader } = config

  const results = await loadResults(resultsPath)

  // Only scored results that did not pass count as failures; unscored results are left out
  const failures = results.filter(({ score }) => score && !score.pass)
  if (failures.length === 0) {
    console.error('No failures found in results')
    return []
  }

  const sampled = sampleArray(failures, Math.min(sample, failures.length))

  const samples: CalibrationSample[] = []
  for (const { id, input, output, expected, score, trajectory } of sampled) {
    const entry: CalibrationSample = {
      id,
      input,
      output,
      expected,
      originalScore: score as GraderResult,
      trajectorySnippet: getTrajectorySnippet(trajectory),
    }

    // Second opinion from the alternative grader, when one was supplied
    if (grader) {
      entry.rescoredResult = await grader({ input, output, expected, trajectory })
    }

    samples.push(entry)
  }

  const markdown = formatCalibrationMarkdown(samples)

  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(markdown)
  } else {
    await Bun.write(resolvePath(outputPath), markdown)
  }

  return samples
}
227
+
228
+ // ============================================================================
229
+ // CLI Entry Point
230
+ // ============================================================================
231
+
232
/**
 * Calibrate command CLI handler.
 *
 * @remarks
 * Parses the arguments, prints usage for `--help`, loads the optional
 * alternative grader, and delegates to `runCalibrate`. Exits with status 1
 * when the results path is missing, `--sample` is not a positive integer, or
 * the grader cannot be loaded.
 *
 * @param args - Command line arguments (after 'calibrate')
 */
export const calibrate = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      sample: { type: 'string', short: 's', default: String(DEFAULT_CALIBRATION_SAMPLE_SIZE) },
      grader: { type: 'string', short: 'g' },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: acp-harness calibrate <results.jsonl> [options]

Arguments:
results.jsonl Input file with scored capture results

Options:
-o, --output Output file (default: stdout)
-s, --sample Number of failures to sample (default: ${DEFAULT_CALIBRATION_SAMPLE_SIZE})
-g, --grader Path to alternative grader (.ts/.js module or executable script)
-h, --help Show this help message

Output:
Markdown report with sampled failures for human review.
Includes checkboxes for labeling (valid failure / grader bug / ambiguous).

Examples:
# Sample failures for review
acp-harness calibrate results.jsonl --sample 10 -o calibration.md

# Re-score with different grader to compare
acp-harness calibrate results.jsonl --grader ./loose-grader.ts -o comparison.md
`)
    return
  }

  const resultsPath = positionals[0]
  if (!resultsPath) {
    console.error('Error: results.jsonl path is required')
    process.exit(1)
  }

  // parseInt yields NaN for non-numeric input. NaN or a non-positive count
  // would flow into Math.min/slice downstream and produce an empty or
  // wrongly sized report without any error, so reject it here.
  const sample = Number.parseInt(values.sample ?? String(DEFAULT_CALIBRATION_SAMPLE_SIZE), 10)
  if (Number.isNaN(sample) || sample < 1) {
    console.error(`Error: --sample must be a positive integer (got '${values.sample}')`)
    process.exit(1)
  }

  // Load grader if specified
  let grader: Grader | undefined
  if (values.grader) {
    try {
      grader = await loadGrader(values.grader)
    } catch (error) {
      console.error(`Error: ${error instanceof Error ? error.message : error}`)
      process.exit(1)
    }
  }

  await runCalibrate({
    resultsPath,
    outputPath: values.output,
    sample,
    grader,
  })
}