@plaited/acp-harness 0.2.6 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/LICENSE +1 -1
  2. package/README.md +175 -34
  3. package/bin/cli.ts +105 -636
  4. package/bin/tests/cli.spec.ts +218 -51
  5. package/package.json +21 -5
  6. package/src/acp-client.ts +5 -4
  7. package/src/acp-transport.ts +14 -7
  8. package/src/adapter-check.ts +542 -0
  9. package/src/adapter-scaffold.ts +934 -0
  10. package/src/balance.ts +257 -0
  11. package/src/calibrate.ts +319 -0
  12. package/src/capture.ts +457 -0
  13. package/src/constants.ts +94 -0
  14. package/src/grader-loader.ts +174 -0
  15. package/src/harness.ts +35 -0
  16. package/src/schemas-cli.ts +239 -0
  17. package/src/schemas.ts +567 -0
  18. package/src/summarize.ts +259 -0
  19. package/src/tests/adapter-check.spec.ts +70 -0
  20. package/src/tests/adapter-scaffold.spec.ts +112 -0
  21. package/src/tests/balance-helpers.spec.ts +279 -0
  22. package/src/tests/calibrate-helpers.spec.ts +226 -0
  23. package/src/tests/capture-helpers.spec.ts +553 -0
  24. package/src/tests/fixtures/grader-bad-module.ts +5 -0
  25. package/src/tests/fixtures/grader-exec-fail.py +9 -0
  26. package/src/tests/fixtures/grader-exec-invalid.py +6 -0
  27. package/src/tests/fixtures/grader-exec.py +29 -0
  28. package/src/tests/fixtures/grader-module.ts +14 -0
  29. package/src/tests/grader-loader.spec.ts +153 -0
  30. package/src/tests/summarize-helpers.spec.ts +339 -0
  31. package/src/tests/trials-calculations.spec.ts +209 -0
  32. package/src/trials.ts +407 -0
  33. package/src/validate-refs.ts +188 -0
  34. package/.claude/rules/accuracy.md +0 -43
  35. package/.claude/rules/bun-apis.md +0 -80
  36. package/.claude/rules/code-review.md +0 -254
  37. package/.claude/rules/git-workflow.md +0 -37
  38. package/.claude/rules/github.md +0 -154
  39. package/.claude/rules/testing.md +0 -172
  40. package/.claude/skills/acp-harness/SKILL.md +0 -310
  41. package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
  42. package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
  43. package/.claude/skills/acp-harness/references/downstream.md +0 -288
  44. package/.claude/skills/acp-harness/references/output-formats.md +0 -221
  45. package/.claude-plugin/marketplace.json +0 -15
  46. package/.claude-plugin/plugin.json +0 -16
  47. package/.github/CODEOWNERS +0 -6
  48. package/.github/workflows/ci.yml +0 -63
  49. package/.github/workflows/publish.yml +0 -146
  50. package/.mcp.json +0 -20
  51. package/CLAUDE.md +0 -92
  52. package/Dockerfile.test +0 -23
  53. package/biome.json +0 -96
  54. package/bun.lock +0 -513
  55. package/docker-compose.test.yml +0 -21
  56. package/scripts/bun-test-wrapper.sh +0 -46
  57. package/src/acp.constants.ts +0 -56
  58. package/src/acp.schemas.ts +0 -161
  59. package/src/acp.types.ts +0 -28
  60. package/src/tests/fixtures/.claude/settings.local.json +0 -8
  61. package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
  62. package/tsconfig.json +0 -32
package/src/balance.ts ADDED
@@ -0,0 +1,257 @@
1
+ /**
2
+ * Balance command - analyze test set coverage.
3
+ *
4
+ * @remarks
5
+ * Analyzes the distribution of test cases by metadata categories.
6
+ * Identifies underrepresented categories and suggests improvements.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import { parseArgs } from 'node:util'
12
+ import { loadPrompts } from './capture.ts'
13
+ import type { BalanceAnalysis, CategoryDistribution, PromptCase } from './schemas.ts'
14
+
15
+ // ============================================================================
16
+ // Types
17
+ // ============================================================================
18
+
19
/**
 * Options accepted by the balance command.
 *
 * @remarks
 * Only `promptsPath` is required; every other field falls back to a default
 * when omitted.
 */
export type BalanceConfig = {
  /** Location of the prompts.jsonl file to analyze */
  promptsPath: string
  /** Where to write the JSON report; when omitted the report is printed to stdout */
  outputPath?: string
  /** Metadata key whose values define the categories (default: 'category') */
  key?: string
  /** Underrepresentation threshold, expressed as a percentage */
  threshold?: number
}
30
+
31
+ // ============================================================================
32
+ // Helpers
33
+ // ============================================================================
34
+
35
/**
 * Anchor a path at the current working directory.
 *
 * Paths that already begin with '/' are returned unchanged; anything else is
 * appended to `process.cwd()`.
 */
const resolvePath = (path: string): string => (path.startsWith('/') ? path : `${process.cwd()}/${path}`)
40
+
41
/**
 * Analyze category distribution across prompts.
 *
 * @remarks
 * Each prompt is bucketed by the string form of `metadata[key]`. Prompts with
 * no metadata, a missing key, or a `null` value are grouped under
 * `(uncategorized)` rather than producing a literal `"null"` category.
 * Percentages are rounded to the nearest whole number, so they may not sum to
 * exactly 100.
 *
 * @param prompts - Array of prompt cases
 * @param key - Metadata key to analyze
 * @returns Array of category distributions sorted by count descending
 *
 * @public
 */
export const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribution[] => {
  const counts = new Map<string, number>()

  for (const prompt of prompts) {
    const value: unknown = prompt.metadata?.[key]
    // `== null` deliberately matches both undefined and null
    const category = value == null ? '(uncategorized)' : String(value)
    counts.set(category, (counts.get(category) ?? 0) + 1)
  }

  // The loop below only runs when there is at least one prompt, so total is never 0 here
  const total = prompts.length
  const distributions: CategoryDistribution[] = []

  for (const [name, count] of counts) {
    distributions.push({
      name,
      count,
      percentage: Math.round((count / total) * 100),
    })
  }

  // Sort by count descending (stable, so ties keep first-seen order)
  distributions.sort((a, b) => b.count - a.count)

  return distributions
}
75
+
76
/**
 * Pick out the categories whose share falls well below an even split.
 *
 * @remarks
 * An even split gives every category `100 / distributions.length` percent.
 * A category is reported when its percentage is strictly less than
 * `threshold` percent of that even share.
 *
 * @param distributions - Array of category distributions
 * @param threshold - Percentage threshold relative to even distribution
 * @returns Array of underrepresented category names
 *
 * @public
 */
export const findUnderrepresented = (distributions: CategoryDistribution[], threshold: number): string[] => {
  const evenShare = 100 / distributions.length
  const cutoff = evenShare * (threshold / 100)

  const names: string[] = []
  for (const entry of distributions) {
    if (entry.percentage < cutoff) {
      names.push(entry.name)
    }
  }
  return names
}
91
+
92
/**
 * Build human-readable advice for improving test set balance.
 *
 * @remarks
 * Checks, in order: underrepresented categories, a single category holding
 * more than 50% of cases, categories with fewer than 3 cases, and a total
 * below 20 cases. When none of the checks fire, a single "well-balanced"
 * message is returned.
 *
 * @param distributions - Array of category distributions
 * @param underrepresented - Array of underrepresented category names
 * @param total - Total number of test cases
 * @returns Array of suggestion strings
 *
 * @public
 */
export const generateSuggestions = (
  distributions: CategoryDistribution[],
  underrepresented: string[],
  total: number,
): string[] => {
  const notes: string[] = []

  if (underrepresented.length > 0) {
    notes.push(`Consider adding more test cases for: ${underrepresented.join(', ')}`)
  }

  // Only the first category above 50% is reported
  const majority = distributions.find((entry) => entry.percentage > 50)
  if (majority) {
    notes.push(`Category '${majority.name}' has ${majority.percentage}% of cases - consider diversifying`)
  }

  const sparseNames = distributions.filter((entry) => entry.count < 3).map((entry) => entry.name)
  if (sparseNames.length > 0) {
    notes.push(`Categories with < 3 cases may not be reliable: ${sparseNames.join(', ')}`)
  }

  if (total < 20) {
    notes.push(`Consider expanding test set (currently ${total} cases) for more statistical significance`)
  }

  return notes.length > 0 ? notes : ['Test set appears well-balanced']
}
136
+
137
+ // ============================================================================
138
+ // Balance Implementation
139
+ // ============================================================================
140
+
141
/**
 * Run the balance analysis described by a configuration object.
 *
 * @remarks
 * Loads the prompts, computes the per-category distribution for the chosen
 * metadata key (default 'category'), and derives underrepresented categories
 * (default threshold 50) and suggestions. The JSON report is written to
 * `outputPath` when one is given, otherwise printed to stdout. Progress and a
 * human-readable summary are always written to stderr.
 *
 * @param config - Balance configuration
 * @returns Balance analysis result
 */
export const runBalance = async (config: BalanceConfig): Promise<BalanceAnalysis> => {
  const { promptsPath, outputPath, key = 'category', threshold = 50 } = config

  const prompts = await loadPrompts(promptsPath)
  const totalCases = prompts.length

  console.error(`Analyzing ${totalCases} prompts by '${key}' metadata...`)

  const categories = analyzeCategories(prompts, key)
  const underrepresented = findUnderrepresented(categories, threshold)
  const analysis: BalanceAnalysis = {
    totalCases,
    categories,
    underrepresented,
    suggestions: generateSuggestions(categories, underrepresented, totalCases),
  }

  // Emit the machine-readable report
  const json = JSON.stringify(analysis, null, 2)
  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(json)
  } else {
    await Bun.write(resolvePath(outputPath), json)
  }

  // Human-readable summary on stderr: one bar block per 5 percentage points
  console.error('\nCategory Distribution:')
  categories.forEach(({ name, count, percentage }) => {
    const bar = '█'.repeat(Math.round(percentage / 5))
    console.error(`  ${name}: ${count} (${percentage}%) ${bar}`)
  })

  if (underrepresented.length > 0) {
    console.error(`\nUnderrepresented: ${underrepresented.join(', ')}`)
  }

  console.error('\nSuggestions:')
  analysis.suggestions.forEach((suggestion) => {
    console.error(`  - ${suggestion}`)
  })

  return analysis
}
196
+
197
+ // ============================================================================
198
+ // CLI Entry Point
199
+ // ============================================================================
200
+
201
/**
 * Balance command CLI handler.
 *
 * @remarks
 * Parses the command-line flags, prints usage for `--help`, and otherwise
 * delegates to `runBalance`. Exits the process with status 1 when the prompts
 * path is missing or when `--threshold` is not a non-negative number, instead
 * of silently running the analysis with `NaN`.
 *
 * @param args - Command line arguments (after 'balance')
 */
export const balance = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      key: { type: 'string', short: 'k', default: 'category' },
      threshold: { type: 'string', short: 't', default: '50' },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: acp-harness balance <prompts.jsonl> [options]

Arguments:
  prompts.jsonl     Input file with prompts

Options:
  -o, --output      Output file (default: stdout)
  -k, --key         Metadata key to analyze (default: 'category')
  -t, --threshold   Underrepresentation threshold % (default: 50)
  -h, --help        Show this help message

Output:
  JSON with category distribution, underrepresented categories, and suggestions.

Examples:
  # Analyze by default 'category' key
  acp-harness balance prompts.jsonl -o balance.json

  # Analyze by custom metadata key
  acp-harness balance prompts.jsonl --key difficulty -o balance.json
`)
    return
  }

  const promptsPath = positionals[0]
  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  // A non-numeric threshold parses to NaN, which would make every comparison
  // false and report nothing as underrepresented without any warning.
  const threshold = Number.parseInt(values.threshold ?? '50', 10)
  if (Number.isNaN(threshold) || threshold < 0) {
    console.error(`Error: --threshold must be a non-negative number (received '${values.threshold}')`)
    process.exit(1)
  }

  await runBalance({
    promptsPath,
    outputPath: values.output,
    key: values.key ?? 'category',
    threshold,
  })
}
package/src/calibrate.ts ADDED
@@ -0,0 +1,319 @@
1
+ /**
2
+ * Calibrate command - sample failures for grader review.
3
+ *
4
+ * @remarks
5
+ * Helps identify grader bugs by sampling failures for human review.
6
+ * Can optionally re-score with a different grader for comparison.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import { parseArgs } from 'node:util'
12
+ import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from './constants.ts'
13
+ import { loadGrader } from './grader-loader.ts'
14
+ import type { CalibrationSample, CaptureResult, Grader, GraderResult, TrajectoryStep } from './schemas.ts'
15
+ import { CaptureResultSchema } from './schemas.ts'
16
+
17
+ // ============================================================================
18
+ // Types
19
+ // ============================================================================
20
+
21
/**
 * Options accepted by the calibrate command.
 *
 * @remarks
 * Only `resultsPath` is required.
 */
export type CalibrateConfig = {
  /** Location of the results.jsonl file to read */
  resultsPath: string
  /** Where to write the markdown report; when omitted the report is printed to stdout */
  outputPath?: string
  /** How many failures to sample for review */
  sample?: number
  /** Alternative grader used to re-score each sampled failure */
  grader?: Grader
}
32
+
33
+ // ============================================================================
34
+ // Helpers
35
+ // ============================================================================
36
+
37
/**
 * Anchor a path at the current working directory.
 *
 * A path starting with '/' is treated as already absolute and returned as-is;
 * any other path is joined onto `process.cwd()`.
 */
const resolvePath = (path: string): string => {
  const alreadyAbsolute = path.startsWith('/')
  return alreadyAbsolute ? path : `${process.cwd()}/${path}`
}
42
+
43
/**
 * Load capture results from a JSONL file.
 *
 * @remarks
 * Blank and whitespace-only lines are skipped. Each remaining line is parsed
 * as JSON and validated against `CaptureResultSchema`. Line numbers in error
 * messages refer to the position in the file itself, so they stay accurate
 * even when blank lines precede the offending record.
 *
 * @param path - Path to the results.jsonl file
 * @returns Parsed capture results in file order
 * @throws Error naming the 1-based file line of the first record that fails to parse or validate
 */
const loadResults = async (path: string): Promise<CaptureResult[]> => {
  const content = await Bun.file(path).text()
  const results: CaptureResult[] = []

  for (const [index, rawLine] of content.split('\n').entries()) {
    const line = rawLine.trim()
    if (line === '') continue

    try {
      results.push(CaptureResultSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
    }
  }

  return results
}
58
+
59
/**
 * Random sample from array.
 *
 * @remarks
 * Draws a uniform sample without replacement using a partial Fisher–Yates
 * shuffle on a copy, so the input array is never mutated. `n` is floored and
 * clamped to `[0, arr.length]`; zero, negative, or `NaN` values yield an empty
 * array.
 *
 * @param arr - Array to sample from
 * @param n - Number of samples to take
 * @returns Array of sampled elements
 *
 * @public
 */
export const sampleArray = <T>(arr: T[], n: number): T[] => {
  const pool = [...arr]
  // NaN propagates through min/max; the loop then never runs and slice(0, NaN) is []
  const count = Math.max(0, Math.min(Math.floor(n), pool.length))

  // Only the first `count` slots need to be shuffled into place
  for (let i = 0; i < count; i++) {
    const j = i + Math.floor(Math.random() * (pool.length - i))
    const picked = pool[j] as T
    pool[j] = pool[i] as T
    pool[i] = picked
  }

  return pool.slice(0, count)
}
72
+
73
/**
 * Get snippet of trajectory for review.
 *
 * @remarks
 * A trajectory no longer than `maxSteps` is returned as-is (same array).
 * Otherwise the snippet holds `floor(maxSteps / 2)` steps from the start, the
 * same number from the end, and the middle step when `maxSteps` is odd. With
 * the default of 5 this gives the first 2 steps, the middle step, and the
 * last 2 steps. The result never exceeds `maxSteps` and never repeats a step;
 * a `maxSteps` below 1 yields an empty snippet.
 *
 * @param trajectory - Full trajectory
 * @param maxSteps - Maximum number of steps to include
 * @returns Trajectory snippet
 *
 * @public
 */
export const getTrajectorySnippet = (trajectory: TrajectoryStep[], maxSteps = 5): TrajectoryStep[] => {
  if (trajectory.length <= maxSteps) return trajectory

  const limit = Math.floor(maxSteps)
  // Written as a negated comparison so NaN is rejected as well
  if (!(limit > 0)) return []

  const edge = Math.floor(limit / 2)
  const head = trajectory.slice(0, edge)
  // Use an explicit start index: slice(-0) would return the whole array
  const tail = trajectory.slice(trajectory.length - edge)

  if (limit % 2 === 0) return [...head, ...tail]

  // trajectory.length > limit guarantees the middle index lies strictly
  // between the head and tail ranges, so no step is duplicated.
  const mid = Math.floor(trajectory.length / 2)
  return [...head, ...trajectory.slice(mid, mid + 1), ...tail]
}
103
+
104
/**
 * Format calibration samples as a markdown review report.
 *
 * @remarks
 * The report starts with a header (generation timestamp, sample count, and
 * reviewer instructions) followed by one section per sample. Output text is
 * capped at 500 characters and message/thought content at 100 characters; an
 * ellipsis is appended only when text was actually cut.
 *
 * @param samples - Calibration samples to render
 * @returns Markdown document
 */
const formatCalibrationMarkdown = (samples: CalibrationSample[]): string => {
  /** Cut `text` to `limit` characters, marking the cut with an ellipsis */
  const truncate = (text: string, limit: number): string =>
    text.length > limit ? `${text.slice(0, limit)}...` : text

  const lines: string[] = [
    '# Grader Calibration Report',
    '',
    `Generated: ${new Date().toISOString()}`,
    `Samples: ${samples.length}`,
    '',
    '## Instructions',
    '',
    'Review each failure below and mark whether:',
    '- [ ] **Valid failure** - Grader correctly identified a problem',
    '- [ ] **Grader bug** - Output was actually correct, grader was wrong',
    '- [ ] **Ambiguous** - Unclear if the output is correct or not',
    '',
    '---',
    '',
  ]

  for (const [index, sample] of samples.entries()) {
    if (!sample) continue

    lines.push(`## Sample ${index + 1}: ${sample.id}`)
    lines.push('')
    lines.push(`**Input:** ${sample.input}`)
    lines.push('')

    if (sample.expected) {
      lines.push(`**Expected:** ${sample.expected}`)
      lines.push('')
    }

    lines.push(`**Output:** ${truncate(sample.output, 500)}`)
    lines.push('')

    lines.push(`**Original Score:** ${sample.originalScore.pass ? 'PASS' : 'FAIL'} (${sample.originalScore.score})`)
    if (sample.originalScore.reasoning) {
      lines.push(`**Reasoning:** ${sample.originalScore.reasoning}`)
    }
    lines.push('')

    if (sample.rescoredResult) {
      lines.push(`**Re-scored:** ${sample.rescoredResult.pass ? 'PASS' : 'FAIL'} (${sample.rescoredResult.score})`)
      if (sample.rescoredResult.reasoning) {
        lines.push(`**Re-score Reasoning:** ${sample.rescoredResult.reasoning}`)
      }
      lines.push('')
    }

    lines.push('**Trajectory Snippet:**')
    lines.push('```')
    for (const step of sample.trajectorySnippet) {
      if (step.type === 'tool_call') {
        lines.push(`[${step.type}] ${step.name}: ${step.status}`)
      } else if (step.type === 'message' || step.type === 'thought') {
        lines.push(`[${step.type}] ${truncate(step.content, 100)}`)
      } else if (step.type === 'plan') {
        lines.push(`[${step.type}] ${(step.entries as Array<{ content: string }>).length} entries`)
      }
    }
    lines.push('```')
    lines.push('')

    lines.push('**Review:**')
    lines.push('- [ ] Valid failure')
    lines.push('- [ ] Grader bug')
    lines.push('- [ ] Ambiguous')
    lines.push('')
    lines.push('---')
    lines.push('')
  }

  return lines.join('\n')
}
179
+
180
+ // ============================================================================
181
+ // Calibrate Implementation
182
+ // ============================================================================
183
+
184
/**
 * Run calibration sampling as described by a configuration object.
 *
 * @remarks
 * Reads the results file, keeps only results that carry a score and did not
 * pass, samples up to `sample` of them, and optionally re-scores each one
 * with the supplied grader (one at a time, in sample order). The markdown
 * report is written to `outputPath` when given, otherwise printed to stdout.
 * When there are no failures, a notice is logged to stderr and no report is
 * produced.
 *
 * @param config - Calibrate configuration
 * @returns Calibration samples
 */
export const runCalibrate = async (config: CalibrateConfig): Promise<CalibrationSample[]> => {
  const { resultsPath, outputPath, sample = DEFAULT_CALIBRATION_SAMPLE_SIZE, grader } = config

  const results = await loadResults(resultsPath)

  // Scored results that did not pass; results without a score are left out
  const failures = results.filter((r) => r.score && !r.score.pass)
  if (failures.length === 0) {
    console.error('No failures found in results')
    return []
  }

  const picked = sampleArray(failures, Math.min(sample, failures.length))

  const samples: CalibrationSample[] = []
  for (const failure of picked) {
    const { id, input, output, expected, trajectory } = failure

    const entry: CalibrationSample = {
      id,
      input,
      output,
      expected,
      originalScore: failure.score as GraderResult,
      trajectorySnippet: getTrajectorySnippet(trajectory),
    }

    // Optional second opinion from an alternative grader
    if (grader) {
      entry.rescoredResult = await grader({ input, output, expected, trajectory })
    }

    samples.push(entry)
  }

  const markdown = formatCalibrationMarkdown(samples)
  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(markdown)
  } else {
    await Bun.write(resolvePath(outputPath), markdown)
  }

  return samples
}
246
+
247
+ // ============================================================================
248
+ // CLI Entry Point
249
+ // ============================================================================
250
+
251
/**
 * Calibrate command CLI handler.
 *
 * @remarks
 * Parses the command-line flags, prints usage for `--help`, loads the optional
 * alternative grader, and delegates to `runCalibrate`. Exits the process with
 * status 1 when the results path is missing, when `--sample` is not a positive
 * integer, or when the grader cannot be loaded.
 *
 * @param args - Command line arguments (after 'calibrate')
 */
export const calibrate = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      sample: { type: 'string', short: 's', default: String(DEFAULT_CALIBRATION_SAMPLE_SIZE) },
      grader: { type: 'string', short: 'g' },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: acp-harness calibrate <results.jsonl> [options]

Arguments:
  results.jsonl     Input file with scored capture results

Options:
  -o, --output      Output file (default: stdout)
  -s, --sample      Number of failures to sample (default: ${DEFAULT_CALIBRATION_SAMPLE_SIZE})
  -g, --grader      Path to alternative grader (.ts/.js module or executable script)
  -h, --help        Show this help message

Output:
  Markdown report with sampled failures for human review.
  Includes checkboxes for labeling (valid failure / grader bug / ambiguous).

Examples:
  # Sample failures for review
  acp-harness calibrate results.jsonl --sample 10 -o calibration.md

  # Re-score with different grader to compare
  acp-harness calibrate results.jsonl --grader ./loose-grader.ts -o comparison.md
`)
    return
  }

  const resultsPath = positionals[0]
  if (!resultsPath) {
    console.error('Error: results.jsonl path is required')
    process.exit(1)
  }

  // A non-numeric value parses to NaN (empty report) and a negative value
  // would be handed to the sampler as a size; reject both up front.
  const sample = Number.parseInt(values.sample ?? String(DEFAULT_CALIBRATION_SAMPLE_SIZE), 10)
  if (Number.isNaN(sample) || sample < 1) {
    console.error(`Error: --sample must be a positive integer (received '${values.sample}')`)
    process.exit(1)
  }

  // Load grader if specified
  let grader: Grader | undefined
  if (values.grader) {
    try {
      grader = await loadGrader(values.grader)
    } catch (error) {
      console.error(`Error: ${error instanceof Error ? error.message : error}`)
      process.exit(1)
    }
  }

  await runCalibrate({
    resultsPath,
    outputPath: values.output,
    sample,
    grader,
  })
}