@plaited/acp-harness 0.2.6 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +120 -16
- package/bin/cli.ts +105 -636
- package/bin/tests/cli.spec.ts +218 -51
- package/package.json +20 -4
- package/src/acp-client.ts +5 -4
- package/src/acp-transport.ts +14 -7
- package/src/adapter-check.ts +542 -0
- package/src/adapter-scaffold.ts +934 -0
- package/src/balance.ts +232 -0
- package/src/calibrate.ts +300 -0
- package/src/capture.ts +457 -0
- package/src/constants.ts +94 -0
- package/src/grader-loader.ts +174 -0
- package/src/harness.ts +35 -0
- package/src/schemas-cli.ts +239 -0
- package/src/schemas.ts +567 -0
- package/src/summarize.ts +245 -0
- package/src/tests/adapter-check.spec.ts +70 -0
- package/src/tests/adapter-scaffold.spec.ts +112 -0
- package/src/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/tests/fixtures/grader-exec.py +29 -0
- package/src/tests/fixtures/grader-module.ts +14 -0
- package/src/tests/grader-loader.spec.ts +153 -0
- package/src/trials.ts +395 -0
- package/src/validate-refs.ts +188 -0
- package/.claude/rules/accuracy.md +0 -43
- package/.claude/rules/bun-apis.md +0 -80
- package/.claude/rules/code-review.md +0 -254
- package/.claude/rules/git-workflow.md +0 -37
- package/.claude/rules/github.md +0 -154
- package/.claude/rules/testing.md +0 -172
- package/.claude/skills/acp-harness/SKILL.md +0 -310
- package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
- package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
- package/.claude/skills/acp-harness/references/downstream.md +0 -288
- package/.claude/skills/acp-harness/references/output-formats.md +0 -221
- package/.claude-plugin/marketplace.json +0 -15
- package/.claude-plugin/plugin.json +0 -16
- package/.github/CODEOWNERS +0 -6
- package/.github/workflows/ci.yml +0 -63
- package/.github/workflows/publish.yml +0 -146
- package/.mcp.json +0 -20
- package/CLAUDE.md +0 -92
- package/Dockerfile.test +0 -23
- package/biome.json +0 -96
- package/bun.lock +0 -513
- package/docker-compose.test.yml +0 -21
- package/scripts/bun-test-wrapper.sh +0 -46
- package/src/acp.constants.ts +0 -56
- package/src/acp.schemas.ts +0 -161
- package/src/acp.types.ts +0 -28
- package/src/tests/fixtures/.claude/settings.local.json +0 -8
- package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
- package/tsconfig.json +0 -32
package/src/balance.ts
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Balance command - analyze test set coverage.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Analyzes the distribution of test cases by metadata categories.
|
|
6
|
+
* Identifies underrepresented categories and suggests improvements.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { parseArgs } from 'node:util'
|
|
12
|
+
import { loadPrompts } from './capture.ts'
|
|
13
|
+
import type { BalanceAnalysis, CategoryDistribution, PromptCase } from './schemas.ts'
|
|
14
|
+
|
|
15
|
+
// ============================================================================
|
|
16
|
+
// Types
|
|
17
|
+
// ============================================================================
|
|
18
|
+
|
|
19
|
+
/**
 * Options accepted by the balance analysis.
 */
export type BalanceConfig = {
  /** Location of the prompts.jsonl file to analyze */
  promptsPath: string
  /** Destination for the JSON analysis; when omitted the JSON is printed to stdout */
  outputPath?: string
  /** Metadata key whose values name the categories (defaults to 'category') */
  key?: string
  /**
   * Underrepresentation cutoff, as a percentage of the even share
   * (100 / number of categories). A category is flagged when its own
   * percentage falls below that fraction of the even share. Defaults to 50.
   */
  threshold?: number
}
|
|
30
|
+
|
|
31
|
+
// ============================================================================
|
|
32
|
+
// Helpers
|
|
33
|
+
// ============================================================================
|
|
34
|
+
|
|
35
|
+
/** Anchor a relative path at process.cwd(); paths starting with '/' pass through untouched */
const resolvePath = (path: string): string => (path.startsWith('/') ? path : `${process.cwd()}/${path}`)
|
|
40
|
+
|
|
41
|
+
/**
 * Count prompts per category and express each count as a share of the total.
 *
 * @param prompts - Prompt cases to tally
 * @param key - Metadata key whose value names the category
 * @returns One entry per distinct category, sorted by count descending;
 *   percentages are rounded to whole numbers
 */
const analyzeCategories = (prompts: PromptCase[], key: string): CategoryDistribution[] => {
  const counts = new Map<string, number>()

  for (const prompt of prompts) {
    const value: unknown = prompt.metadata?.[key]

    let category: string
    if (value === undefined || value === null) {
      // A JSON `null` carries no category, so group it with prompts missing the key
      category = '(uncategorized)'
    } else if (typeof value === 'object' && !Array.isArray(value)) {
      // String() would collapse every plain object into "[object Object]"
      category = JSON.stringify(value)
    } else {
      category = String(value)
    }

    counts.set(category, (counts.get(category) ?? 0) + 1)
  }

  // `total` is only used as a divisor when at least one prompt was counted
  const total = prompts.length

  return [...counts]
    .map(([name, count]) => ({ name, count, percentage: Math.round((count / total) * 100) }))
    .sort((a, b) => b.count - a.count)
}
|
|
67
|
+
|
|
68
|
+
/**
 * List the categories whose share falls below the cutoff.
 *
 * The cutoff is `threshold` percent of the share each category would hold if
 * the cases were spread evenly.
 */
const findUnderrepresented = (distributions: CategoryDistribution[], threshold: number): string[] => {
  const evenShare = 100 / distributions.length
  const names: string[] = []

  for (const { name, percentage } of distributions) {
    if (percentage < evenShare * (threshold / 100)) {
      names.push(name)
    }
  }

  return names
}
|
|
75
|
+
|
|
76
|
+
/**
 * Turn the distribution into human-readable advice.
 *
 * Checks, in order: underrepresented categories, a category holding more than
 * half of the cases, categories with fewer than 3 cases, and a total below 20.
 * When none applies, a single "well-balanced" note is returned.
 */
const generateSuggestions = (
  distributions: CategoryDistribution[],
  underrepresented: string[],
  total: number,
): string[] => {
  const notes: string[] = []

  if (underrepresented.length !== 0) {
    notes.push(`Consider adding more test cases for: ${underrepresented.join(', ')}`)
  }

  // First category (in the given order) that holds a majority of the cases
  const majority = distributions.find(({ percentage }) => percentage > 50)
  if (majority !== undefined) {
    notes.push(`Category '${majority.name}' has ${majority.percentage}% of cases - consider diversifying`)
  }

  // Categories too small to draw conclusions from
  const sparseNames = distributions.filter(({ count }) => count < 3).map(({ name }) => name)
  if (sparseNames.length !== 0) {
    notes.push(`Categories with < 3 cases may not be reliable: ${sparseNames.join(', ')}`)
  }

  if (total < 20) {
    notes.push(`Consider expanding test set (currently ${total} cases) for more statistical significance`)
  }

  return notes.length === 0 ? ['Test set appears well-balanced'] : notes
}
|
|
111
|
+
|
|
112
|
+
// ============================================================================
|
|
113
|
+
// Balance Implementation
|
|
114
|
+
// ============================================================================
|
|
115
|
+
|
|
116
|
+
/**
 * Run the balance analysis described by `config`.
 *
 * Loads the prompts, tallies them by the chosen metadata key, emits the JSON
 * analysis (to `outputPath`, or stdout when none is given) and prints a
 * human-readable summary to stderr.
 *
 * @param config - Balance configuration
 * @returns The computed balance analysis
 */
export const runBalance = async (config: BalanceConfig): Promise<BalanceAnalysis> => {
  const { promptsPath, outputPath, key: metadataKey = 'category', threshold: cutoff = 50 } = config

  const cases = await loadPrompts(promptsPath)
  const caseCount = cases.length

  // Progress and summary go to stderr so stdout stays reserved for the JSON payload
  console.error(`Analyzing ${caseCount} prompts by '${metadataKey}' metadata...`)

  const categories = analyzeCategories(cases, metadataKey)
  const underrepresented = findUnderrepresented(categories, cutoff)
  const suggestions = generateSuggestions(categories, underrepresented, caseCount)

  const analysis: BalanceAnalysis = { totalCases: caseCount, categories, underrepresented, suggestions }
  const serialized = JSON.stringify(analysis, null, 2)

  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(serialized)
  } else {
    await Bun.write(resolvePath(outputPath), serialized)
  }

  console.error('\nCategory Distribution:')
  categories.forEach(({ name, count, percentage }) => {
    // One block character per 5 percentage points
    const bar = '█'.repeat(Math.round(percentage / 5))
    console.error(` ${name}: ${count} (${percentage}%) ${bar}`)
  })

  if (underrepresented.length !== 0) {
    console.error(`\nUnderrepresented: ${underrepresented.join(', ')}`)
  }

  console.error('\nSuggestions:')
  suggestions.forEach((note) => {
    console.error(` - ${note}`)
  })

  return analysis
}
|
|
171
|
+
|
|
172
|
+
// ============================================================================
|
|
173
|
+
// CLI Entry Point
|
|
174
|
+
// ============================================================================
|
|
175
|
+
|
|
176
|
+
/**
 * Balance command CLI handler.
 *
 * Parses the command line, prints help when requested, validates the inputs
 * and delegates to `runBalance`. Exits the process with code 1 when the
 * prompts path is missing or `--threshold` is not a non-negative number.
 *
 * @param args - Command line arguments (after 'balance')
 */
export const balance = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      key: { type: 'string', short: 'k', default: 'category' },
      threshold: { type: 'string', short: 't', default: '50' },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: acp-harness balance <prompts.jsonl> [options]

Arguments:
prompts.jsonl Input file with prompts

Options:
-o, --output Output file (default: stdout)
-k, --key Metadata key to analyze (default: 'category')
-t, --threshold Underrepresentation threshold % (default: 50)
-h, --help Show this help message

Output:
JSON with category distribution, underrepresented categories, and suggestions.

Examples:
# Analyze by default 'category' key
acp-harness balance prompts.jsonl -o balance.json

# Analyze by custom metadata key
acp-harness balance prompts.jsonl --key difficulty -o balance.json
`)
    return
  }

  const promptsPath = positionals[0]
  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  // A NaN threshold makes every `<` comparison false, which would silently
  // report "nothing underrepresented"; fail loudly instead.
  const threshold = Number.parseInt(values.threshold ?? '50', 10)
  if (Number.isNaN(threshold) || threshold < 0) {
    console.error(`Error: --threshold must be a non-negative number (received '${values.threshold}')`)
    process.exit(1)
  }

  await runBalance({
    promptsPath,
    outputPath: values.output,
    key: values.key ?? 'category',
    threshold,
  })
}
|
package/src/calibrate.ts
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Calibrate command - sample failures for grader review.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Helps identify grader bugs by sampling failures for human review.
|
|
6
|
+
* Can optionally re-score with a different grader for comparison.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { parseArgs } from 'node:util'
|
|
12
|
+
import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from './constants.ts'
|
|
13
|
+
import { loadGrader } from './grader-loader.ts'
|
|
14
|
+
import type { CalibrationSample, CaptureResult, Grader, GraderResult, TrajectoryStep } from './schemas.ts'
|
|
15
|
+
import { CaptureResultSchema } from './schemas.ts'
|
|
16
|
+
|
|
17
|
+
// ============================================================================
|
|
18
|
+
// Types
|
|
19
|
+
// ============================================================================
|
|
20
|
+
|
|
21
|
+
/**
 * Options accepted by the calibrate command.
 */
export type CalibrateConfig = {
  /** Location of the results.jsonl file holding scored capture results */
  resultsPath: string
  /** Destination for the markdown report; when omitted the report is printed to stdout */
  outputPath?: string
  /** Maximum number of failures to sample (defaults to DEFAULT_CALIBRATION_SAMPLE_SIZE) */
  sample?: number
  /** When provided, each sampled failure is additionally scored with this grader */
  grader?: Grader
}
|
|
32
|
+
|
|
33
|
+
// ============================================================================
|
|
34
|
+
// Helpers
|
|
35
|
+
// ============================================================================
|
|
36
|
+
|
|
37
|
+
/** Return `path` unchanged when it starts with '/', otherwise prefix it with process.cwd() */
const resolvePath = (path: string): string => {
  const alreadyRooted = path.startsWith('/')
  return alreadyRooted ? path : `${process.cwd()}/${path}`
}
|
|
42
|
+
|
|
43
|
+
/**
 * Load and validate capture results from a JSONL file.
 *
 * Blank and whitespace-only lines are skipped. Each remaining line is parsed
 * as JSON and validated with `CaptureResultSchema`.
 *
 * @param path - Path of the JSONL file to read
 * @returns The validated results, in file order
 * @throws Error naming the 1-based line number in the file of the first
 *   record that fails to parse or validate
 */
const loadResults = async (path: string): Promise<CaptureResult[]> => {
  const content = await Bun.file(path).text()
  const results: CaptureResult[] = []

  // Iterate over the raw lines (no trimming or filtering first) so the index
  // used in the error message matches the line number in the file.
  const lines = content.split('\n')
  for (const [index, line] of lines.entries()) {
    if (line.trim() === '') continue

    try {
      results.push(CaptureResultSchema.parse(JSON.parse(line)))
    } catch (error) {
      throw new Error(`Invalid result at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
    }
  }

  return results
}
|
|
58
|
+
|
|
59
|
+
/**
 * Draw a uniform random sample without replacement.
 *
 * Uses a partial Fisher–Yates shuffle on a copy, so the input is never
 * mutated and every element is equally likely to be picked.
 *
 * @param arr - Source elements
 * @param n - Number of elements wanted; fractional values are truncated and
 *   negative or NaN values are treated as 0
 * @returns Up to `n` distinct elements of `arr` in random order
 */
const sampleArray = <T>(arr: T[], n: number): T[] => {
  const pool = [...arr]
  const take = Math.max(0, Math.min(Math.trunc(n) || 0, pool.length))

  // Only the first `take` positions need to be shuffled into place
  for (let i = 0; i < take; i++) {
    const j = i + Math.floor(Math.random() * (pool.length - i))
    const held = pool[i] as T
    pool[i] = pool[j] as T
    pool[j] = held
  }

  return pool.slice(0, take)
}
|
|
64
|
+
|
|
65
|
+
/**
 * Pick a short, representative slice of a trajectory for review.
 *
 * Trajectories with at most `maxSteps` steps are returned as-is. Longer ones
 * are reduced to `maxSteps` steps: an equal number from the start and the end,
 * plus the middle step when `maxSteps` is odd. With the default of 5 this is
 * the first two steps, the middle step and the last two steps.
 *
 * @param trajectory - Full list of steps
 * @param maxSteps - Maximum number of steps to return (default 5)
 * @returns The selected steps in their original order, without duplicates
 */
const getTrajectorySnippet = (trajectory: TrajectoryStep[], maxSteps = 5): TrajectoryStep[] => {
  if (trajectory.length <= maxSteps) return trajectory

  const budget = Number.isFinite(maxSteps) ? Math.max(0, Math.floor(maxSteps)) : 0
  const edge = Math.floor(budget / 2)

  const snippet = trajectory.slice(0, edge)

  if (budget % 2 === 1) {
    // The trajectory is longer than the budget, so the middle index can
    // never fall inside the head or tail ranges
    const middle = trajectory[Math.floor(trajectory.length / 2)]
    if (middle !== undefined) snippet.push(middle)
  }

  // Computed from the length because slice(-0) would return the whole array
  snippet.push(...trajectory.slice(trajectory.length - edge))

  return snippet
}
|
|
84
|
+
|
|
85
|
+
/**
 * Render calibration samples as a markdown review report.
 *
 * The report starts with a header and reviewer instructions, followed by one
 * section per sample containing its input, expected value (if any), output
 * (cut to 500 characters), original and re-scored results, a trajectory
 * snippet and review checkboxes.
 *
 * @param samples - Samples to include, in report order
 * @returns The complete markdown document
 */
const formatCalibrationMarkdown = (samples: CalibrationSample[]): string => {
  // Add an ellipsis only when text was actually cut, so short content is not
  // presented as if it had been truncated
  const clip = (text: string, limit: number): string => (text.length > limit ? `${text.slice(0, limit)}...` : text)

  const lines: string[] = [
    '# Grader Calibration Report',
    '',
    `Generated: ${new Date().toISOString()}`,
    `Samples: ${samples.length}`,
    '',
    '## Instructions',
    '',
    'Review each failure below and mark whether:',
    '- [ ] **Valid failure** - Grader correctly identified a problem',
    '- [ ] **Grader bug** - Output was actually correct, grader was wrong',
    '- [ ] **Ambiguous** - Unclear if the output is correct or not',
    '',
    '---',
    '',
  ]

  samples.forEach((sample, index) => {
    lines.push(`## Sample ${index + 1}: ${sample.id}`, '')
    lines.push(`**Input:** ${sample.input}`, '')

    if (sample.expected) {
      lines.push(`**Expected:** ${sample.expected}`, '')
    }

    lines.push(`**Output:** ${clip(sample.output, 500)}`, '')

    const original = sample.originalScore
    lines.push(`**Original Score:** ${original.pass ? 'PASS' : 'FAIL'} (${original.score})`)
    if (original.reasoning) {
      lines.push(`**Reasoning:** ${original.reasoning}`)
    }
    lines.push('')

    const rescored = sample.rescoredResult
    if (rescored) {
      lines.push(`**Re-scored:** ${rescored.pass ? 'PASS' : 'FAIL'} (${rescored.score})`)
      if (rescored.reasoning) {
        lines.push(`**Re-score Reasoning:** ${rescored.reasoning}`)
      }
      lines.push('')
    }

    lines.push('**Trajectory Snippet:**', '```')
    for (const step of sample.trajectorySnippet) {
      // Step types other than the ones handled here are left out of the snippet
      if (step.type === 'tool_call') {
        lines.push(`[${step.type}] ${step.name}: ${step.status}`)
      } else if (step.type === 'message' || step.type === 'thought') {
        lines.push(`[${step.type}] ${clip(step.content, 100)}`)
      } else if (step.type === 'plan') {
        lines.push(`[${step.type}] ${(step.entries as Array<{ content: string }>).length} entries`)
      }
    }
    lines.push('```', '')

    lines.push('**Review:**', '- [ ] Valid failure', '- [ ] Grader bug', '- [ ] Ambiguous', '', '---', '')
  })

  return lines.join('\n')
}
|
|
160
|
+
|
|
161
|
+
// ============================================================================
|
|
162
|
+
// Calibrate Implementation
|
|
163
|
+
// ============================================================================
|
|
164
|
+
|
|
165
|
+
/**
 * Build a calibration report from scored capture results.
 *
 * Loads the results, keeps the ones whose score did not pass, samples up to
 * `sample` of them, optionally re-scores each with `grader`, and emits a
 * markdown report (to `outputPath`, or stdout when none is given).
 *
 * @param config - Calibrate configuration
 * @returns The calibration samples included in the report; an empty array
 *   (and no report) when there are no failures
 */
export const runCalibrate = async (config: CalibrateConfig): Promise<CalibrationSample[]> => {
  const { resultsPath, outputPath, sample: sampleSize = DEFAULT_CALIBRATION_SAMPLE_SIZE, grader } = config

  const allResults = await loadResults(resultsPath)

  // Keep only results that carry a score and did not pass; results without a
  // score are left out
  const failed = allResults.filter((entry) => entry.score && !entry.score.pass)
  if (failed.length === 0) {
    console.error('No failures found in results')
    return []
  }

  const picked = sampleArray(failed, Math.min(sampleSize, failed.length))
  const samples: CalibrationSample[] = []

  // Graders are awaited one at a time, in sample order
  for (const entry of picked) {
    const { id, input, output, expected, trajectory } = entry

    const item: CalibrationSample = {
      id,
      input,
      output,
      expected,
      originalScore: entry.score as GraderResult,
      trajectorySnippet: getTrajectorySnippet(trajectory),
    }

    if (grader) {
      item.rescoredResult = await grader({ input, output, expected, trajectory })
    }

    samples.push(item)
  }

  const report = formatCalibrationMarkdown(samples)

  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(report)
  } else {
    await Bun.write(resolvePath(outputPath), report)
  }

  return samples
}
|
|
227
|
+
|
|
228
|
+
// ============================================================================
|
|
229
|
+
// CLI Entry Point
|
|
230
|
+
// ============================================================================
|
|
231
|
+
|
|
232
|
+
/**
 * Calibrate command CLI handler.
 *
 * Parses the command line, prints help when requested, validates the inputs,
 * loads the optional alternative grader and delegates to `runCalibrate`.
 * Exits the process with code 1 when the results path is missing, `--sample`
 * is not a positive integer, or the grader cannot be loaded.
 *
 * @param args - Command line arguments (after 'calibrate')
 */
export const calibrate = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      sample: { type: 'string', short: 's', default: String(DEFAULT_CALIBRATION_SAMPLE_SIZE) },
      grader: { type: 'string', short: 'g' },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: acp-harness calibrate <results.jsonl> [options]

Arguments:
results.jsonl Input file with scored capture results

Options:
-o, --output Output file (default: stdout)
-s, --sample Number of failures to sample (default: ${DEFAULT_CALIBRATION_SAMPLE_SIZE})
-g, --grader Path to alternative grader (.ts/.js module or executable script)
-h, --help Show this help message

Output:
Markdown report with sampled failures for human review.
Includes checkboxes for labeling (valid failure / grader bug / ambiguous).

Examples:
# Sample failures for review
acp-harness calibrate results.jsonl --sample 10 -o calibration.md

# Re-score with different grader to compare
acp-harness calibrate results.jsonl --grader ./loose-grader.ts -o comparison.md
`)
    return
  }

  const resultsPath = positionals[0]
  if (!resultsPath) {
    console.error('Error: results.jsonl path is required')
    process.exit(1)
  }

  // A NaN sample size would silently yield an empty report; fail loudly instead.
  const sample = Number.parseInt(values.sample ?? String(DEFAULT_CALIBRATION_SAMPLE_SIZE), 10)
  if (Number.isNaN(sample) || sample < 1) {
    console.error(`Error: --sample must be a positive integer (received '${values.sample}')`)
    process.exit(1)
  }

  // Load grader if specified
  let grader: Grader | undefined
  if (values.grader) {
    try {
      grader = await loadGrader(values.grader)
    } catch (error) {
      console.error(`Error: ${error instanceof Error ? error.message : error}`)
      process.exit(1)
    }
  }

  await runCalibrate({
    resultsPath,
    outputPath: values.output,
    sample,
    grader,
  })
}
|