@plaited/acp-harness 0.2.6 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +120 -16
- package/bin/cli.ts +105 -636
- package/bin/tests/cli.spec.ts +218 -51
- package/package.json +20 -4
- package/src/acp-client.ts +5 -4
- package/src/acp-transport.ts +14 -7
- package/src/adapter-check.ts +542 -0
- package/src/adapter-scaffold.ts +934 -0
- package/src/balance.ts +232 -0
- package/src/calibrate.ts +300 -0
- package/src/capture.ts +457 -0
- package/src/constants.ts +94 -0
- package/src/grader-loader.ts +174 -0
- package/src/harness.ts +35 -0
- package/src/schemas-cli.ts +239 -0
- package/src/schemas.ts +567 -0
- package/src/summarize.ts +245 -0
- package/src/tests/adapter-check.spec.ts +70 -0
- package/src/tests/adapter-scaffold.spec.ts +112 -0
- package/src/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/tests/fixtures/grader-exec.py +29 -0
- package/src/tests/fixtures/grader-module.ts +14 -0
- package/src/tests/grader-loader.spec.ts +153 -0
- package/src/trials.ts +395 -0
- package/src/validate-refs.ts +188 -0
- package/.claude/rules/accuracy.md +0 -43
- package/.claude/rules/bun-apis.md +0 -80
- package/.claude/rules/code-review.md +0 -254
- package/.claude/rules/git-workflow.md +0 -37
- package/.claude/rules/github.md +0 -154
- package/.claude/rules/testing.md +0 -172
- package/.claude/skills/acp-harness/SKILL.md +0 -310
- package/.claude/skills/acp-harness/assets/Dockerfile.acp +0 -25
- package/.claude/skills/acp-harness/assets/docker-compose.acp.yml +0 -19
- package/.claude/skills/acp-harness/references/downstream.md +0 -288
- package/.claude/skills/acp-harness/references/output-formats.md +0 -221
- package/.claude-plugin/marketplace.json +0 -15
- package/.claude-plugin/plugin.json +0 -16
- package/.github/CODEOWNERS +0 -6
- package/.github/workflows/ci.yml +0 -63
- package/.github/workflows/publish.yml +0 -146
- package/.mcp.json +0 -20
- package/CLAUDE.md +0 -92
- package/Dockerfile.test +0 -23
- package/biome.json +0 -96
- package/bun.lock +0 -513
- package/docker-compose.test.yml +0 -21
- package/scripts/bun-test-wrapper.sh +0 -46
- package/src/acp.constants.ts +0 -56
- package/src/acp.schemas.ts +0 -161
- package/src/acp.types.ts +0 -28
- package/src/tests/fixtures/.claude/settings.local.json +0 -8
- package/src/tests/fixtures/.claude/skills/greeting/SKILL.md +0 -17
- package/tsconfig.json +0 -32
package/src/trials.ts
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-run trials command for pass@k/pass^k analysis.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Runs each prompt k times to measure non-determinism.
|
|
6
|
+
* Without a grader, captures raw trials. With a grader, computes:
|
|
7
|
+
* - passRate: Simple pass rate (passes / k)
|
|
8
|
+
* - passAtK: Probability of at least one pass in k samples
|
|
9
|
+
* - passExpK: Probability of all k samples passing
|
|
10
|
+
*
|
|
11
|
+
* @packageDocumentation
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { appendFile } from 'node:fs/promises'
|
|
15
|
+
import { parseArgs } from 'node:util'
|
|
16
|
+
import { createACPClient } from './acp-client.ts'
|
|
17
|
+
import { createPrompt } from './acp-helpers.ts'
|
|
18
|
+
import { extractOutput, extractTrajectory, loadPrompts } from './capture.ts'
|
|
19
|
+
import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from './constants.ts'
|
|
20
|
+
import { loadGrader } from './grader-loader.ts'
|
|
21
|
+
import type { Grader, TrialEntry, TrialResult } from './schemas.ts'
|
|
22
|
+
import { McpServerSchema } from './schemas.ts'
|
|
23
|
+
|
|
24
|
+
// ============================================================================
|
|
25
|
+
// Pass@k/Pass^k Calculation
|
|
26
|
+
// ============================================================================
|
|
27
|
+
|
|
28
|
+
/**
 * Estimate pass@k: probability of at least one pass in k samples.
 *
 * @remarks
 * Computes the plug-in estimate `1 - (1 - passRate)^k` with
 * `passRate = passes / k`, i.e. the observed pass rate is treated as the
 * success probability of each of k independent samples.
 *
 * This is not the unbiased estimator `1 - C(n-c, k) / C(n, k)`
 * (n = total samples, c = correct samples). With `n = k` that estimator is
 * simply 1 when at least one sample passed and 0 otherwise.
 *
 * @param passes - Number of trials that passed
 * @param k - Number of trials that were run
 * @returns Estimate in the range [0, 1]; 0 when no trials were run
 */
const calculatePassAtK = (passes: number, k: number): number => {
  // Without any trials there is nothing that could have passed.
  // Written as a negated comparison so that NaN is rejected as well.
  if (!(k > 0)) return 0
  if (passes >= k) return 1
  if (passes <= 0) return 0

  const passRate = passes / k
  return 1 - (1 - passRate) ** k
}
|
|
46
|
+
|
|
47
|
+
/**
 * Estimate pass^k: probability that all k samples pass.
 *
 * @remarks
 * The observed pass rate (`passes / k`) raised to the power of k.
 *
 * @param passes - Number of trials that passed
 * @param k - Number of trials that were run
 * @returns 1 when every trial passed, 0 when none did, otherwise `(passes / k) ** k`
 */
const calculatePassExpK = (passes: number, k: number): number => {
  switch (passes) {
    case k:
      return 1
    case 0:
      return 0
    default:
      return (passes / k) ** k
  }
}
|
|
60
|
+
|
|
61
|
+
// ============================================================================
|
|
62
|
+
// Types
|
|
63
|
+
// ============================================================================
|
|
64
|
+
|
|
65
|
+
/**
 * Configuration for trials command.
 *
 * @remarks
 * Consumed by `runTrials`. Only `promptsPath`, `agentCommand` and `k` are
 * required; every other field has a default or is simply skipped when absent.
 */
export type TrialsConfig = {
  /** Path to prompts.jsonl file (passed to `loadPrompts`) */
  promptsPath: string
  /** ACP agent command: executable followed by its arguments */
  agentCommand: string[]
  /** Number of trials per prompt; also the denominator of the pass metrics */
  k: number
  /** Output file path, resolved against `process.cwd()` when relative; results go to stdout when omitted */
  outputPath?: string
  /** Working directory for agent; sessions fall back to `process.cwd()` when omitted */
  cwd?: string
  /** Timeout per prompt in milliseconds (defaults to `DEFAULT_HARNESS_TIMEOUT`) */
  timeout?: number
  /** Show progress to stderr (defaults to false) */
  progress?: boolean
  /** Append to output file; when false the file is emptied before the run starts */
  append?: boolean
  /** MCP server configurations; each entry is parsed with `McpServerSchema` */
  mcpServers?: unknown[]
  /** Optional grader function; when set, every trial is graded and pass metrics are computed */
  grader?: Grader
}
|
|
88
|
+
|
|
89
|
+
// ============================================================================
|
|
90
|
+
// Helpers
|
|
91
|
+
// ============================================================================
|
|
92
|
+
|
|
93
|
+
/**
 * Turn a path into an absolute one based on process.cwd().
 *
 * Paths that already begin with `/` are returned untouched; anything else is
 * appended to the current working directory with a `/` separator.
 */
const resolvePath = (path: string): string => (path.startsWith('/') ? path : `${process.cwd()}/${path}`)
|
|
98
|
+
|
|
99
|
+
/**
 * Emit a single output line.
 *
 * @param line - Text to write; a newline is added when writing to a file
 * @param outputPath - Destination file; stdout is used when omitted
 * @param append - Add to the end of the file instead of replacing its content
 */
const writeOutput = async (line: string, outputPath?: string, append?: boolean): Promise<void> => {
  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(line)
    return
  }

  const record = `${line}\n`
  if (append) {
    await appendFile(outputPath, record)
    return
  }
  await Bun.write(outputPath, record)
}
|
|
112
|
+
|
|
113
|
+
/**
 * Report progress on stderr.
 *
 * @param message - Text to print
 * @param showProgress - Nothing is printed unless this is true
 */
const logProgress = (message: string, showProgress: boolean): void => {
  if (!showProgress) return
  console.error(message)
}
|
|
119
|
+
|
|
120
|
+
// ============================================================================
|
|
121
|
+
// Trials Implementation
|
|
122
|
+
// ============================================================================
|
|
123
|
+
|
|
124
|
+
/**
 * Execute trials with configuration object.
 *
 * @remarks
 * Connects to the agent once and runs every prompt `k` times, each trial in a
 * fresh session. One JSON line per prompt is written as soon as that prompt's
 * trials are complete (to `outputPath`, or to stdout when it is not set).
 *
 * An error thrown while prompting or grading is recorded as a failed trial
 * (`pass: false`, `reasoning: "Error: ..."`) and the run continues. An error
 * thrown while creating a session is not caught: the client is disconnected
 * and the returned promise rejects.
 *
 * `k` is not validated here; callers are expected to pass a positive integer.
 *
 * @param config - Trials configuration
 * @returns Array of trial results, one per prompt
 */
export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
  const {
    promptsPath,
    agentCommand,
    k,
    outputPath,
    cwd,
    timeout = DEFAULT_HARNESS_TIMEOUT,
    progress = false,
    append = false,
    mcpServers = [],
    grader,
  } = config

  // Parse MCP server configurations
  const parsedMcpServers = mcpServers.map((s) => McpServerSchema.parse(s))

  // Load prompts
  const prompts = await loadPrompts(promptsPath)

  // Resolve output path
  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined

  // Log progress info
  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
  logProgress(`Running ${k} trials per prompt`, progress)
  logProgress(`Command: ${agentCommand.join(' ')}`, progress)
  if (grader) {
    logProgress('Grader: enabled (will compute pass@k metrics)', progress)
  }

  // Create ACP client
  const client = createACPClient({
    command: agentCommand,
    cwd,
    timeout,
  })

  // Clear output file if not appending
  if (resolvedOutputPath && !append) {
    await Bun.write(resolvedOutputPath, '')
  }

  // Session params
  const sessionParams = {
    cwd: cwd ?? process.cwd(),
    mcpServers: parsedMcpServers,
  }

  const results: TrialResult[] = []

  try {
    logProgress('Connecting to agent...', progress)
    await client.connect()
    logProgress('Connected!', progress)

    // Run evaluations
    for (let i = 0; i < prompts.length; i++) {
      const promptCase = prompts[i]
      if (!promptCase) continue

      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)

      const trialEntries: TrialEntry[] = []

      for (let trialNum = 1; trialNum <= k; trialNum++) {
        // Create fresh session for each trial
        const session = await client.createSession(sessionParams)
        const startTime = Date.now()

        try {
          const prompt = createPrompt(promptCase.input)
          const { updates } = await client.promptSync(session.id, prompt)

          const endTime = Date.now()
          const trajectory = extractTrajectory(updates, startTime)
          const output = extractOutput(trajectory)

          const entry: TrialEntry = {
            trialNum,
            output,
            trajectory,
            duration: endTime - startTime,
          }

          // Apply grader if provided
          if (grader) {
            const graderResult = await grader({
              input: promptCase.input,
              output,
              expected: promptCase.expected,
              trajectory,
            })
            entry.pass = graderResult.pass
            entry.score = graderResult.score
            entry.reasoning = graderResult.reasoning
          }

          trialEntries.push(entry)
          logProgress(
            ` Trial ${trialNum}/${k}: ${entry.pass !== undefined ? (entry.pass ? '✓' : '✗') : '?'}`,
            progress,
          )
        } catch (error) {
          // Record the failure as a failed trial so the remaining trials still run
          const endTime = Date.now()
          const message = error instanceof Error ? error.message : String(error)

          trialEntries.push({
            trialNum,
            output: '',
            trajectory: [],
            duration: endTime - startTime,
            pass: false,
            reasoning: `Error: ${message}`,
          })
          logProgress(` Trial ${trialNum}/${k}: ! (error)`, progress)
        }
      }

      // Build result
      const result: TrialResult = {
        id: promptCase.id,
        input: promptCase.input,
        ...(promptCase.expected && { expected: promptCase.expected }),
        k,
        trials: trialEntries,
      }

      // Calculate metrics if grader was used
      if (grader) {
        const passes = trialEntries.filter((t) => t.pass).length
        result.passRate = passes / k
        result.passAtK = calculatePassAtK(passes, k)
        result.passExpK = calculatePassExpK(passes, k)
      }

      results.push(result)

      // Write result immediately. The output file has already been emptied
      // above unless `append` was requested, so every result is appended;
      // overwriting on the first write would discard the existing content
      // that append mode is meant to keep.
      const formatted = JSON.stringify(result)
      await writeOutput(formatted, resolvedOutputPath, true)

      if (grader) {
        logProgress(
          ` → passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
          progress,
        )
      }
    }
  } finally {
    // Always release the agent connection, including when a trial setup error aborts the run
    logProgress('Disconnecting...', progress)
    await client.disconnect()
  }

  logProgress('Done!', progress)
  return results
}
|
|
289
|
+
|
|
290
|
+
// ============================================================================
|
|
291
|
+
// CLI Entry Point
|
|
292
|
+
// ============================================================================
|
|
293
|
+
|
|
294
|
+
/**
 * Trials command CLI handler.
 *
 * @remarks
 * Parses the command line, validates it, loads the optional grader and hands
 * everything to `runTrials`. Any invalid input (missing positionals, a `-k`
 * that is not a positive integer, a `--timeout` that is not a non-negative
 * integer, a grader that fails to load, malformed `--mcp-server` JSON) prints
 * an `Error: ...` line to stderr and exits the process with code 1.
 *
 * @param args - Command line arguments (after 'trials')
 */
export const trials = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      k: { type: 'string', short: 'k', default: String(DEFAULT_TRIAL_COUNT) },
      cwd: { type: 'string', short: 'c' },
      timeout: { type: 'string', short: 't', default: String(DEFAULT_HARNESS_TIMEOUT) },
      progress: { type: 'boolean', default: false },
      append: { type: 'boolean', default: false },
      'mcp-server': { type: 'string', multiple: true },
      grader: { type: 'string', short: 'g' },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: acp-harness trials <prompts.jsonl> <command> [args...] [options]

Arguments:
prompts.jsonl Input file with evaluation prompts
command [args] ACP agent command to execute

Options:
-o, --output Output file (default: stdout)
-k Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
-c, --cwd Working directory for agent
-t, --timeout Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
--progress Show progress to stderr
--append Append to output file
--mcp-server MCP server config JSON (repeatable)
-g, --grader Path to grader (.ts/.js module or executable script)
-h, --help Show this help message

Output Format:
Without grader: Raw trials with trajectories
With grader: Trials plus pass@k metrics (passRate, passAtK, passExpK)

Graders:
TS/JS modules must export a 'grade' function.
Executable scripts (Python, etc.) use stdin/stdout JSON protocol.

Examples:
# Capture only
acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 -o trials.jsonl

# With TypeScript grader
acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 --grader ./grader.ts -o trials.jsonl

# With Python grader
acp-harness trials prompts.jsonl bunx claude-code-acp -k 5 --grader ./grader.py -o trials.jsonl
`)
    return
  }

  const promptsPath = positionals[0]
  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  const agentCommand = positionals.slice(1)
  if (agentCommand.length === 0) {
    console.error('Error: ACP agent command is required')
    process.exit(1)
  }

  // Validate numeric options before doing any work. Without this check a
  // non-numeric -k parses to NaN and the run silently executes zero trials.
  const k = Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10)
  if (!Number.isInteger(k) || k < 1) {
    console.error(`Error: -k must be a positive integer (got '${values.k}')`)
    process.exit(1)
  }

  const timeout = Number.parseInt(values.timeout ?? String(DEFAULT_HARNESS_TIMEOUT), 10)
  if (!Number.isInteger(timeout) || timeout < 0) {
    console.error(`Error: --timeout must be a non-negative integer in ms (got '${values.timeout}')`)
    process.exit(1)
  }

  // Load grader if specified
  let grader: Grader | undefined
  if (values.grader) {
    try {
      grader = await loadGrader(values.grader)
    } catch (error) {
      console.error(`Error: ${error instanceof Error ? error.message : error}`)
      process.exit(1)
    }
  }

  // Parse MCP server configurations; report malformed JSON as a CLI error
  // instead of letting the SyntaxError escape
  let mcpServers: unknown[]
  try {
    mcpServers = (values['mcp-server'] ?? []).map((json): unknown => JSON.parse(json))
  } catch (error) {
    console.error(`Error: invalid --mcp-server JSON: ${error instanceof Error ? error.message : error}`)
    process.exit(1)
  }

  await runTrials({
    promptsPath,
    agentCommand,
    k,
    outputPath: values.output,
    cwd: values.cwd,
    timeout,
    progress: values.progress ?? false,
    append: values.append ?? false,
    mcpServers,
    grader,
  })
}
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validate-refs command - check reference solutions against grader.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Validates that reference solutions in prompts.jsonl pass the grader.
|
|
6
|
+
* Helps identify prompts with broken or incorrect reference solutions.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { parseArgs } from 'node:util'
|
|
12
|
+
import { loadPrompts } from './capture.ts'
|
|
13
|
+
import { loadGrader } from './grader-loader.ts'
|
|
14
|
+
import type { Grader, ValidationResult } from './schemas.ts'
|
|
15
|
+
|
|
16
|
+
// ============================================================================
|
|
17
|
+
// Types
|
|
18
|
+
// ============================================================================
|
|
19
|
+
|
|
20
|
+
/**
 * Configuration for validate-refs command.
 *
 * @remarks
 * Consumed by `runValidateRefs`.
 */
export type ValidateRefsConfig = {
  /** Path to prompts.jsonl file (passed to `loadPrompts`); only prompts with a `reference` are validated */
  promptsPath: string
  /** Output file path, resolved against `process.cwd()` when relative; results go to stdout when omitted */
  outputPath?: string
  /** Grader function that is called once per reference solution */
  grader: Grader
}
|
|
29
|
+
|
|
30
|
+
// ============================================================================
|
|
31
|
+
// Helpers
|
|
32
|
+
// ============================================================================
|
|
33
|
+
|
|
34
|
+
/**
 * Turn a path into an absolute one based on process.cwd().
 *
 * Paths that already begin with `/` are returned untouched; anything else is
 * appended to the current working directory with a `/` separator.
 */
const resolvePath = (path: string): string => {
  const alreadyAbsolute = path.startsWith('/')
  return alreadyAbsolute ? path : `${process.cwd()}/${path}`
}
|
|
39
|
+
|
|
40
|
+
// ============================================================================
|
|
41
|
+
// Validate-Refs Implementation
|
|
42
|
+
// ============================================================================
|
|
43
|
+
|
|
44
|
+
/**
 * Execute validate-refs with configuration object.
 *
 * @remarks
 * Every prompt that carries a `reference` is graded with the reference text
 * as the output and an empty trajectory. All results are written together as
 * JSONL (to `outputPath`, or to stdout when it is not set) after the last
 * prompt has been graded. Progress and the summary go to stderr.
 *
 * @param config - Validate-refs configuration
 * @returns Array of validation results; empty when no prompt has a reference
 */
export const runValidateRefs = async (config: ValidateRefsConfig): Promise<ValidationResult[]> => {
  const { promptsPath, outputPath, grader } = config

  // Keep only the prompts that ship a reference solution
  const allPrompts = await loadPrompts(promptsPath)
  const withReference = allPrompts.filter(({ reference }) => reference !== undefined)

  if (withReference.length === 0) {
    console.error('No prompts with reference solutions found')
    return []
  }

  console.error(`Validating ${withReference.length} reference solutions...`)

  const results: ValidationResult[] = []
  const failures: ValidationResult[] = []

  for (const { id, input, expected, reference } of withReference) {
    const referenceText = reference as string
    const graderResult = await grader({
      input,
      output: referenceText,
      expected,
      trajectory: [], // No trajectory for reference validation
    })

    const entry: ValidationResult = {
      id,
      reference: referenceText,
      passes: graderResult.pass,
      graderResult,
    }
    results.push(entry)
    if (!entry.passes) {
      failures.push(entry)
    }

    console.error(` ${graderResult.pass ? '✓' : '✗'} ${id}`)
  }

  // Serialize as JSONL and emit it in one go
  const jsonl = results.map((entry) => JSON.stringify(entry)).join('\n')
  if (!outputPath) {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(jsonl)
  } else {
    await Bun.write(resolvePath(outputPath), jsonl)
  }

  // Summary
  const failed = failures.length
  const passed = results.length - failed
  console.error(`\nResults: ${passed} passed, ${failed} failed`)

  if (failed > 0) {
    console.error('\nFailing references:')
    for (const failure of failures) {
      console.error(` - ${failure.id}: ${failure.graderResult.reasoning ?? 'No reasoning'}`)
    }
  }

  return results
}
|
|
112
|
+
|
|
113
|
+
// ============================================================================
|
|
114
|
+
// CLI Entry Point
|
|
115
|
+
// ============================================================================
|
|
116
|
+
|
|
117
|
+
/**
 * Validate-refs command CLI handler.
 *
 * @remarks
 * Prints the usage text for `--help`. Otherwise it requires a prompts path
 * and a `--grader`, loads the grader and delegates to `runValidateRefs`.
 * A missing argument or a grader that fails to load prints an `Error: ...`
 * line to stderr and exits the process with code 1.
 *
 * @param args - Command line arguments (after 'validate-refs')
 */
export const validateRefs = async (args: string[]): Promise<void> => {
  const parsed = parseArgs({
    args,
    options: {
      output: { type: 'string', short: 'o' },
      grader: { type: 'string', short: 'g' },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })
  const { help, output, grader: graderPath } = parsed.values
  const [promptsPath] = parsed.positionals

  if (help) {
    const usage = `
Usage: acp-harness validate-refs <prompts.jsonl> --grader <grader.ts> [options]

Arguments:
prompts.jsonl Input file with prompts (must have 'reference' field)

Options:
-o, --output Output file (default: stdout)
-g, --grader Path to grader (.ts/.js module or executable script, required)
-h, --help Show this help message

Output:
JSONL with validation results for each reference solution.

Prompt Format:
{
"id": "test-001",
"input": "What is 2+2?",
"expected": "4",
"reference": "The answer is 4."
}

Examples:
acp-harness validate-refs prompts.jsonl --grader ./grader.ts -o validation.jsonl
`
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(usage)
    return
  }

  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  if (!graderPath) {
    console.error('Error: --grader is required for validate-refs')
    process.exit(1)
  }

  // Load grader; a load failure is reported and ends the process
  let grader: Grader
  try {
    grader = await loadGrader(graderPath)
  } catch (error) {
    const detail = error instanceof Error ? error.message : error
    console.error(`Error: ${detail}`)
    process.exit(1)
  }

  await runValidateRefs({ promptsPath, outputPath: output, grader })
}
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
# Accuracy and Confidence Standards
|
|
2
|
-
|
|
3
|
-
**Confidence Threshold**: 95% - Report uncertainty rather than guess
|
|
4
|
-
|
|
5
|
-
## Verification Protocol
|
|
6
|
-
|
|
7
|
-
1. **Verification First**: Before stating any specific implementation detail (function signature, file path, API schema), use the `typescript-lsp` skill to verify types and signatures, then read the relevant file in real-time to verify accuracy.
|
|
8
|
-
|
|
9
|
-
2. **Handling Uncertainty**: If you cannot verify information or find contradictions between instructions and live code, you must NOT provide speculative answers.
|
|
10
|
-
- **Action**: Clearly state you cannot answer with high confidence and explain the discrepancy.
|
|
11
|
-
- Example: "I cannot confirm [detail] because my instructions indicate [X], but the current file shows [Y]. My knowledge may be outdated."
|
|
12
|
-
|
|
13
|
-
3. **Dynamic Exploration**:
|
|
14
|
-
- **PREFER typescript-lsp over Grep/Glob** for `.ts`, `.tsx`, `.js`, `.jsx` files
|
|
15
|
-
- Use `lsp-find` to search for symbols, types, and patterns across the workspace
|
|
16
|
-
- Use `lsp-references` to find all usages of a symbol
|
|
17
|
-
- Use `lsp-hover` to verify type signatures
|
|
18
|
-
- Only fall back to Grep/Glob for non-TypeScript files or when LSP is unavailable
|
|
19
|
-
- Use Read for other file types. Always prioritize live code over instructions.
|
|
20
|
-
|
|
21
|
-
4. **Tool-Assisted Verification**: Use these skills to enhance verification accuracy:
|
|
22
|
-
- **`typescript-lsp` skill**: Use `lsp-hover` to verify type signatures, `lsp-references` to find all usages before modifying, `lsp-symbols` for file structure, and `lsp-find` to search for patterns across the workspace.
|
|
23
|
-
- **WebFetch**: Retrieve current documentation from authoritative sources (MDN Web Docs, WHATWG specs) when using web platform APIs.
|
|
24
|
-
- These skills complement (but do not replace) reading live code - always verify outputs against actual implementation.
|
|
25
|
-
|
|
26
|
-
## Certainty Requirements
|
|
27
|
-
|
|
28
|
-
You may only propose a specific change if you are **at least 95% certain** it is correct, based on direct comparison with current code.
|
|
29
|
-
|
|
30
|
-
**When uncertain:**
|
|
31
|
-
- Report the discrepancy clearly
|
|
32
|
-
- State why you cannot confidently recommend a fix
|
|
33
|
-
- Present the issue to the user for manual resolution
|
|
34
|
-
- DO NOT invent solutions or infer changes
|
|
35
|
-
|
|
36
|
-
## For Agent-Specific Applications
|
|
37
|
-
|
|
38
|
-
Agents should apply these standards to their specific domain:
|
|
39
|
-
|
|
40
|
-
- **Documentation agents**: Only update TSDoc if parameter names/types match current code
|
|
41
|
-
- **Architecture agents**: Verify referenced patterns exist in current codebase
|
|
42
|
-
- **Code review agents**: Read files before commenting on implementation details
|
|
43
|
-
- **Pattern agents**: Confirm examples reflect actual usage in codebase
|
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
# Bun Platform APIs
|
|
2
|
-
|
|
3
|
-
**IMPORTANT**: Prefer Bun's native APIs over Node.js equivalents when running in the Bun environment.
|
|
4
|
-
|
|
5
|
-
## File System Operations
|
|
6
|
-
|
|
7
|
-
- ✅ Use `Bun.file(path).exists()` instead of `fs.existsSync()`
|
|
8
|
-
- ✅ Use `Bun.file(path)` API for reading/writing files
|
|
9
|
-
- ✅ Use `Bun.write()` for efficient file writes
|
|
10
|
-
|
|
11
|
-
```typescript
|
|
12
|
-
// ✅ Good: Bun APIs
|
|
13
|
-
const exists = await Bun.file('config.json').exists()
|
|
14
|
-
const content = await Bun.file('data.txt').text()
|
|
15
|
-
await Bun.write('output.json', JSON.stringify(data))
|
|
16
|
-
|
|
17
|
-
// ❌ Avoid: Node.js equivalents
|
|
18
|
-
import { existsSync, readFileSync, writeFileSync } from 'node:fs'
|
|
19
|
-
const exists = existsSync('config.json')
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
## Shell Commands
|
|
23
|
-
|
|
24
|
-
- ✅ Use `Bun.$` template literal for shell commands
|
|
25
|
-
- ❌ Avoid `child_process.spawn()` or `child_process.exec()`
|
|
26
|
-
|
|
27
|
-
```typescript
|
|
28
|
-
// ✅ Good: Bun shell
|
|
29
|
-
await Bun.$`npm install`
|
|
30
|
-
const result = await Bun.$`git status`.text()
|
|
31
|
-
|
|
32
|
-
// ❌ Avoid: Node.js child_process
|
|
33
|
-
import { spawn } from 'node:child_process'
|
|
34
|
-
spawn('npm', ['install'])
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
## Path Resolution
|
|
38
|
-
|
|
39
|
-
- ✅ Use `Bun.resolveSync()` for module resolution
|
|
40
|
-
- ✅ Use `import.meta.dir` for current directory
|
|
41
|
-
- ⚠️ Keep `node:path` utilities for path manipulation (join, resolve, dirname)
|
|
42
|
-
|
|
43
|
-
```typescript
|
|
44
|
-
// ✅ Good: Bun + node:path combo
|
|
45
|
-
import { join } from 'node:path'
|
|
46
|
-
const configPath = join(import.meta.dir, 'config.json')
|
|
47
|
-
const resolved = Bun.resolveSync('./module', import.meta.dir)
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
## Package Management
|
|
51
|
-
|
|
52
|
-
- ✅ Use `Bun.which(cmd)` to check for executables
|
|
53
|
-
- ⚠️ No programmatic package manager API yet - use CLI commands via `Bun.$`
|
|
54
|
-
|
|
55
|
-
```typescript
|
|
56
|
-
// ✅ Good: Check for executable
|
|
57
|
-
const bunPath = Bun.which('bun')
|
|
58
|
-
if (!bunPath) throw new Error('bun not found')
|
|
59
|
-
|
|
60
|
-
// Install packages via shell
|
|
61
|
-
await Bun.$`bun add zod`
|
|
62
|
-
```
|
|
63
|
-
|
|
64
|
-
## Environment Detection
|
|
65
|
-
|
|
66
|
-
- ✅ Check `typeof Bun !== 'undefined'` for Bun runtime
|
|
67
|
-
- ✅ Use `Bun.which('bun')` to verify bun executable exists
|
|
68
|
-
|
|
69
|
-
## When to Use Node.js APIs
|
|
70
|
-
|
|
71
|
-
- Interactive input (readline)
|
|
72
|
-
- Complex path manipulation (prefer `node:path` utilities)
|
|
73
|
-
- APIs without Bun equivalents
|
|
74
|
-
|
|
75
|
-
## Documentation
|
|
76
|
-
|
|
77
|
-
- Main docs: https://bun.sh/docs
|
|
78
|
-
- Shell API: https://bun.sh/docs/runtime/shell
|
|
79
|
-
- File I/O: https://bun.sh/docs/api/file-io
|
|
80
|
-
- Runtime APIs: https://bun.sh/docs/runtime/bun-apis
|