@plaited/agent-eval-harness 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +15 -0
- package/README.md +273 -0
- package/bin/cli.ts +162 -0
- package/bin/tests/cli.spec.ts +529 -0
- package/package.json +67 -0
- package/src/commands/balance.ts +257 -0
- package/src/commands/calibrate.ts +313 -0
- package/src/commands/capture.ts +393 -0
- package/src/commands/summarize.ts +228 -0
- package/src/commands/tests/balance-helpers.spec.ts +279 -0
- package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
- package/src/commands/tests/capture-cli.spec.ts +190 -0
- package/src/commands/tests/capture-helpers.spec.ts +524 -0
- package/src/commands/tests/summarize-helpers.spec.ts +339 -0
- package/src/commands/tests/trials-calculations.spec.ts +209 -0
- package/src/commands/tests/trials-cli.spec.ts +147 -0
- package/src/commands/trials.ts +388 -0
- package/src/commands/validate-refs.ts +188 -0
- package/src/commands.ts +33 -0
- package/src/core/core.ts +25 -0
- package/src/core/loading.ts +96 -0
- package/src/core/output.ts +121 -0
- package/src/core/tests/core.spec.ts +309 -0
- package/src/core/trajectory.ts +166 -0
- package/src/core.ts +28 -0
- package/src/harness.ts +46 -0
- package/src/headless/headless-cli.ts +430 -0
- package/src/headless/headless-history-builder.ts +141 -0
- package/src/headless/headless-output-parser.ts +366 -0
- package/src/headless/headless-session-manager.ts +587 -0
- package/src/headless/headless.schemas.ts +310 -0
- package/src/headless/headless.types.ts +19 -0
- package/src/headless/tests/headless.spec.ts +678 -0
- package/src/headless.ts +72 -0
- package/src/integration_tests/claude.spec.ts +157 -0
- package/src/integration_tests/gemini.spec.ts +139 -0
- package/src/pipeline/compare.ts +325 -0
- package/src/pipeline/extract.ts +241 -0
- package/src/pipeline/format.ts +292 -0
- package/src/pipeline/grade.ts +169 -0
- package/src/pipeline/pipeline.ts +41 -0
- package/src/pipeline/pipeline.types.ts +241 -0
- package/src/pipeline/run.ts +412 -0
- package/src/pipeline/tests/pipeline.spec.ts +356 -0
- package/src/pipeline.ts +34 -0
- package/src/schemas/constants.ts +94 -0
- package/src/schemas/grader-loader.ts +174 -0
- package/src/schemas/schemas-cli.ts +239 -0
- package/src/schemas/schemas.ts +558 -0
- package/src/schemas/tests/constants.spec.ts +121 -0
- package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
- package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
- package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
- package/src/schemas/tests/fixtures/grader-exec.py +29 -0
- package/src/schemas/tests/fixtures/grader-module.ts +14 -0
- package/src/schemas/tests/grader-loader.spec.ts +153 -0
- package/src/schemas/tests/schemas-cli.spec.ts +142 -0
- package/src/schemas/tests/schemas.spec.ts +606 -0
- package/src/schemas.ts +90 -0
|
@@ -0,0 +1,388 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-run trials command for pass@k/pass^k analysis.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Runs each prompt k times to measure non-determinism.
|
|
6
|
+
* Without a grader, captures raw trials. With a grader, computes:
|
|
7
|
+
* - passRate: Simple pass rate (passes / k)
|
|
8
|
+
* - passAtK: Probability of at least one pass in k samples
|
|
9
|
+
* - passExpK: Probability of all k samples passing
|
|
10
|
+
*
|
|
11
|
+
* @packageDocumentation
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { parseArgs } from 'node:util'
|
|
15
|
+
import { extractOutput, extractTrajectory, loadPrompts, logProgress, resolvePath, writeOutput } from '../core.ts'
|
|
16
|
+
import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
|
|
17
|
+
import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
|
|
18
|
+
import { createSessionManager } from '../headless/headless-session-manager.ts'
|
|
19
|
+
import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
|
|
20
|
+
import { loadGrader } from '../schemas/grader-loader.ts'
|
|
21
|
+
import type { Grader, TrialEntry, TrialResult } from '../schemas.ts'
|
|
22
|
+
|
|
23
|
+
// ============================================================================
|
|
24
|
+
// Pass@k/Pass^k Calculation
|
|
25
|
+
// ============================================================================
|
|
26
|
+
|
|
27
|
+
/**
 * Calculate pass@k: probability of at least one pass in k samples.
 *
 * @remarks
 * Plugs the observed pass rate `p = passes / k` into `1 - (1 - p)^k`.
 *
 * This is NOT the unbiased estimator `1 - C(n-c, k) / C(n, k)`: because the
 * harness runs exactly `k` trials per prompt (n = k), that estimator would
 * degenerate to 1 whenever at least one trial passed and 0 otherwise.
 *
 * Edge cases:
 * - No passes (including `k = 0`, where no sample could have passed) yields 0.
 * - `passes >= k` yields 1.
 *
 * @param passes - Number of passing trials
 * @param k - Total number of trials
 * @returns Probability of at least one pass
 *
 * @public
 */
export const calculatePassAtK = (passes: number, k: number): number => {
  // Checked first so that zero samples never report a guaranteed pass.
  if (passes <= 0) return 0
  if (passes >= k) return 1

  const failRate = 1 - passes / k
  return 1 - failRate ** k
}
|
|
51
|
+
|
|
52
|
+
/**
 * Calculate pass^k: probability that all k samples pass.
 *
 * @remarks
 * Raises the observed pass rate (`passes / k`) to the k-th power. The two
 * boundary cases are answered directly: every trial passing gives 1, no
 * trial passing gives 0.
 *
 * @param passes - Number of passing trials
 * @param k - Total number of trials
 * @returns Probability of all k samples passing
 *
 * @public
 */
export const calculatePassExpK = (passes: number, k: number): number => {
  switch (passes) {
    case k:
      return 1
    case 0:
      return 0
    default:
      return (passes / k) ** k
  }
}
|
|
71
|
+
|
|
72
|
+
// ============================================================================
|
|
73
|
+
// Types
|
|
74
|
+
// ============================================================================
|
|
75
|
+
|
|
76
|
+
/**
 * Options accepted by {@link runTrials}.
 *
 * @remarks
 * Only `promptsPath`, `schemaPath` and `k` are required; every other field
 * is optional.
 */
export type TrialsConfig = {
  /** Location of the prompts.jsonl input file */
  promptsPath: string
  /** Location of the agent schema JSON file */
  schemaPath: string
  /** How many times each prompt is run */
  k: number
  /** Where results are written */
  outputPath?: string
  /** Directory the agent runs in */
  cwd?: string
  /** Per-prompt timeout in milliseconds; takes precedence over the schema's value */
  timeout?: number
  /** Emit progress messages to stderr */
  progress?: boolean
  /** Append to the output file */
  append?: boolean
  /** Grader function; optional */
  grader?: Grader
  /** Turn on debug mode */
  debug?: boolean
}
|
|
99
|
+
|
|
100
|
+
// ============================================================================
|
|
101
|
+
// Trials Implementation
|
|
102
|
+
// ============================================================================
|
|
103
|
+
|
|
104
|
+
/**
 * Execute trials with configuration object.
 *
 * @remarks
 * Loads and validates the agent schema, loads the prompts, then runs every
 * prompt `k` times, creating a fresh session for each trial. After a prompt's
 * trials finish, its result is serialized with `JSON.stringify` and handed to
 * `writeOutput` immediately.
 *
 * A trial whose agent turn or grader throws is recorded as a failed trial
 * (`pass: false`, with the error message in `reasoning`) instead of aborting
 * the run. The session is released in either case.
 *
 * Timeout precedence: `config.timeout`, then the schema's `timeout` (when
 * present), then `DEFAULT_HARNESS_TIMEOUT`.
 *
 * @param config - Trials configuration
 * @returns Array of trial results
 * @throws Error when the schema file does not exist or fails validation
 */
export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
  const {
    promptsPath,
    schemaPath,
    k,
    outputPath,
    cwd,
    timeout,
    progress = false,
    append = false,
    grader,
    debug = false,
  } = config

  // Load and validate schema
  const schemaFile = Bun.file(schemaPath)
  if (!(await schemaFile.exists())) {
    throw new Error(`Schema file not found: ${schemaPath}`)
  }

  let schema: HeadlessAdapterConfig
  try {
    const rawSchema = await schemaFile.json()
    schema = parseHeadlessConfig(rawSchema)
  } catch (error) {
    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
  }

  // Load prompts
  const prompts = await loadPrompts(promptsPath)

  // Resolve output path
  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined

  // Determine effective timeout (CLI flag > schema default > harness default)
  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT

  // Log progress info
  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
  logProgress(`Running ${k} trials per prompt`, progress)
  logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
  logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
  if (grader) {
    logProgress('Grader: enabled (will compute pass@k metrics)', progress)
  }

  // Create session manager with schema
  const sessions = createSessionManager({
    schema,
    timeout: effectiveTimeout,
    verbose: progress,
    debug,
  })

  // Clear output file if not appending
  if (resolvedOutputPath && !append) {
    await Bun.write(resolvedOutputPath, '')
  }

  const workingDir = cwd ?? process.cwd()
  const results: TrialResult[] = []
  let isFirstOutput = true

  // Run evaluations
  for (let i = 0; i < prompts.length; i++) {
    const promptCase = prompts[i]
    if (!promptCase) continue

    logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)

    const trialEntries: TrialEntry[] = []

    for (let trialNum = 1; trialNum <= k; trialNum++) {
      // Create fresh session for each trial
      const session = await sessions.create(workingDir)
      const startTime = Date.now()

      try {
        // Handle string or array input
        const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
        const allUpdates: ParsedUpdate[] = []

        // TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented

        // Execute each turn sequentially
        for (const turnInput of inputs) {
          const turnResult = await sessions.prompt(session.id, turnInput)
          allUpdates.push(...turnResult.updates)
        }

        const endTime = Date.now()
        const trajectory = extractTrajectory(allUpdates, startTime)
        const output = extractOutput(trajectory)

        const entry: TrialEntry = {
          trialNum,
          output,
          trajectory,
          duration: endTime - startTime,
        }

        // Apply grader if provided
        if (grader) {
          const graderResult = await grader({
            input: promptCase.input,
            output,
            hint: promptCase.hint,
            trajectory,
          })
          entry.pass = graderResult.pass
          entry.score = graderResult.score
          entry.reasoning = graderResult.reasoning
        }

        trialEntries.push(entry)
        logProgress(
          ` Trial ${trialNum}/${k}: ${entry.pass !== undefined ? (entry.pass ? '✓' : '✗') : '?'}`,
          progress,
        )
      } catch (error) {
        const endTime = Date.now()
        const message = error instanceof Error ? error.message : String(error)

        // Record the failure as a failed trial so the remaining trials still run
        trialEntries.push({
          trialNum,
          output: '',
          trajectory: [],
          duration: endTime - startTime,
          pass: false,
          reasoning: `Error: ${message}`,
        })
        logProgress(` Trial ${trialNum}/${k}: ! (error)`, progress)
      } finally {
        // Clean up session on both the success and the error path. A cleanup
        // failure is reported but must not add a second entry for this trial
        // or abort the run.
        try {
          sessions.destroy(session.id)
        } catch (cleanupError) {
          const detail = cleanupError instanceof Error ? cleanupError.message : String(cleanupError)
          logProgress(` Trial ${trialNum}/${k}: session cleanup failed (${detail})`, progress)
        }
      }
    }

    // Build result
    const result: TrialResult = {
      id: promptCase.id,
      input: promptCase.input,
      ...(promptCase.hint && { hint: promptCase.hint }),
      k,
      trials: trialEntries,
    }

    // Calculate metrics if grader was used
    if (grader) {
      const passes = trialEntries.filter((t) => t.pass).length
      result.passRate = passes / k
      result.passAtK = calculatePassAtK(passes, k)
      result.passExpK = calculatePassExpK(passes, k)
    }

    results.push(result)

    // Write result immediately
    // NOTE(review): the third argument is `!isFirstOutput`, so the `append`
    // option only affects the clearing step above. Confirm that writeOutput
    // does not overwrite existing file content on the first write when
    // `append` is true.
    const formatted = JSON.stringify(result)
    await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
    isFirstOutput = false

    if (grader) {
      logProgress(
        ` → passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
        progress,
      )
    }
  }

  logProgress('Done!', progress)
  return results
}
|
|
284
|
+
|
|
285
|
+
// ============================================================================
|
|
286
|
+
// CLI Entry Point
|
|
287
|
+
// ============================================================================
|
|
288
|
+
|
|
289
|
+
/**
 * Parse a CLI option value as a base-10 integer of at least `min`.
 *
 * @param raw - Raw option string from the command line
 * @param min - Smallest accepted value (inclusive)
 * @returns The parsed integer, or `undefined` when `raw` is not made up solely
 * of decimal digits (after trimming) or is below `min`
 */
const parseIntegerOption = (raw: string, min: number): number | undefined => {
  const trimmed = raw.trim()
  // Digits only: rejects '', 'abc', '5abc', '3.7', '-1' and '1e3'.
  if (!/^\d+$/.test(trimmed)) return undefined
  const parsed = Number.parseInt(trimmed, 10)
  return parsed >= min ? parsed : undefined
}

/**
 * Trials command CLI handler.
 *
 * @remarks
 * Parses the command line, prints usage for `--help`, and otherwise validates
 * the arguments before delegating to {@link runTrials}. Any invalid or missing
 * argument is reported on stderr and terminates the process with exit code 1:
 * - the prompts path and `--schema` are required
 * - `-k` must be an integer of at least 1
 * - `--timeout`, when given, must be a non-negative integer
 * - `--grader`, when given, must load successfully
 *
 * @param args - Command line arguments (after 'trials')
 */
export const trials = async (args: string[]): Promise<void> => {
  const { values, positionals } = parseArgs({
    args,
    options: {
      schema: { type: 'string', short: 's' },
      output: { type: 'string', short: 'o' },
      k: { type: 'string', short: 'k', default: String(DEFAULT_TRIAL_COUNT) },
      cwd: { type: 'string', short: 'c' },
      timeout: { type: 'string', short: 't' },
      progress: { type: 'boolean', default: false },
      append: { type: 'boolean', default: false },
      grader: { type: 'string', short: 'g' },
      debug: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
    allowPositionals: true,
  })

  if (values.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]

Arguments:
prompts.jsonl Input file with evaluation prompts

Options:
-s, --schema Path to agent schema JSON file (required)
-o, --output Output file (default: stdout)
-k Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
-c, --cwd Working directory for agent
-t, --timeout Request timeout in ms (overrides schema default)
--progress Show progress to stderr
--append Append to output file
-g, --grader Path to grader (.ts/.js module or executable script)
--debug Enable debug mode
-h, --help Show this help message

Output Format:
Without grader: Raw trials with trajectories
With grader: Trials plus pass@k metrics (passRate, passAtK, passExpK)

Graders:
TS/JS modules must export a 'grade' function.
Executable scripts (Python, etc.) use stdin/stdout JSON protocol.

Examples:
# Capture only
agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -o trials.jsonl

# With TypeScript grader
agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl

# With Python grader
agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.py -o trials.jsonl
`)
    return
  }

  const promptsPath = positionals[0]
  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  if (!values.schema) {
    console.error('Error: --schema is required')
    console.error('Example: agent-eval-harness trials prompts.jsonl --schema ./claude.json')
    process.exit(1)
  }

  // A NaN or zero trial count would run no trials and produce NaN metrics,
  // so reject it up front instead of passing it through.
  const rawK = values.k ?? String(DEFAULT_TRIAL_COUNT)
  const k = parseIntegerOption(rawK, 1)
  if (k === undefined) {
    console.error(`Error: -k must be a positive integer (received "${rawK}")`)
    process.exit(1)
  }

  // An empty --timeout value keeps falling back to the schema/harness default.
  let timeout: number | undefined
  if (values.timeout) {
    timeout = parseIntegerOption(values.timeout, 0)
    if (timeout === undefined) {
      console.error(`Error: --timeout must be a non-negative integer in ms (received "${values.timeout}")`)
      process.exit(1)
    }
  }

  // Load grader if specified
  let grader: Grader | undefined
  if (values.grader) {
    try {
      grader = await loadGrader(values.grader)
    } catch (error) {
      console.error(`Error: ${error instanceof Error ? error.message : error}`)
      process.exit(1)
    }
  }

  await runTrials({
    promptsPath,
    schemaPath: values.schema,
    k,
    outputPath: values.output,
    cwd: values.cwd,
    timeout,
    progress: values.progress ?? false,
    append: values.append ?? false,
    grader,
    debug: values.debug ?? false,
  })
}
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validate-refs command - check reference solutions against grader.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Validates that reference solutions in prompts.jsonl pass the grader.
|
|
6
|
+
* Helps identify prompts with broken or incorrect reference solutions.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { parseArgs } from 'node:util'
|
|
12
|
+
import { loadGrader } from '../schemas/grader-loader.ts'
|
|
13
|
+
import type { Grader, ValidationResult } from '../schemas.ts'
|
|
14
|
+
import { loadPrompts } from './capture.ts'
|
|
15
|
+
|
|
16
|
+
// ============================================================================
|
|
17
|
+
// Types
|
|
18
|
+
// ============================================================================
|
|
19
|
+
|
|
20
|
+
/**
 * Options accepted by {@link runValidateRefs}.
 *
 * @remarks
 * `promptsPath` and `grader` are required; `outputPath` is optional.
 */
export type ValidateRefsConfig = {
  /** Location of the prompts.jsonl input file */
  promptsPath: string
  /** Where results are written */
  outputPath?: string
  /** Grader applied to each reference solution */
  grader: Grader
}
|
|
29
|
+
|
|
30
|
+
// ============================================================================
|
|
31
|
+
// Helpers
|
|
32
|
+
// ============================================================================
|
|
33
|
+
|
|
34
|
+
/**
 * Turn a path into an absolute one.
 *
 * A path that already begins with `/` is returned untouched; anything else
 * is appended to `process.cwd()` with a `/` separator.
 */
const resolvePath = (path: string): string =>
  path.startsWith('/') ? path : [process.cwd(), path].join('/')
|
|
39
|
+
|
|
40
|
+
// ============================================================================
|
|
41
|
+
// Validate-Refs Implementation
|
|
42
|
+
// ============================================================================
|
|
43
|
+
|
|
44
|
+
/**
 * Run the grader against every reference solution in a prompts file.
 *
 * @remarks
 * Prompts without a `reference` field are skipped. Each remaining reference
 * is passed to the grader as the `output`, with an empty trajectory. The
 * results are serialized one JSON document per line and written to
 * `outputPath` when given, otherwise printed to stdout. Progress lines and
 * the pass/fail summary are written to stderr.
 *
 * @param config - Validate-refs configuration
 * @returns One validation result per prompt that has a reference solution
 * (an empty array when no prompt has one)
 */
export const runValidateRefs = async (config: ValidateRefsConfig): Promise<ValidationResult[]> => {
  const { promptsPath, outputPath, grader } = config

  // Only prompts that carry a reference solution are of interest
  const allPrompts = await loadPrompts(promptsPath)
  const candidates = allPrompts.filter((p) => p.reference !== undefined)

  if (candidates.length === 0) {
    console.error('No prompts with reference solutions found')
    return []
  }

  console.error(`Validating ${candidates.length} reference solutions...`)

  // Grade the references one at a time, reporting each verdict as it arrives
  const results: ValidationResult[] = []
  for (const prompt of candidates) {
    const reference = prompt.reference as string
    const graderResult = await grader({
      input: prompt.input,
      output: reference,
      hint: prompt.hint,
      trajectory: [], // a reference solution has no trajectory
    })

    results.push({ id: prompt.id, reference, passes: graderResult.pass, graderResult })
    console.error(` ${graderResult.pass ? '✓' : '✗'} ${prompt.id}`)
  }

  // Emit one JSON document per line
  const serialized = results.map((entry) => JSON.stringify(entry)).join('\n')
  if (outputPath) {
    await Bun.write(resolvePath(outputPath), serialized)
  } else {
    // biome-ignore lint/suspicious/noConsole: CLI stdout output
    console.log(serialized)
  }

  // Summarize, then list every failing reference with the grader's reasoning
  const failures = results.filter((entry) => !entry.passes)
  const passed = results.length - failures.length
  const failed = failures.length
  console.error(`\nResults: ${passed} passed, ${failed} failed`)

  if (failed > 0) {
    console.error('\nFailing references:')
    for (const failure of failures) {
      console.error(` - ${failure.id}: ${failure.graderResult.reasoning ?? 'No reasoning'}`)
    }
  }

  return results
}
|
|
112
|
+
|
|
113
|
+
// ============================================================================
|
|
114
|
+
// CLI Entry Point
|
|
115
|
+
// ============================================================================
|
|
116
|
+
|
|
117
|
+
/**
 * CLI entry point for the validate-refs command.
 *
 * @remarks
 * Prints usage for `--help`. Otherwise requires a prompts path and a
 * `--grader`, loads the grader, and delegates to {@link runValidateRefs}.
 * A missing argument or a grader that fails to load is reported on stderr
 * and terminates the process with exit code 1.
 *
 * @param args - Command line arguments (after 'validate-refs')
 */
export const validateRefs = async (args: string[]): Promise<void> => {
  const { values: flags, positionals } = parseArgs({
    args,
    allowPositionals: true,
    options: {
      output: { type: 'string', short: 'o' },
      grader: { type: 'string', short: 'g' },
      help: { type: 'boolean', short: 'h' },
    },
  })

  if (flags.help) {
    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
Usage: agent-eval-harness validate-refs <prompts.jsonl> --grader <grader.ts> [options]

Arguments:
prompts.jsonl Input file with prompts (must have 'reference' field)

Options:
-o, --output Output file (default: stdout)
-g, --grader Path to grader (.ts/.js module or executable script, required)
-h, --help Show this help message

Output:
JSONL with validation results for each reference solution.

Prompt Format:
{
"id": "test-001",
"input": "What is 2+2?",
"expected": "4",
"reference": "The answer is 4."
}

Examples:
agent-eval-harness validate-refs prompts.jsonl --grader ./grader.ts -o validation.jsonl
`)
    return
  }

  // Both the prompts file and the grader are mandatory
  const [promptsPath] = positionals
  if (!promptsPath) {
    console.error('Error: prompts.jsonl path is required')
    process.exit(1)
  }

  const graderPath = flags.grader
  if (!graderPath) {
    console.error('Error: --grader is required for validate-refs')
    process.exit(1)
  }

  // A grader that cannot be loaded is fatal
  let grader: Grader
  try {
    grader = await loadGrader(graderPath)
  } catch (error) {
    console.error(`Error: ${error instanceof Error ? error.message : error}`)
    process.exit(1)
  }

  await runValidateRefs({ promptsPath, outputPath: flags.output, grader })
}
|
package/src/commands.ts
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CLI command implementations for agent evaluation harness.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Re-exports all CLI commands for programmatic use.
|
|
6
|
+
* For CLI usage, run `agent-eval-harness <command> --help`.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
// Balance command
|
|
12
|
+
export type { BalanceConfig } from './commands/balance.ts'
|
|
13
|
+
export { balance, runBalance } from './commands/balance.ts'
|
|
14
|
+
|
|
15
|
+
// Calibrate command
|
|
16
|
+
export type { CalibrateConfig } from './commands/calibrate.ts'
|
|
17
|
+
export { calibrate, runCalibrate } from './commands/calibrate.ts'
|
|
18
|
+
|
|
19
|
+
// Capture command
|
|
20
|
+
export type { CaptureConfig } from './commands/capture.ts'
|
|
21
|
+
export { capture, runCapture } from './commands/capture.ts'
|
|
22
|
+
|
|
23
|
+
// Summarize command
|
|
24
|
+
export type { SummarizeConfig } from './commands/summarize.ts'
|
|
25
|
+
export { runSummarize, summarize } from './commands/summarize.ts'
|
|
26
|
+
|
|
27
|
+
// Trials command
|
|
28
|
+
export type { TrialsConfig } from './commands/trials.ts'
|
|
29
|
+
export { runTrials, trials } from './commands/trials.ts'
|
|
30
|
+
|
|
31
|
+
// Validate-refs command
|
|
32
|
+
export type { ValidateRefsConfig } from './commands/validate-refs.ts'
|
|
33
|
+
export { runValidateRefs, validateRefs } from './commands/validate-refs.ts'
|
package/src/core/core.ts
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core utilities for agent-eval-harness.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Re-exports shared utilities used across all commands:
|
|
6
|
+
* - Loading: JSONL file parsing for prompts and results
|
|
7
|
+
* - Trajectory: Extraction and analysis of agent trajectories
|
|
8
|
+
* - Output: Writing results, progress logging, path resolution
|
|
9
|
+
*
|
|
10
|
+
* @packageDocumentation
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
// Loading utilities
|
|
14
|
+
export { loadJsonl, loadPrompts, loadResults } from './loading.ts'
|
|
15
|
+
// Output utilities
|
|
16
|
+
export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
|
|
17
|
+
// Trajectory utilities
|
|
18
|
+
export {
|
|
19
|
+
detectTrajectoryRichness,
|
|
20
|
+
extractContent,
|
|
21
|
+
extractFilePath,
|
|
22
|
+
extractOutput,
|
|
23
|
+
extractTrajectory,
|
|
24
|
+
hasToolErrors,
|
|
25
|
+
} from './trajectory.ts'
|