@plaited/agent-eval-harness 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/package.json +2 -2
- package/src/commands/balance.ts +1 -11
- package/src/commands/calibrate.ts +2 -10
- package/src/commands/capture.ts +56 -141
- package/src/commands/execution.ts +245 -0
- package/src/commands/tests/capture-cli.spec.ts +27 -0
- package/src/commands/tests/trials-cli.spec.ts +28 -0
- package/src/commands/trials.ts +49 -149
- package/src/commands/validate-refs.ts +3 -19
- package/src/core/core.ts +9 -1
- package/src/core/loading.ts +38 -0
- package/src/core.ts +1 -0
- package/src/schemas/grader-loader.ts +23 -6
- package/src/schemas/schemas-cli.ts +1 -6
- package/src/schemas.ts +1 -1
package/README.md
CHANGED
|
@@ -25,7 +25,7 @@ export ANTHROPIC_API_KEY=sk-... # For Claude
|
|
|
25
25
|
export GEMINI_API_KEY=... # For Gemini
|
|
26
26
|
```
|
|
27
27
|
|
|
28
|
-
Pre-built schemas are available in `.
|
|
28
|
+
Pre-built schemas are available in `.agents/skills/headless-adapters/schemas/` for Claude and Gemini.
|
|
29
29
|
|
|
30
30
|
### Core Commands
|
|
31
31
|
|
|
@@ -98,11 +98,11 @@ bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparis
|
|
|
98
98
|
**Install skills** for use with AI coding agents:
|
|
99
99
|
|
|
100
100
|
```bash
|
|
101
|
-
|
|
101
|
+
npx skills add plaited/agent-eval-harness
|
|
102
|
+
# or
|
|
103
|
+
bunx skills add plaited/agent-eval-harness
|
|
102
104
|
```
|
|
103
105
|
|
|
104
|
-
Replace `<agent-name>` with your agent: `claude`, `cursor`, `copilot`, `opencode`, `amp`, `goose`, `factory`
|
|
105
|
-
|
|
106
106
|
### Available Skills
|
|
107
107
|
|
|
108
108
|
#### Agent Eval Harness
|
|
@@ -416,7 +416,7 @@ ANTHROPIC_API_KEY=sk-... GEMINI_API_KEY=... \
|
|
|
416
416
|
## Requirements
|
|
417
417
|
|
|
418
418
|
- **Runtime:** Bun >= 1.2.9
|
|
419
|
-
- **Schema:** JSON schema describing CLI agent interaction (see `.
|
|
419
|
+
- **Schema:** JSON schema describing CLI agent interaction (see `.agents/skills/headless-adapters/schemas/`)
|
|
420
420
|
- **API Key:** `ANTHROPIC_API_KEY` for Claude, `GEMINI_API_KEY` for Gemini
|
|
421
421
|
|
|
422
422
|
## License
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@plaited/agent-eval-harness",
|
|
3
|
-
"version": "0.10.0",
|
|
3
|
+
"version": "0.12.0",
|
|
4
4
|
"description": "CLI tool for capturing agent trajectories from headless CLI agents",
|
|
5
5
|
"license": "ISC",
|
|
6
6
|
"engines": {
|
|
@@ -56,7 +56,7 @@
|
|
|
56
56
|
]
|
|
57
57
|
},
|
|
58
58
|
"dependencies": {
|
|
59
|
-
"@plaited/development-skills": "0.
|
|
59
|
+
"@plaited/development-skills": "0.7.0",
|
|
60
60
|
"zod": "^4.3.6"
|
|
61
61
|
},
|
|
62
62
|
"devDependencies": {
|
package/src/commands/balance.ts
CHANGED
|
@@ -9,8 +9,8 @@
|
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
11
|
import { parseArgs } from 'node:util'
|
|
12
|
+
import { loadPrompts, resolvePath } from '../core.ts'
|
|
12
13
|
import type { BalanceAnalysis, CategoryDistribution, PromptCase } from '../schemas.ts'
|
|
13
|
-
import { loadPrompts } from './capture.ts'
|
|
14
14
|
|
|
15
15
|
// ============================================================================
|
|
16
16
|
// Types
|
|
@@ -28,16 +28,6 @@ export type BalanceConfig = {
|
|
|
28
28
|
threshold?: number
|
|
29
29
|
}
|
|
30
30
|
|
|
31
|
-
// ============================================================================
|
|
32
|
-
// Helpers
|
|
33
|
-
// ============================================================================
|
|
34
|
-
|
|
35
|
-
/** Resolve path relative to process.cwd() */
|
|
36
|
-
const resolvePath = (path: string): string => {
|
|
37
|
-
if (path.startsWith('/')) return path
|
|
38
|
-
return `${process.cwd()}/${path}`
|
|
39
|
-
}
|
|
40
|
-
|
|
41
31
|
/**
|
|
42
32
|
* Analyze category distribution across prompts.
|
|
43
33
|
*
|
|
package/src/commands/calibrate.ts
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
import { parseArgs } from 'node:util'
|
|
12
12
|
import { loadResults, resolvePath } from '../core.ts'
|
|
13
13
|
import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from '../schemas/constants.ts'
|
|
14
|
-
import { loadGrader } from '../schemas/grader-loader.ts'
|
|
14
|
+
import { loadGraderOrExit } from '../schemas/grader-loader.ts'
|
|
15
15
|
import type { CalibrationSample, Grader, GraderResult, TrajectoryStep } from '../schemas.ts'
|
|
16
16
|
|
|
17
17
|
// ============================================================================
|
|
@@ -293,15 +293,7 @@ Examples:
|
|
|
293
293
|
}
|
|
294
294
|
|
|
295
295
|
// Load grader if specified
|
|
296
|
-
let grader: Grader | undefined
|
|
297
|
-
if (values.grader) {
|
|
298
|
-
try {
|
|
299
|
-
grader = await loadGrader(values.grader)
|
|
300
|
-
} catch (error) {
|
|
301
|
-
console.error(`Error: ${error instanceof Error ? error.message : error}`)
|
|
302
|
-
process.exit(1)
|
|
303
|
-
}
|
|
304
|
-
}
|
|
296
|
+
const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
|
|
305
297
|
|
|
306
298
|
await runCalibrate({
|
|
307
299
|
resultsPath,
|
package/src/commands/capture.ts
CHANGED
|
@@ -11,28 +11,22 @@
|
|
|
11
11
|
* @packageDocumentation
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
-
import { mkdir } from 'node:fs/promises'
|
|
15
14
|
import { parseArgs } from 'node:util'
|
|
16
15
|
import {
|
|
17
16
|
createWorkspaceDir,
|
|
18
|
-
createWriteMutex,
|
|
19
17
|
detectTrajectoryRichness,
|
|
20
18
|
extractOutput,
|
|
21
19
|
extractTrajectory,
|
|
22
20
|
getInputPreview,
|
|
23
21
|
hasToolErrors,
|
|
24
|
-
loadPrompts,
|
|
25
22
|
logProgress,
|
|
26
|
-
|
|
27
|
-
runWorkerPool,
|
|
28
|
-
writeOutput,
|
|
23
|
+
readStdinPrompts,
|
|
29
24
|
} from '../core.ts'
|
|
30
|
-
import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
|
|
31
25
|
import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
|
|
32
|
-
import {
|
|
33
|
-
import {
|
|
34
|
-
import {
|
|
35
|
-
import type
|
|
26
|
+
import type { ProcessExitInfo, PromptResult } from '../headless/headless-session-manager.ts'
|
|
27
|
+
import { loadGraderOrExit } from '../schemas/grader-loader.ts'
|
|
28
|
+
import type { CaptureResult, PromptCase, TrajectoryRichness } from '../schemas.ts'
|
|
29
|
+
import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
|
|
36
30
|
|
|
37
31
|
// ============================================================================
|
|
38
32
|
// Re-exports for backward compatibility
|
|
@@ -55,30 +49,7 @@ export {
|
|
|
55
49
|
// ============================================================================
|
|
56
50
|
|
|
57
51
|
/** Configuration for capture command */
|
|
58
|
-
export type CaptureConfig =
|
|
59
|
-
/** Path to prompts.jsonl file */
|
|
60
|
-
promptsPath: string
|
|
61
|
-
/** Path to agent schema JSON file */
|
|
62
|
-
schemaPath: string
|
|
63
|
-
/** Output file path (undefined for stdout) */
|
|
64
|
-
outputPath?: string
|
|
65
|
-
/** Working directory for agent */
|
|
66
|
-
cwd?: string
|
|
67
|
-
/** Timeout per prompt in milliseconds (overrides schema default) */
|
|
68
|
-
timeout?: number
|
|
69
|
-
/** Show progress to stderr */
|
|
70
|
-
progress?: boolean
|
|
71
|
-
/** Append to output file instead of overwriting */
|
|
72
|
-
append?: boolean
|
|
73
|
-
/** Optional grader function */
|
|
74
|
-
grader?: Grader
|
|
75
|
-
/** Enable debug mode for detailed output */
|
|
76
|
-
debug?: boolean
|
|
77
|
-
/** Number of concurrent workers (default: 1 for sequential) */
|
|
78
|
-
concurrency?: number
|
|
79
|
-
/** Base directory for per-prompt workspace isolation */
|
|
80
|
-
workspaceDir?: string
|
|
81
|
-
}
|
|
52
|
+
export type CaptureConfig = BaseExecutionConfig
|
|
82
53
|
|
|
83
54
|
// ============================================================================
|
|
84
55
|
// Capture Implementation
|
|
@@ -95,51 +66,25 @@ export type CaptureConfig = {
|
|
|
95
66
|
* @returns Array of capture results
|
|
96
67
|
*/
|
|
97
68
|
export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
|
|
69
|
+
const ctx = await prepareExecution(config)
|
|
98
70
|
const {
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
71
|
+
schema,
|
|
72
|
+
prompts,
|
|
73
|
+
sessions,
|
|
74
|
+
resolvedOutputPath,
|
|
75
|
+
resolvedWorkspaceDir,
|
|
76
|
+
defaultWorkingDir,
|
|
77
|
+
progress,
|
|
106
78
|
grader,
|
|
107
|
-
debug
|
|
108
|
-
|
|
109
|
-
workspaceDir,
|
|
110
|
-
} = config
|
|
111
|
-
|
|
112
|
-
// Load and validate schema
|
|
113
|
-
const schemaFile = Bun.file(schemaPath)
|
|
114
|
-
if (!(await schemaFile.exists())) {
|
|
115
|
-
throw new Error(`Schema file not found: ${schemaPath}`)
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
let schema: HeadlessAdapterConfig
|
|
119
|
-
try {
|
|
120
|
-
const rawSchema = await schemaFile.json()
|
|
121
|
-
schema = parseHeadlessConfig(rawSchema)
|
|
122
|
-
} catch (error) {
|
|
123
|
-
throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
// Load prompts
|
|
127
|
-
const prompts = await loadPrompts(promptsPath)
|
|
128
|
-
|
|
129
|
-
// Resolve paths
|
|
130
|
-
const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
|
|
131
|
-
const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
|
|
132
|
-
|
|
133
|
-
// Determine effective timeout (CLI flag > schema default > harness default)
|
|
134
|
-
const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
|
|
135
|
-
const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
|
|
79
|
+
debug,
|
|
80
|
+
} = ctx
|
|
136
81
|
|
|
137
82
|
// Log progress info
|
|
138
|
-
logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
|
|
139
|
-
logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
|
|
140
|
-
logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
|
|
141
|
-
if (concurrency > 1) {
|
|
142
|
-
logProgress(`Concurrency: ${concurrency} workers`, progress)
|
|
83
|
+
logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
|
|
84
|
+
logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
|
|
85
|
+
logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
|
|
86
|
+
if (ctx.concurrency > 1) {
|
|
87
|
+
logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
|
|
143
88
|
}
|
|
144
89
|
if (resolvedWorkspaceDir) {
|
|
145
90
|
logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
|
|
@@ -151,31 +96,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
151
96
|
logProgress(`Debug mode: enabled`, progress)
|
|
152
97
|
}
|
|
153
98
|
|
|
154
|
-
// Create session manager with schema
|
|
155
|
-
const sessions = createSessionManager({
|
|
156
|
-
schema,
|
|
157
|
-
timeout: effectiveTimeout,
|
|
158
|
-
verbose: progress,
|
|
159
|
-
debug,
|
|
160
|
-
})
|
|
161
|
-
|
|
162
|
-
// Clear output file if not appending
|
|
163
|
-
if (resolvedOutputPath && !append) {
|
|
164
|
-
await Bun.write(resolvedOutputPath, '')
|
|
165
|
-
}
|
|
166
|
-
|
|
167
|
-
// Create workspace base directory if specified
|
|
168
|
-
// Uses fs.mkdir instead of shell to prevent command injection
|
|
169
|
-
if (resolvedWorkspaceDir) {
|
|
170
|
-
await mkdir(resolvedWorkspaceDir, { recursive: true })
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
const defaultWorkingDir = cwd ?? process.cwd()
|
|
174
|
-
|
|
175
|
-
// Create write mutex for coordinating JSONL output
|
|
176
|
-
const writeMutex = createWriteMutex()
|
|
177
|
-
let isFirstOutput = true
|
|
178
|
-
|
|
179
99
|
// Process a single prompt (used by worker pool)
|
|
180
100
|
const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise<CaptureResult> => {
|
|
181
101
|
// Determine working directory (per-prompt workspace or default)
|
|
@@ -301,11 +221,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
301
221
|
}
|
|
302
222
|
|
|
303
223
|
// Write result immediately (coordinated via mutex for concurrent writes)
|
|
304
|
-
await
|
|
305
|
-
const formatted = JSON.stringify(result)
|
|
306
|
-
await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
|
|
307
|
-
isFirstOutput = false
|
|
308
|
-
})
|
|
224
|
+
await ctx.writeResult(result)
|
|
309
225
|
|
|
310
226
|
const statusIcon = result.toolErrors ? '!' : '✓'
|
|
311
227
|
const exitInfo = result.metadata?.timedOut
|
|
@@ -319,20 +235,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
|
|
|
319
235
|
}
|
|
320
236
|
|
|
321
237
|
// Run with worker pool
|
|
322
|
-
|
|
323
|
-
concurrency,
|
|
324
|
-
onProgress: (completed, total) => {
|
|
325
|
-
logProgress(`Progress: ${completed}/${total} prompts completed`, progress)
|
|
326
|
-
},
|
|
327
|
-
})
|
|
328
|
-
|
|
329
|
-
// Log any errors that occurred
|
|
330
|
-
if (errors.length > 0) {
|
|
331
|
-
logProgress(`Completed with ${errors.length} error(s)`, progress)
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
logProgress('Done!', progress)
|
|
335
|
-
return results
|
|
238
|
+
return executePrompts(ctx, processPrompt)
|
|
336
239
|
}
|
|
337
240
|
|
|
338
241
|
// ============================================================================
|
|
@@ -356,6 +259,7 @@ export const capture = async (args: string[]): Promise<void> => {
|
|
|
356
259
|
append: { type: 'boolean', default: false },
|
|
357
260
|
grader: { type: 'string', short: 'g' },
|
|
358
261
|
debug: { type: 'boolean', default: false },
|
|
262
|
+
stdin: { type: 'boolean', default: false },
|
|
359
263
|
concurrency: { type: 'string', short: 'j' },
|
|
360
264
|
'workspace-dir': { type: 'string' },
|
|
361
265
|
help: { type: 'boolean', short: 'h' },
|
|
@@ -366,6 +270,7 @@ export const capture = async (args: string[]): Promise<void> => {
|
|
|
366
270
|
if (values.help) {
|
|
367
271
|
console.log(`
|
|
368
272
|
Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]
|
|
273
|
+
cat prompts.jsonl | agent-eval-harness capture --stdin --schema <schema.json> [options]
|
|
369
274
|
|
|
370
275
|
Arguments:
|
|
371
276
|
prompts.jsonl Input file with evaluation prompts
|
|
@@ -376,6 +281,7 @@ Options:
|
|
|
376
281
|
-c, --cwd Working directory for agent
|
|
377
282
|
-t, --timeout Request timeout in ms (overrides schema default)
|
|
378
283
|
-j, --concurrency Number of concurrent workers (default: 1)
|
|
284
|
+
--stdin Read prompts from stdin (mutually exclusive with file arg)
|
|
379
285
|
--workspace-dir Base directory for per-prompt workspace isolation
|
|
380
286
|
--progress Show progress to stderr
|
|
381
287
|
--append Append to output file instead of overwriting
|
|
@@ -401,6 +307,11 @@ Parallelization:
|
|
|
401
307
|
Each prompt gets its own agent session for isolation.
|
|
402
308
|
Results are written as they complete (order may differ from input).
|
|
403
309
|
|
|
310
|
+
Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
|
|
311
|
+
at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
|
|
312
|
+
In memory-constrained environments (Docker, CI) this can cause OOM kills.
|
|
313
|
+
Use --stdin to pipe prompts for container-level orchestration.
|
|
314
|
+
|
|
404
315
|
Workspace Isolation:
|
|
405
316
|
Use --workspace-dir to create per-prompt directories.
|
|
406
317
|
Each prompt runs in {workspace-dir}/prompt-{id}/.
|
|
@@ -422,13 +333,24 @@ Examples:
|
|
|
422
333
|
|
|
423
334
|
# With debug mode
|
|
424
335
|
agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
|
|
336
|
+
|
|
337
|
+
# Read prompts from stdin (container orchestration)
|
|
338
|
+
cat prompts.jsonl | agent-eval-harness capture --stdin -s claude.json -o results.jsonl
|
|
425
339
|
`)
|
|
426
340
|
return
|
|
427
341
|
}
|
|
428
342
|
|
|
429
343
|
const promptsPath = positionals[0]
|
|
430
|
-
|
|
431
|
-
|
|
344
|
+
const useStdin = values.stdin ?? false
|
|
345
|
+
|
|
346
|
+
// Mutual exclusivity: --stdin and positional file
|
|
347
|
+
if (useStdin && promptsPath) {
|
|
348
|
+
console.error('Error: --stdin and prompts file argument are mutually exclusive')
|
|
349
|
+
process.exit(1)
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
if (!useStdin && !promptsPath) {
|
|
353
|
+
console.error('Error: prompts.jsonl path is required (or use --stdin)')
|
|
432
354
|
process.exit(1)
|
|
433
355
|
}
|
|
434
356
|
|
|
@@ -438,30 +360,23 @@ Examples:
|
|
|
438
360
|
process.exit(1)
|
|
439
361
|
}
|
|
440
362
|
|
|
441
|
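-
// Load grader if specified
|
|
442
|
-
let grader: Grader | undefined
|
|
443
|
-
if (values.grader) {
|
|
444
|
-
try {
|
|
445
|
-
grader = await loadGrader(values.grader)
|
|
446
|
-
} catch (error) {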
|
|
447
|
-
console.error(`Error: ${error instanceof Error ? error.message : error}`)
|
|
363
|
+
// Read prompts from stdin if requested
|
|
364
|
+
let prompts: PromptCase[] | undefined
|
|
365
|
+
if (useStdin) {
|
|
366
|
+
const stdinPrompts = await readStdinPrompts()
|
|
367
|
+
if (!stdinPrompts || stdinPrompts.length === 0) {
|
|
368
|
+
console.error('Error: no prompts received on stdin')
|
|
448
369
|
process.exit(1)
|
|
449
370
|
}
|
|
371
|
+
prompts = stdinPrompts
|
|
450
372
|
}
|
|
451
373
|
|
|
452
|
-
//
|
|
453
|
-
|
|
454
|
-
if (values.concurrency) {
|
|
455
|
-
const parsed = Number.parseInt(values.concurrency, 10)
|
|
456
|
-
if (Number.isNaN(parsed) || parsed < 1) {
|
|
457
|
-
console.error('Error: --concurrency must be a positive integer')
|
|
458
|
-
process.exit(1)
|
|
459
|
-
}
|
|
460
|
-
concurrency = parsed
|
|
461
|
-
}
|
|
374
|
+
// Load grader if specified
|
|
375
|
+
const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
|
|
462
376
|
|
|
463
377
|
await runCapture({
|
|
464
|
-
promptsPath,
|
|
378
|
+
promptsPath: promptsPath ?? undefined,
|
|
379
|
+
prompts,
|
|
465
380
|
schemaPath: values.schema,
|
|
466
381
|
outputPath: values.output,
|
|
467
382
|
cwd: values.cwd,
|
|
@@ -470,7 +385,7 @@ Examples:
|
|
|
470
385
|
append: values.append ?? false,
|
|
471
386
|
grader,
|
|
472
387
|
debug: values.debug ?? false,
|
|
473
|
-
concurrency,
|
|
388
|
+
concurrency: parseConcurrency(values.concurrency),
|
|
474
389
|
workspaceDir: values['workspace-dir'],
|
|
475
390
|
})
|
|
476
391
|
}
|
|
package/src/commands/execution.ts
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared execution utilities for capture and trials commands.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Extracts common setup logic: schema loading, prompt loading, path resolution,
|
|
6
|
+
* session manager creation, output initialization, and worker pool execution.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { mkdir } from 'node:fs/promises'
|
|
12
|
+
import { createWriteMutex, loadPrompts, logProgress, resolvePath, runWorkerPool, writeOutput } from '../core.ts'
|
|
13
|
+
import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
|
|
14
|
+
import { createSessionManager, type SessionManager } from '../headless/headless-session-manager.ts'
|
|
15
|
+
import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
|
|
16
|
+
import type { Grader, PromptCase } from '../schemas.ts'
|
|
17
|
+
|
|
18
|
+
// ============================================================================
|
|
19
|
+
// Types
|
|
20
|
+
// ============================================================================
|
|
21
|
+
|
|
22
|
+
/** Base configuration shared by capture and trials commands */
|
|
23
|
+
export type BaseExecutionConfig = {
|
|
24
|
+
/** Path to prompts.jsonl file (required unless prompts provided) */
|
|
25
|
+
promptsPath?: string
|
|
26
|
+
/** Path to agent schema JSON file */
|
|
27
|
+
schemaPath: string
|
|
28
|
+
/** Pre-loaded prompt cases (from stdin); skips file loading when set */
|
|
29
|
+
prompts?: PromptCase[]
|
|
30
|
+
/** Output file path (undefined for stdout) */
|
|
31
|
+
outputPath?: string
|
|
32
|
+
/** Working directory for agent */
|
|
33
|
+
cwd?: string
|
|
34
|
+
/** Timeout per prompt in milliseconds (overrides schema default) */
|
|
35
|
+
timeout?: number
|
|
36
|
+
/** Show progress to stderr */
|
|
37
|
+
progress?: boolean
|
|
38
|
+
/** Append to output file instead of overwriting */
|
|
39
|
+
append?: boolean
|
|
40
|
+
/** Optional grader function */
|
|
41
|
+
grader?: Grader
|
|
42
|
+
/** Enable debug mode */
|
|
43
|
+
debug?: boolean
|
|
44
|
+
/** Number of concurrent workers (default: 1 for sequential) */
|
|
45
|
+
concurrency?: number
|
|
46
|
+
/** Base directory for per-prompt workspace isolation */
|
|
47
|
+
workspaceDir?: string
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/** Prepared execution context returned by prepareExecution */
|
|
51
|
+
export type ExecutionContext = {
|
|
52
|
+
/** Parsed and validated headless adapter schema */
|
|
53
|
+
schema: HeadlessAdapterConfig
|
|
54
|
+
/** Loaded and validated prompt cases */
|
|
55
|
+
prompts: PromptCase[]
|
|
56
|
+
/** Session manager for creating/destroying agent sessions */
|
|
57
|
+
sessions: SessionManager
|
|
58
|
+
/** Resolved absolute output path (undefined for stdout) */
|
|
59
|
+
resolvedOutputPath?: string
|
|
60
|
+
/** Resolved absolute workspace directory path */
|
|
61
|
+
resolvedWorkspaceDir?: string
|
|
62
|
+
/** Effective timeout in milliseconds */
|
|
63
|
+
effectiveTimeout: number
|
|
64
|
+
/** Default working directory for agent sessions */
|
|
65
|
+
defaultWorkingDir: string
|
|
66
|
+
/** Number of concurrent workers */
|
|
67
|
+
concurrency: number
|
|
68
|
+
/** Whether to show progress output */
|
|
69
|
+
progress: boolean
|
|
70
|
+
/** Optional grader function */
|
|
71
|
+
grader?: Grader
|
|
72
|
+
/** Whether debug mode is enabled */
|
|
73
|
+
debug: boolean
|
|
74
|
+
/** Write a result object as JSONL, coordinated via mutex */
|
|
75
|
+
writeResult: (result: unknown) => Promise<void>
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
// ============================================================================
|
|
79
|
+
// Execution Setup
|
|
80
|
+
// ============================================================================
|
|
81
|
+
|
|
82
|
+
/**
|
|
83
|
+
* Prepare execution context from base configuration.
|
|
84
|
+
*
|
|
85
|
+
* @remarks
|
|
86
|
+
* Handles all shared setup: schema loading/validation, prompt loading,
|
|
87
|
+
* path resolution, session manager creation, output file initialization,
|
|
88
|
+
* workspace directory creation, and write mutex coordination.
|
|
89
|
+
*
|
|
90
|
+
* @param config - Base execution configuration
|
|
91
|
+
* @returns Prepared execution context
|
|
92
|
+
* @throws Error if schema file not found, invalid, or prompts missing
|
|
93
|
+
*
|
|
94
|
+
* @public
|
|
95
|
+
*/
|
|
96
|
+
export const prepareExecution = async (config: BaseExecutionConfig): Promise<ExecutionContext> => {
|
|
97
|
+
const {
|
|
98
|
+
promptsPath,
|
|
99
|
+
schemaPath,
|
|
100
|
+
outputPath,
|
|
101
|
+
cwd,
|
|
102
|
+
timeout,
|
|
103
|
+
progress = false,
|
|
104
|
+
append = false,
|
|
105
|
+
grader,
|
|
106
|
+
debug = false,
|
|
107
|
+
concurrency = 1,
|
|
108
|
+
workspaceDir,
|
|
109
|
+
} = config
|
|
110
|
+
|
|
111
|
+
// Validate prompt source
|
|
112
|
+
if (!config.prompts && !promptsPath) {
|
|
113
|
+
throw new Error('Either promptsPath or prompts must be provided')
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
// Load and validate schema
|
|
117
|
+
const schemaFile = Bun.file(schemaPath)
|
|
118
|
+
if (!(await schemaFile.exists())) {
|
|
119
|
+
throw new Error(`Schema file not found: ${schemaPath}`)
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
let schema: HeadlessAdapterConfig
|
|
123
|
+
try {
|
|
124
|
+
const rawSchema = await schemaFile.json()
|
|
125
|
+
schema = parseHeadlessConfig(rawSchema)
|
|
126
|
+
} catch (error) {
|
|
127
|
+
throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
// Load prompts
|
|
131
|
+
const prompts = config.prompts ?? (await loadPrompts(promptsPath!))
|
|
132
|
+
|
|
133
|
+
// Resolve paths
|
|
134
|
+
const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
|
|
135
|
+
const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
|
|
136
|
+
|
|
137
|
+
// Determine effective timeout (CLI flag > schema default > harness default)
|
|
138
|
+
const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
|
|
139
|
+
const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
|
|
140
|
+
|
|
141
|
+
// Create session manager
|
|
142
|
+
const sessions = createSessionManager({
|
|
143
|
+
schema,
|
|
144
|
+
timeout: effectiveTimeout,
|
|
145
|
+
verbose: progress,
|
|
146
|
+
debug,
|
|
147
|
+
})
|
|
148
|
+
|
|
149
|
+
// Initialize output file (clear if not appending)
|
|
150
|
+
if (resolvedOutputPath && !append) {
|
|
151
|
+
await Bun.write(resolvedOutputPath, '')
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
// Create workspace base directory if specified
|
|
155
|
+
if (resolvedWorkspaceDir) {
|
|
156
|
+
await mkdir(resolvedWorkspaceDir, { recursive: true })
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
const defaultWorkingDir = cwd ?? process.cwd()
|
|
160
|
+
|
|
161
|
+
// Create write mutex with closure for coordinated result writing
|
|
162
|
+
const writeMutex = createWriteMutex()
|
|
163
|
+
let isFirstOutput = true
|
|
164
|
+
|
|
165
|
+
const writeResult = async (result: unknown) => {
|
|
166
|
+
await writeMutex.write(async () => {
|
|
167
|
+
const formatted = JSON.stringify(result)
|
|
168
|
+
await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
|
|
169
|
+
isFirstOutput = false
|
|
170
|
+
})
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
return {
|
|
174
|
+
schema,
|
|
175
|
+
prompts,
|
|
176
|
+
sessions,
|
|
177
|
+
resolvedOutputPath,
|
|
178
|
+
resolvedWorkspaceDir,
|
|
179
|
+
effectiveTimeout,
|
|
180
|
+
defaultWorkingDir,
|
|
181
|
+
concurrency,
|
|
182
|
+
progress,
|
|
183
|
+
grader,
|
|
184
|
+
debug,
|
|
185
|
+
writeResult,
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
// ============================================================================
|
|
190
|
+
// Worker Pool Execution
|
|
191
|
+
// ============================================================================
|
|
192
|
+
|
|
193
|
+
/**
|
|
194
|
+
* Execute prompts through a worker pool with progress logging.
|
|
195
|
+
*
|
|
196
|
+
* @remarks
|
|
197
|
+
* Common wrapper for the runWorkerPool pattern used by both capture and trials.
|
|
198
|
+
* Handles progress callbacks, error logging, and completion logging.
|
|
199
|
+
*
|
|
200
|
+
* @param ctx - Execution context from prepareExecution
|
|
201
|
+
* @param processFn - Function to process each prompt
|
|
202
|
+
* @returns Array of results
|
|
203
|
+
*
|
|
204
|
+
* @public
|
|
205
|
+
*/
|
|
206
|
+
export const executePrompts = async <T>(
|
|
207
|
+
ctx: ExecutionContext,
|
|
208
|
+
processFn: (promptCase: PromptCase, index: number) => Promise<T>,
|
|
209
|
+
): Promise<T[]> => {
|
|
210
|
+
const { results, errors } = await runWorkerPool(ctx.prompts, processFn, {
|
|
211
|
+
concurrency: ctx.concurrency,
|
|
212
|
+
onProgress: (completed, total) => {
|
|
213
|
+
logProgress(`Progress: ${completed}/${total} prompts completed`, ctx.progress)
|
|
214
|
+
},
|
|
215
|
+
})
|
|
216
|
+
|
|
217
|
+
if (errors.length > 0) {
|
|
218
|
+
logProgress(`Completed with ${errors.length} error(s)`, ctx.progress)
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
logProgress('Done!', ctx.progress)
|
|
222
|
+
return results
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// ============================================================================
|
|
226
|
+
// CLI Helpers
|
|
227
|
+
// ============================================================================
|
|
228
|
+
|
|
229
|
+
/**
|
|
230
|
+
* Parse and validate concurrency CLI argument.
|
|
231
|
+
*
|
|
232
|
+
* @param value - Raw string value from parseArgs
|
|
233
|
+
* @returns Validated positive integer (default: 1)
|
|
234
|
+
*
|
|
235
|
+
* @public
|
|
236
|
+
*/
|
|
237
|
+
export const parseConcurrency = (value: string | undefined): number => {
|
|
238
|
+
if (!value) return 1
|
|
239
|
+
const parsed = Number.parseInt(value, 10)
|
|
240
|
+
if (Number.isNaN(parsed) || parsed < 1) {
|
|
241
|
+
console.error('Error: --concurrency must be a positive integer')
|
|
242
|
+
process.exit(1)
|
|
243
|
+
}
|
|
244
|
+
return parsed
|
|
245
|
+
}
|
|
package/src/commands/tests/capture-cli.spec.ts
CHANGED
|
@@ -142,6 +142,16 @@ describe('runCapture configuration', () => {
|
|
|
142
142
|
expect(config.concurrency).toBeUndefined()
|
|
143
143
|
expect(config.workspaceDir).toBeUndefined()
|
|
144
144
|
})
|
|
145
|
+
|
|
146
|
+
test('CaptureConfig accepts prompts without promptsPath', () => {
|
|
147
|
+
const config: CaptureConfig = {
|
|
148
|
+
schemaPath: './test-schema.json',
|
|
149
|
+
prompts: [{ id: 't1', input: 'hello' }],
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
expect(config.promptsPath).toBeUndefined()
|
|
153
|
+
expect(config.prompts).toHaveLength(1)
|
|
154
|
+
})
|
|
145
155
|
})
|
|
146
156
|
|
|
147
157
|
// ============================================================================
|
|
@@ -168,6 +178,23 @@ describe('capture CLI', () => {
|
|
|
168
178
|
expect(stdout).toContain('-s, --schema')
|
|
169
179
|
expect(stdout).toContain('-j, --concurrency')
|
|
170
180
|
expect(stdout).toContain('--workspace-dir')
|
|
181
|
+
expect(stdout).toContain('--stdin')
|
|
182
|
+
})
|
|
183
|
+
|
|
184
|
+
test('shows error for --stdin with positional file', async () => {
|
|
185
|
+
const proc = Bun.spawn(
|
|
186
|
+
['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
|
|
187
|
+
{
|
|
188
|
+
stdout: 'pipe',
|
|
189
|
+
stderr: 'pipe',
|
|
190
|
+
},
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
const stderr = await new Response(proc.stderr).text()
|
|
194
|
+
const exitCode = await proc.exited
|
|
195
|
+
|
|
196
|
+
expect(exitCode).not.toBe(0)
|
|
197
|
+
expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
|
|
171
198
|
})
|
|
172
199
|
|
|
173
200
|
test('shows error for missing prompts file argument', async () => {
|
|
package/src/commands/tests/trials-cli.spec.ts
CHANGED
|
@@ -44,6 +44,17 @@ describe('TrialsConfig configuration', () => {
|
|
|
44
44
|
expect(config.concurrency).toBeUndefined()
|
|
45
45
|
expect(config.workspaceDir).toBeUndefined()
|
|
46
46
|
})
|
|
47
|
+
|
|
48
|
+
test('TrialsConfig accepts prompts without promptsPath', () => {
|
|
49
|
+
const config: TrialsConfig = {
|
|
50
|
+
schemaPath: './test-schema.json',
|
|
51
|
+
k: 3,
|
|
52
|
+
prompts: [{ id: 't1', input: 'hello' }],
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
expect(config.promptsPath).toBeUndefined()
|
|
56
|
+
expect(config.prompts).toHaveLength(1)
|
|
57
|
+
})
|
|
47
58
|
})
|
|
48
59
|
|
|
49
60
|
// ============================================================================
|
|
@@ -72,6 +83,23 @@ describe('trials CLI', () => {
|
|
|
72
83
|
expect(stdout).toContain('pass@k')
|
|
73
84
|
expect(stdout).toContain('-j, --concurrency')
|
|
74
85
|
expect(stdout).toContain('--workspace-dir')
|
|
86
|
+
expect(stdout).toContain('--stdin')
|
|
87
|
+
})
|
|
88
|
+
|
|
89
|
+
test('shows error for --stdin with positional file', async () => {
|
|
90
|
+
const proc = Bun.spawn(
|
|
91
|
+
['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
|
|
92
|
+
{
|
|
93
|
+
stdout: 'pipe',
|
|
94
|
+
stderr: 'pipe',
|
|
95
|
+
},
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
const stderr = await new Response(proc.stderr).text()
|
|
99
|
+
const exitCode = await proc.exited
|
|
100
|
+
|
|
101
|
+
expect(exitCode).not.toBe(0)
|
|
102
|
+
expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
|
|
75
103
|
})
|
|
76
104
|
|
|
77
105
|
test('shows error for missing prompts file argument', async () => {
|
package/src/commands/trials.ts
CHANGED
|
@@ -11,25 +11,13 @@
|
|
|
11
11
|
* @packageDocumentation
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
-
import { mkdir } from 'node:fs/promises'
|
|
15
14
|
import { parseArgs } from 'node:util'
|
|
16
|
-
import {
|
|
17
|
-
createWorkspaceDir,
|
|
18
|
-
createWriteMutex,
|
|
19
|
-
extractOutput,
|
|
20
|
-
extractTrajectory,
|
|
21
|
-
loadPrompts,
|
|
22
|
-
logProgress,
|
|
23
|
-
resolvePath,
|
|
24
|
-
runWorkerPool,
|
|
25
|
-
writeOutput,
|
|
26
|
-
} from '../core.ts'
|
|
27
|
-
import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
|
|
15
|
+
import { createWorkspaceDir, extractOutput, extractTrajectory, logProgress, readStdinPrompts } from '../core.ts'
|
|
28
16
|
import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
|
|
29
|
-
import {
|
|
30
|
-
import {
|
|
31
|
-
import {
|
|
32
|
-
import type
|
|
17
|
+
import { DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
|
|
18
|
+
import { loadGraderOrExit } from '../schemas/grader-loader.ts'
|
|
19
|
+
import type { PromptCase, TrialEntry, TrialResult } from '../schemas.ts'
|
|
20
|
+
import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
|
|
33
21
|
|
|
34
22
|
// ============================================================================
|
|
35
23
|
// Pass@k/Pass^k Calculation
|
|
@@ -85,31 +73,9 @@ export const calculatePassExpK = (passes: number, k: number): number => {
|
|
|
85
73
|
// ============================================================================
|
|
86
74
|
|
|
87
75
|
/** Configuration for trials command */
|
|
88
|
-
export type TrialsConfig = {
|
|
89
|
-
/** Path to prompts.jsonl file */
|
|
90
|
-
promptsPath: string
|
|
91
|
-
/** Path to agent schema JSON file */
|
|
92
|
-
schemaPath: string
|
|
76
|
+
export type TrialsConfig = BaseExecutionConfig & {
|
|
93
77
|
/** Number of trials per prompt */
|
|
94
78
|
k: number
|
|
95
|
-
/** Output file path */
|
|
96
|
-
outputPath?: string
|
|
97
|
-
/** Working directory for agent */
|
|
98
|
-
cwd?: string
|
|
99
|
-
/** Timeout per prompt in milliseconds (overrides schema default) */
|
|
100
|
-
timeout?: number
|
|
101
|
-
/** Show progress to stderr */
|
|
102
|
-
progress?: boolean
|
|
103
|
-
/** Append to output file */
|
|
104
|
-
append?: boolean
|
|
105
|
-
/** Optional grader function */
|
|
106
|
-
grader?: Grader
|
|
107
|
-
/** Enable debug mode */
|
|
108
|
-
debug?: boolean
|
|
109
|
-
/** Number of concurrent workers (default: 1 for sequential) */
|
|
110
|
-
concurrency?: number
|
|
111
|
-
/** Base directory for per-prompt workspace isolation */
|
|
112
|
-
workspaceDir?: string
|
|
113
79
|
}
|
|
114
80
|
|
|
115
81
|
// ============================================================================
|
|
@@ -123,53 +89,17 @@ export type TrialsConfig = {
|
|
|
123
89
|
* @returns Array of trial results
|
|
124
90
|
*/
|
|
125
91
|
export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
|
|
126
|
-
const {
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
k,
|
|
130
|
-
outputPath,
|
|
131
|
-
cwd,
|
|
132
|
-
timeout,
|
|
133
|
-
progress = false,
|
|
134
|
-
append = false,
|
|
135
|
-
grader,
|
|
136
|
-
debug = false,
|
|
137
|
-
concurrency = 1,
|
|
138
|
-
workspaceDir,
|
|
139
|
-
} = config
|
|
140
|
-
|
|
141
|
-
// Load and validate schema
|
|
142
|
-
const schemaFile = Bun.file(schemaPath)
|
|
143
|
-
if (!(await schemaFile.exists())) {
|
|
144
|
-
throw new Error(`Schema file not found: ${schemaPath}`)
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
let schema: HeadlessAdapterConfig
|
|
148
|
-
try {
|
|
149
|
-
const rawSchema = await schemaFile.json()
|
|
150
|
-
schema = parseHeadlessConfig(rawSchema)
|
|
151
|
-
} catch (error) {
|
|
152
|
-
throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
// Load prompts
|
|
156
|
-
const prompts = await loadPrompts(promptsPath)
|
|
157
|
-
|
|
158
|
-
// Resolve paths
|
|
159
|
-
const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
|
|
160
|
-
const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
|
|
161
|
-
|
|
162
|
-
// Determine effective timeout (CLI flag > schema default > harness default)
|
|
163
|
-
const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
|
|
164
|
-
const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
|
|
92
|
+
const { k } = config
|
|
93
|
+
const ctx = await prepareExecution(config)
|
|
94
|
+
const { schema, prompts, sessions, resolvedWorkspaceDir, defaultWorkingDir, progress, grader } = ctx
|
|
165
95
|
|
|
166
96
|
// Log progress info
|
|
167
|
-
logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
|
|
97
|
+
logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
|
|
168
98
|
logProgress(`Running ${k} trials per prompt (${prompts.length * k} total executions)`, progress)
|
|
169
|
-
logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
|
|
170
|
-
logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
|
|
171
|
-
if (concurrency > 1) {
|
|
172
|
-
logProgress(`Concurrency: ${concurrency} workers`, progress)
|
|
99
|
+
logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
|
|
100
|
+
logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
|
|
101
|
+
if (ctx.concurrency > 1) {
|
|
102
|
+
logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
|
|
173
103
|
}
|
|
174
104
|
if (resolvedWorkspaceDir) {
|
|
175
105
|
logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
|
|
@@ -178,31 +108,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
178
108
|
logProgress('Grader: enabled (will compute pass@k metrics)', progress)
|
|
179
109
|
}
|
|
180
110
|
|
|
181
|
-
// Create session manager with schema
|
|
182
|
-
const sessions = createSessionManager({
|
|
183
|
-
schema,
|
|
184
|
-
timeout: effectiveTimeout,
|
|
185
|
-
verbose: progress,
|
|
186
|
-
debug,
|
|
187
|
-
})
|
|
188
|
-
|
|
189
|
-
// Clear output file if not appending
|
|
190
|
-
if (resolvedOutputPath && !append) {
|
|
191
|
-
await Bun.write(resolvedOutputPath, '')
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
// Create workspace base directory if specified
|
|
195
|
-
// Uses fs.mkdir instead of shell to prevent command injection
|
|
196
|
-
if (resolvedWorkspaceDir) {
|
|
197
|
-
await mkdir(resolvedWorkspaceDir, { recursive: true })
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
const defaultWorkingDir = cwd ?? process.cwd()
|
|
201
|
-
|
|
202
|
-
// Create write mutex for coordinating JSONL output
|
|
203
|
-
const writeMutex = createWriteMutex()
|
|
204
|
-
let isFirstOutput = true
|
|
205
|
-
|
|
206
111
|
// Process all trials for a single prompt
|
|
207
112
|
const processPromptTrials = async (promptCase: (typeof prompts)[number], index: number): Promise<TrialResult> => {
|
|
208
113
|
logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
|
|
@@ -308,11 +213,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
308
213
|
}
|
|
309
214
|
|
|
310
215
|
// Write result immediately (coordinated via mutex for concurrent writes)
|
|
311
|
-
await
|
|
312
|
-
const formatted = JSON.stringify(result)
|
|
313
|
-
await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
|
|
314
|
-
isFirstOutput = false
|
|
315
|
-
})
|
|
216
|
+
await ctx.writeResult(result)
|
|
316
217
|
|
|
317
218
|
if (grader) {
|
|
318
219
|
logProgress(
|
|
@@ -325,20 +226,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
325
226
|
}
|
|
326
227
|
|
|
327
228
|
// Run with worker pool (parallelizes across prompts, trials for each prompt run sequentially)
|
|
328
|
-
|
|
329
|
-
concurrency,
|
|
330
|
-
onProgress: (completed, total) => {
|
|
331
|
-
logProgress(`Progress: ${completed}/${total} prompts completed`, progress)
|
|
332
|
-
},
|
|
333
|
-
})
|
|
334
|
-
|
|
335
|
-
// Log any errors that occurred
|
|
336
|
-
if (errors.length > 0) {
|
|
337
|
-
logProgress(`Completed with ${errors.length} error(s)`, progress)
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
logProgress('Done!', progress)
|
|
341
|
-
return results
|
|
229
|
+
return executePrompts(ctx, processPromptTrials)
|
|
342
230
|
}
|
|
343
231
|
|
|
344
232
|
// ============================================================================
|
|
@@ -363,6 +251,7 @@ export const trials = async (args: string[]): Promise<void> => {
|
|
|
363
251
|
append: { type: 'boolean', default: false },
|
|
364
252
|
grader: { type: 'string', short: 'g' },
|
|
365
253
|
debug: { type: 'boolean', default: false },
|
|
254
|
+
stdin: { type: 'boolean', default: false },
|
|
366
255
|
concurrency: { type: 'string', short: 'j' },
|
|
367
256
|
'workspace-dir': { type: 'string' },
|
|
368
257
|
help: { type: 'boolean', short: 'h' },
|
|
@@ -373,6 +262,7 @@ export const trials = async (args: string[]): Promise<void> => {
|
|
|
373
262
|
if (values.help) {
|
|
374
263
|
console.log(`
|
|
375
264
|
Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]
|
|
265
|
+
cat prompts.jsonl | agent-eval-harness trials --stdin --schema <schema.json> [options]
|
|
376
266
|
|
|
377
267
|
Arguments:
|
|
378
268
|
prompts.jsonl Input file with evaluation prompts
|
|
@@ -384,6 +274,7 @@ Options:
|
|
|
384
274
|
-c, --cwd Working directory for agent
|
|
385
275
|
-t, --timeout Request timeout in ms (overrides schema default)
|
|
386
276
|
-j, --concurrency Number of concurrent workers (default: 1)
|
|
277
|
+
--stdin Read prompts from stdin (mutually exclusive with file arg)
|
|
387
278
|
--workspace-dir Base directory for per-trial workspace isolation
|
|
388
279
|
--progress Show progress to stderr
|
|
389
280
|
--append Append to output file
|
|
@@ -404,6 +295,11 @@ Parallelization:
|
|
|
404
295
|
Each prompt's k trials still run sequentially (required for aggregation).
|
|
405
296
|
With 151 prompts and -j 4, you get 4 prompts running trials concurrently.
|
|
406
297
|
|
|
298
|
+
Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
|
|
299
|
+
at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
|
|
300
|
+
In memory-constrained environments (Docker, CI) this can cause OOM kills.
|
|
301
|
+
Use --stdin to pipe prompts for container-level orchestration.
|
|
302
|
+
|
|
407
303
|
Workspace Isolation:
|
|
408
304
|
Use --workspace-dir to create per-trial directories.
|
|
409
305
|
Each trial runs in {workspace-dir}/prompt-{id}-trial-{n}/.
|
|
@@ -422,13 +318,24 @@ Examples:
|
|
|
422
318
|
|
|
423
319
|
# With TypeScript grader
|
|
424
320
|
agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl
|
|
321
|
+
|
|
322
|
+
# Read prompts from stdin (container orchestration)
|
|
323
|
+
cat prompts.jsonl | agent-eval-harness trials --stdin -s claude.json -k 5 -o trials.jsonl
|
|
425
324
|
`)
|
|
426
325
|
return
|
|
427
326
|
}
|
|
428
327
|
|
|
429
328
|
const promptsPath = positionals[0]
|
|
430
|
-
|
|
431
|
-
|
|
329
|
+
const useStdin = values.stdin ?? false
|
|
330
|
+
|
|
331
|
+
// Mutual exclusivity: --stdin and positional file
|
|
332
|
+
if (useStdin && promptsPath) {
|
|
333
|
+
console.error('Error: --stdin and prompts file argument are mutually exclusive')
|
|
334
|
+
process.exit(1)
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
if (!useStdin && !promptsPath) {
|
|
338
|
+
console.error('Error: prompts.jsonl path is required (or use --stdin)')
|
|
432
339
|
process.exit(1)
|
|
433
340
|
}
|
|
434
341
|
|
|
@@ -438,30 +345,23 @@ Examples:
|
|
|
438
345
|
process.exit(1)
|
|
439
346
|
}
|
|
440
347
|
|
|
441
|
-
//
|
|
442
|
-
let
|
|
443
|
-
if (
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
console.error(`Error: ${error instanceof Error ? error.message : error}`)
|
|
348
|
+
// Read prompts from stdin if requested
|
|
349
|
+
let prompts: PromptCase[] | undefined
|
|
350
|
+
if (useStdin) {
|
|
351
|
+
const stdinPrompts = await readStdinPrompts()
|
|
352
|
+
if (!stdinPrompts || stdinPrompts.length === 0) {
|
|
353
|
+
console.error('Error: no prompts received on stdin')
|
|
448
354
|
process.exit(1)
|
|
449
355
|
}
|
|
356
|
+
prompts = stdinPrompts
|
|
450
357
|
}
|
|
451
358
|
|
|
452
|
-
//
|
|
453
|
-
|
|
454
|
-
if (values.concurrency) {
|
|
455
|
-
const parsed = Number.parseInt(values.concurrency, 10)
|
|
456
|
-
if (Number.isNaN(parsed) || parsed < 1) {
|
|
457
|
-
console.error('Error: --concurrency must be a positive integer')
|
|
458
|
-
process.exit(1)
|
|
459
|
-
}
|
|
460
|
-
concurrency = parsed
|
|
461
|
-
}
|
|
359
|
+
// Load grader if specified
|
|
360
|
+
const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
|
|
462
361
|
|
|
463
362
|
await runTrials({
|
|
464
|
-
promptsPath,
|
|
363
|
+
promptsPath: promptsPath ?? undefined,
|
|
364
|
+
prompts,
|
|
465
365
|
schemaPath: values.schema,
|
|
466
366
|
k: Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10),
|
|
467
367
|
outputPath: values.output,
|
|
@@ -471,7 +371,7 @@ Examples:
|
|
|
471
371
|
append: values.append ?? false,
|
|
472
372
|
grader,
|
|
473
373
|
debug: values.debug ?? false,
|
|
474
|
-
concurrency,
|
|
374
|
+
concurrency: parseConcurrency(values.concurrency),
|
|
475
375
|
workspaceDir: values['workspace-dir'],
|
|
476
376
|
})
|
|
477
377
|
}
|
|
@@ -9,9 +9,9 @@
|
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
11
|
import { parseArgs } from 'node:util'
|
|
12
|
-
import {
|
|
12
|
+
import { loadPrompts, resolvePath } from '../core.ts'
|
|
13
|
+
import { loadGraderOrExit } from '../schemas/grader-loader.ts'
|
|
13
14
|
import type { Grader, ValidationResult } from '../schemas.ts'
|
|
14
|
-
import { loadPrompts } from './capture.ts'
|
|
15
15
|
|
|
16
16
|
// ============================================================================
|
|
17
17
|
// Types
|
|
@@ -27,16 +27,6 @@ export type ValidateRefsConfig = {
|
|
|
27
27
|
grader: Grader
|
|
28
28
|
}
|
|
29
29
|
|
|
30
|
-
// ============================================================================
|
|
31
|
-
// Helpers
|
|
32
|
-
// ============================================================================
|
|
33
|
-
|
|
34
|
-
/** Resolve path relative to process.cwd() */
|
|
35
|
-
const resolvePath = (path: string): string => {
|
|
36
|
-
if (path.startsWith('/')) return path
|
|
37
|
-
return `${process.cwd()}/${path}`
|
|
38
|
-
}
|
|
39
|
-
|
|
40
30
|
// ============================================================================
|
|
41
31
|
// Validate-Refs Implementation
|
|
42
32
|
// ============================================================================
|
|
@@ -171,13 +161,7 @@ Examples:
|
|
|
171
161
|
}
|
|
172
162
|
|
|
173
163
|
// Load grader
|
|
174
|
-
|
|
175
|
-
try {
|
|
176
|
-
grader = await loadGrader(values.grader)
|
|
177
|
-
} catch (error) {
|
|
178
|
-
console.error(`Error: ${error instanceof Error ? error.message : error}`)
|
|
179
|
-
process.exit(1)
|
|
180
|
-
}
|
|
164
|
+
const grader = await loadGraderOrExit(values.grader)
|
|
181
165
|
|
|
182
166
|
await runValidateRefs({
|
|
183
167
|
promptsPath,
|
package/src/core/core.ts
CHANGED
|
@@ -11,7 +11,15 @@
|
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
13
|
// Loading utilities
|
|
14
|
-
export {
|
|
14
|
+
export {
|
|
15
|
+
buildResultsIndex,
|
|
16
|
+
countLines,
|
|
17
|
+
loadJsonl,
|
|
18
|
+
loadPrompts,
|
|
19
|
+
loadResults,
|
|
20
|
+
readStdinPrompts,
|
|
21
|
+
streamResults,
|
|
22
|
+
} from './loading.ts'
|
|
15
23
|
// Output utilities
|
|
16
24
|
export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
|
|
17
25
|
// Native streaming utilities
|
package/src/core/loading.ts
CHANGED
|
@@ -39,6 +39,44 @@ export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
|
|
|
39
39
|
})
|
|
40
40
|
}
|
|
41
41
|
|
|
42
|
+
/**
|
|
43
|
+
* Read prompts from stdin as JSONL.
|
|
44
|
+
*
|
|
45
|
+
* @remarks
|
|
46
|
+
* Reads all data from stdin, parses each line as JSON, and validates against
|
|
47
|
+
* PromptCaseSchema. Returns null when stdin is a TTY (no piped input).
|
|
48
|
+
* Uses chunked Buffer reads matching the pattern in pipeline/run.ts.
|
|
49
|
+
*
|
|
50
|
+
* @returns Parsed and validated prompt cases, or null if stdin is a TTY
|
|
51
|
+
* @throws Error if any line is invalid JSON or fails schema validation
|
|
52
|
+
*
|
|
53
|
+
* @public
|
|
54
|
+
*/
|
|
55
|
+
export const readStdinPrompts = async (): Promise<PromptCase[] | null> => {
|
|
56
|
+
if (process.stdin.isTTY) {
|
|
57
|
+
return null
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
const chunks: Buffer[] = []
|
|
61
|
+
for await (const chunk of process.stdin) {
|
|
62
|
+
chunks.push(chunk)
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
const content = Buffer.concat(chunks).toString('utf-8').trim()
|
|
66
|
+
if (!content) return null
|
|
67
|
+
|
|
68
|
+
return content
|
|
69
|
+
.split('\n')
|
|
70
|
+
.filter(Boolean)
|
|
71
|
+
.map((line, index) => {
|
|
72
|
+
try {
|
|
73
|
+
return PromptCaseSchema.parse(JSON.parse(line))
|
|
74
|
+
} catch (error) {
|
|
75
|
+
throw new Error(`Invalid stdin prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
|
|
76
|
+
}
|
|
77
|
+
})
|
|
78
|
+
}
|
|
79
|
+
|
|
42
80
|
/**
|
|
43
81
|
* Load capture results from a JSONL file.
|
|
44
82
|
*
|
package/src/core.ts
CHANGED
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
* @packageDocumentation
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
|
+
import { resolvePath } from '../core.ts'
|
|
16
17
|
import type { Grader, TrajectoryStep } from './schemas.ts'
|
|
17
18
|
import { GraderResultSchema } from './schemas.ts'
|
|
18
19
|
|
|
@@ -30,12 +31,6 @@ const JS_EXTENSIONS = ['.ts', '.js', '.mjs', '.cjs']
|
|
|
30
31
|
/** Check if a file path is a JavaScript/TypeScript module */
|
|
31
32
|
const isJsModule = (path: string): boolean => JS_EXTENSIONS.some((ext) => path.endsWith(ext))
|
|
32
33
|
|
|
33
|
-
/** Resolve path relative to process.cwd() */
|
|
34
|
-
const resolvePath = (path: string): string => {
|
|
35
|
-
if (path.startsWith('/')) return path
|
|
36
|
-
return `${process.cwd()}/${path}`
|
|
37
|
-
}
|
|
38
|
-
|
|
39
34
|
// ============================================================================
|
|
40
35
|
// Executable Grader
|
|
41
36
|
// ============================================================================
|
|
@@ -169,6 +164,28 @@ const loadModuleGrader = async (modulePath: string): Promise<Grader> => {
|
|
|
169
164
|
* const grader = await loadGrader('./my-grader')
|
|
170
165
|
* ```
|
|
171
166
|
*/
|
|
167
|
+
/**
|
|
168
|
+
* Load a grader from a file path, exiting on failure.
|
|
169
|
+
*
|
|
170
|
+
* @remarks
|
|
171
|
+
* CLI-friendly wrapper around `loadGrader` that prints the error to stderr
|
|
172
|
+
* and calls `process.exit(1)` on failure. Eliminates the duplicated
|
|
173
|
+
* try/catch pattern across CLI handlers.
|
|
174
|
+
*
|
|
175
|
+
* @param graderPath - Path to the grader (relative or absolute)
|
|
176
|
+
* @returns Grader function (never returns on failure)
|
|
177
|
+
*
|
|
178
|
+
* @public
|
|
179
|
+
*/
|
|
180
|
+
export const loadGraderOrExit = async (graderPath: string): Promise<Grader> => {
|
|
181
|
+
try {
|
|
182
|
+
return await loadGrader(graderPath)
|
|
183
|
+
} catch (error) {
|
|
184
|
+
console.error(`Error: ${error instanceof Error ? error.message : error}`)
|
|
185
|
+
process.exit(1)
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
172
189
|
export const loadGrader = async (graderPath: string): Promise<Grader> => {
|
|
173
190
|
const resolvedPath = resolvePath(graderPath)
|
|
174
191
|
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
|
|
11
11
|
import { parseArgs } from 'node:util'
|
|
12
12
|
import { z } from 'zod'
|
|
13
|
+
import { resolvePath } from '../core.ts'
|
|
13
14
|
import * as schemas from './schemas.ts'
|
|
14
15
|
|
|
15
16
|
// ============================================================================
|
|
@@ -57,12 +58,6 @@ export type SchemasConfig = {
|
|
|
57
58
|
// Helpers
|
|
58
59
|
// ============================================================================
|
|
59
60
|
|
|
60
|
-
/** Resolve path relative to process.cwd() */
|
|
61
|
-
const resolvePath = (path: string): string => {
|
|
62
|
-
if (path.startsWith('/')) return path
|
|
63
|
-
return `${process.cwd()}/${path}`
|
|
64
|
-
}
|
|
65
|
-
|
|
66
61
|
/** Generate JSON Schema from Zod schema */
|
|
67
62
|
const toJsonSchema = (schema: z.ZodSchema, name: string): object => {
|
|
68
63
|
try {
|
package/src/schemas.ts
CHANGED
|
@@ -18,7 +18,7 @@ export {
|
|
|
18
18
|
TAIL_LINES,
|
|
19
19
|
} from './schemas/constants.ts'
|
|
20
20
|
// Grader loader
|
|
21
|
-
export { loadGrader } from './schemas/grader-loader.ts'
|
|
21
|
+
export { loadGrader, loadGraderOrExit } from './schemas/grader-loader.ts'
|
|
22
22
|
// Core session types
|
|
23
23
|
// JSON-RPC types (MCP compatibility)
|
|
24
24
|
// MCP server configuration
|