@plaited/agent-eval-harness 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -25,7 +25,7 @@ export ANTHROPIC_API_KEY=sk-... # For Claude
25
25
  export GEMINI_API_KEY=... # For Gemini
26
26
  ```
27
27
 
28
- Pre-built schemas are available in `.plaited/skills/headless-adapters/schemas/` for Claude and Gemini.
28
+ Pre-built schemas are available in `.agents/skills/headless-adapters/schemas/` for Claude and Gemini.
29
29
 
30
30
  ### Core Commands
31
31
 
@@ -98,11 +98,11 @@ bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparis
98
98
  **Install skills** for use with AI coding agents:
99
99
 
100
100
  ```bash
101
- curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- --agents <agent-name> --project agent-eval-harness
101
+ npx skills add plaited/agent-eval-harness
102
+ # or
103
+ bunx skills add plaited/agent-eval-harness
102
104
  ```
103
105
 
104
- Replace `<agent-name>` with your agent: `claude`, `cursor`, `copilot`, `opencode`, `amp`, `goose`, `factory`
105
-
106
106
  ### Available Skills
107
107
 
108
108
  #### Agent Eval Harness
@@ -416,7 +416,7 @@ ANTHROPIC_API_KEY=sk-... GEMINI_API_KEY=... \
416
416
  ## Requirements
417
417
 
418
418
  - **Runtime:** Bun >= 1.2.9
419
- - **Schema:** JSON schema describing CLI agent interaction (see `.plaited/skills/headless-adapters/schemas/`)
419
+ - **Schema:** JSON schema describing CLI agent interaction (see `.agents/skills/headless-adapters/schemas/`)
420
420
  - **API Key:** `ANTHROPIC_API_KEY` for Claude, `GEMINI_API_KEY` for Gemini
421
421
 
422
422
  ## License
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.10.0",
3
+ "version": "0.12.0",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -56,7 +56,7 @@
56
56
  ]
57
57
  },
58
58
  "dependencies": {
59
- "@plaited/development-skills": "0.6.5",
59
+ "@plaited/development-skills": "0.7.0",
60
60
  "zod": "^4.3.6"
61
61
  },
62
62
  "devDependencies": {
@@ -9,8 +9,8 @@
9
9
  */
10
10
 
11
11
  import { parseArgs } from 'node:util'
12
+ import { loadPrompts, resolvePath } from '../core.ts'
12
13
  import type { BalanceAnalysis, CategoryDistribution, PromptCase } from '../schemas.ts'
13
- import { loadPrompts } from './capture.ts'
14
14
 
15
15
  // ============================================================================
16
16
  // Types
@@ -28,16 +28,6 @@ export type BalanceConfig = {
28
28
  threshold?: number
29
29
  }
30
30
 
31
- // ============================================================================
32
- // Helpers
33
- // ============================================================================
34
-
35
- /** Resolve path relative to process.cwd() */
36
- const resolvePath = (path: string): string => {
37
- if (path.startsWith('/')) return path
38
- return `${process.cwd()}/${path}`
39
- }
40
-
41
31
  /**
42
32
  * Analyze category distribution across prompts.
43
33
  *
@@ -11,7 +11,7 @@
11
11
  import { parseArgs } from 'node:util'
12
12
  import { loadResults, resolvePath } from '../core.ts'
13
13
  import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from '../schemas/constants.ts'
14
- import { loadGrader } from '../schemas/grader-loader.ts'
14
+ import { loadGraderOrExit } from '../schemas/grader-loader.ts'
15
15
  import type { CalibrationSample, Grader, GraderResult, TrajectoryStep } from '../schemas.ts'
16
16
 
17
17
  // ============================================================================
@@ -293,15 +293,7 @@ Examples:
293
293
  }
294
294
 
295
295
  // Load grader if specified
296
- let grader: Grader | undefined
297
- if (values.grader) {
298
- try {
299
- grader = await loadGrader(values.grader)
300
- } catch (error) {
301
- console.error(`Error: ${error instanceof Error ? error.message : error}`)
302
- process.exit(1)
303
- }
304
- }
296
+ const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
305
297
 
306
298
  await runCalibrate({
307
299
  resultsPath,
@@ -11,28 +11,22 @@
11
11
  * @packageDocumentation
12
12
  */
13
13
 
14
- import { mkdir } from 'node:fs/promises'
15
14
  import { parseArgs } from 'node:util'
16
15
  import {
17
16
  createWorkspaceDir,
18
- createWriteMutex,
19
17
  detectTrajectoryRichness,
20
18
  extractOutput,
21
19
  extractTrajectory,
22
20
  getInputPreview,
23
21
  hasToolErrors,
24
- loadPrompts,
25
22
  logProgress,
26
- resolvePath,
27
- runWorkerPool,
28
- writeOutput,
23
+ readStdinPrompts,
29
24
  } from '../core.ts'
30
- import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
31
25
  import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
32
- import { createSessionManager, type ProcessExitInfo, type PromptResult } from '../headless/headless-session-manager.ts'
33
- import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
34
- import { loadGrader } from '../schemas/grader-loader.ts'
35
- import type { CaptureResult, Grader, TrajectoryRichness } from '../schemas.ts'
26
+ import type { ProcessExitInfo, PromptResult } from '../headless/headless-session-manager.ts'
27
+ import { loadGraderOrExit } from '../schemas/grader-loader.ts'
28
+ import type { CaptureResult, PromptCase, TrajectoryRichness } from '../schemas.ts'
29
+ import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
36
30
 
37
31
  // ============================================================================
38
32
  // Re-exports for backward compatibility
@@ -55,30 +49,7 @@ export {
55
49
  // ============================================================================
56
50
 
57
51
  /** Configuration for capture command */
58
- export type CaptureConfig = {
59
- /** Path to prompts.jsonl file */
60
- promptsPath: string
61
- /** Path to agent schema JSON file */
62
- schemaPath: string
63
- /** Output file path (undefined for stdout) */
64
- outputPath?: string
65
- /** Working directory for agent */
66
- cwd?: string
67
- /** Timeout per prompt in milliseconds (overrides schema default) */
68
- timeout?: number
69
- /** Show progress to stderr */
70
- progress?: boolean
71
- /** Append to output file instead of overwriting */
72
- append?: boolean
73
- /** Optional grader function */
74
- grader?: Grader
75
- /** Enable debug mode for detailed output */
76
- debug?: boolean
77
- /** Number of concurrent workers (default: 1 for sequential) */
78
- concurrency?: number
79
- /** Base directory for per-prompt workspace isolation */
80
- workspaceDir?: string
81
- }
52
+ export type CaptureConfig = BaseExecutionConfig
82
53
 
83
54
  // ============================================================================
84
55
  // Capture Implementation
@@ -95,51 +66,25 @@ export type CaptureConfig = {
95
66
  * @returns Array of capture results
96
67
  */
97
68
  export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
69
+ const ctx = await prepareExecution(config)
98
70
  const {
99
- promptsPath,
100
- schemaPath,
101
- outputPath,
102
- cwd,
103
- timeout,
104
- progress = false,
105
- append = false,
71
+ schema,
72
+ prompts,
73
+ sessions,
74
+ resolvedOutputPath,
75
+ resolvedWorkspaceDir,
76
+ defaultWorkingDir,
77
+ progress,
106
78
  grader,
107
- debug = false,
108
- concurrency = 1,
109
- workspaceDir,
110
- } = config
111
-
112
- // Load and validate schema
113
- const schemaFile = Bun.file(schemaPath)
114
- if (!(await schemaFile.exists())) {
115
- throw new Error(`Schema file not found: ${schemaPath}`)
116
- }
117
-
118
- let schema: HeadlessAdapterConfig
119
- try {
120
- const rawSchema = await schemaFile.json()
121
- schema = parseHeadlessConfig(rawSchema)
122
- } catch (error) {
123
- throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
124
- }
125
-
126
- // Load prompts
127
- const prompts = await loadPrompts(promptsPath)
128
-
129
- // Resolve paths
130
- const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
131
- const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
132
-
133
- // Determine effective timeout (CLI flag > schema default > harness default)
134
- const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
135
- const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
79
+ debug,
80
+ } = ctx
136
81
 
137
82
  // Log progress info
138
- logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
139
- logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
140
- logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
141
- if (concurrency > 1) {
142
- logProgress(`Concurrency: ${concurrency} workers`, progress)
83
+ logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
84
+ logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
85
+ logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
86
+ if (ctx.concurrency > 1) {
87
+ logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
143
88
  }
144
89
  if (resolvedWorkspaceDir) {
145
90
  logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
@@ -151,31 +96,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
151
96
  logProgress(`Debug mode: enabled`, progress)
152
97
  }
153
98
 
154
- // Create session manager with schema
155
- const sessions = createSessionManager({
156
- schema,
157
- timeout: effectiveTimeout,
158
- verbose: progress,
159
- debug,
160
- })
161
-
162
- // Clear output file if not appending
163
- if (resolvedOutputPath && !append) {
164
- await Bun.write(resolvedOutputPath, '')
165
- }
166
-
167
- // Create workspace base directory if specified
168
- // Uses fs.mkdir instead of shell to prevent command injection
169
- if (resolvedWorkspaceDir) {
170
- await mkdir(resolvedWorkspaceDir, { recursive: true })
171
- }
172
-
173
- const defaultWorkingDir = cwd ?? process.cwd()
174
-
175
- // Create write mutex for coordinating JSONL output
176
- const writeMutex = createWriteMutex()
177
- let isFirstOutput = true
178
-
179
99
  // Process a single prompt (used by worker pool)
180
100
  const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise<CaptureResult> => {
181
101
  // Determine working directory (per-prompt workspace or default)
@@ -301,11 +221,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
301
221
  }
302
222
 
303
223
  // Write result immediately (coordinated via mutex for concurrent writes)
304
- await writeMutex.write(async () => {
305
- const formatted = JSON.stringify(result)
306
- await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
307
- isFirstOutput = false
308
- })
224
+ await ctx.writeResult(result)
309
225
 
310
226
  const statusIcon = result.toolErrors ? '!' : '✓'
311
227
  const exitInfo = result.metadata?.timedOut
@@ -319,20 +235,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
319
235
  }
320
236
 
321
237
  // Run with worker pool
322
- const { results, errors } = await runWorkerPool(prompts, processPrompt, {
323
- concurrency,
324
- onProgress: (completed, total) => {
325
- logProgress(`Progress: ${completed}/${total} prompts completed`, progress)
326
- },
327
- })
328
-
329
- // Log any errors that occurred
330
- if (errors.length > 0) {
331
- logProgress(`Completed with ${errors.length} error(s)`, progress)
332
- }
333
-
334
- logProgress('Done!', progress)
335
- return results
238
+ return executePrompts(ctx, processPrompt)
336
239
  }
337
240
 
338
241
  // ============================================================================
@@ -356,6 +259,7 @@ export const capture = async (args: string[]): Promise<void> => {
356
259
  append: { type: 'boolean', default: false },
357
260
  grader: { type: 'string', short: 'g' },
358
261
  debug: { type: 'boolean', default: false },
262
+ stdin: { type: 'boolean', default: false },
359
263
  concurrency: { type: 'string', short: 'j' },
360
264
  'workspace-dir': { type: 'string' },
361
265
  help: { type: 'boolean', short: 'h' },
@@ -366,6 +270,7 @@ export const capture = async (args: string[]): Promise<void> => {
366
270
  if (values.help) {
367
271
  console.log(`
368
272
  Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]
273
+ cat prompts.jsonl | agent-eval-harness capture --stdin --schema <schema.json> [options]
369
274
 
370
275
  Arguments:
371
276
  prompts.jsonl Input file with evaluation prompts
@@ -376,6 +281,7 @@ Options:
376
281
  -c, --cwd Working directory for agent
377
282
  -t, --timeout Request timeout in ms (overrides schema default)
378
283
  -j, --concurrency Number of concurrent workers (default: 1)
284
+ --stdin Read prompts from stdin (mutually exclusive with file arg)
379
285
  --workspace-dir Base directory for per-prompt workspace isolation
380
286
  --progress Show progress to stderr
381
287
  --append Append to output file instead of overwriting
@@ -401,6 +307,11 @@ Parallelization:
401
307
  Each prompt gets its own agent session for isolation.
402
308
  Results are written as they complete (order may differ from input).
403
309
 
310
+ Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
311
+ at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
312
+ In memory-constrained environments (Docker, CI) this can cause OOM kills.
313
+ Use --stdin to pipe prompts for container-level orchestration.
314
+
404
315
  Workspace Isolation:
405
316
  Use --workspace-dir to create per-prompt directories.
406
317
  Each prompt runs in {workspace-dir}/prompt-{id}/.
@@ -422,13 +333,24 @@ Examples:
422
333
 
423
334
  # With debug mode
424
335
  agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
336
+
337
+ # Read prompts from stdin (container orchestration)
338
+ cat prompts.jsonl | agent-eval-harness capture --stdin -s claude.json -o results.jsonl
425
339
  `)
426
340
  return
427
341
  }
428
342
 
429
343
  const promptsPath = positionals[0]
430
- if (!promptsPath) {
431
- console.error('Error: prompts.jsonl path is required')
344
+ const useStdin = values.stdin ?? false
345
+
346
+ // Mutual exclusivity: --stdin and positional file
347
+ if (useStdin && promptsPath) {
348
+ console.error('Error: --stdin and prompts file argument are mutually exclusive')
349
+ process.exit(1)
350
+ }
351
+
352
+ if (!useStdin && !promptsPath) {
353
+ console.error('Error: prompts.jsonl path is required (or use --stdin)')
432
354
  process.exit(1)
433
355
  }
434
356
 
@@ -438,30 +360,23 @@ Examples:
438
360
  process.exit(1)
439
361
  }
440
362
 
441
- // Load grader if specified
442
- let grader: Grader | undefined
443
- if (values.grader) {
444
- try {
445
- grader = await loadGrader(values.grader)
446
- } catch (error) {
447
- console.error(`Error: ${error instanceof Error ? error.message : error}`)
363
+ // Read prompts from stdin if requested
364
+ let prompts: PromptCase[] | undefined
365
+ if (useStdin) {
366
+ const stdinPrompts = await readStdinPrompts()
367
+ if (!stdinPrompts || stdinPrompts.length === 0) {
368
+ console.error('Error: no prompts received on stdin')
448
369
  process.exit(1)
449
370
  }
371
+ prompts = stdinPrompts
450
372
  }
451
373
 
452
- // Validate and parse concurrency
453
- let concurrency = 1
454
- if (values.concurrency) {
455
- const parsed = Number.parseInt(values.concurrency, 10)
456
- if (Number.isNaN(parsed) || parsed < 1) {
457
- console.error('Error: --concurrency must be a positive integer')
458
- process.exit(1)
459
- }
460
- concurrency = parsed
461
- }
374
+ // Load grader if specified
375
+ const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
462
376
 
463
377
  await runCapture({
464
- promptsPath,
378
+ promptsPath: promptsPath ?? undefined,
379
+ prompts,
465
380
  schemaPath: values.schema,
466
381
  outputPath: values.output,
467
382
  cwd: values.cwd,
@@ -470,7 +385,7 @@ Examples:
470
385
  append: values.append ?? false,
471
386
  grader,
472
387
  debug: values.debug ?? false,
473
- concurrency,
388
+ concurrency: parseConcurrency(values.concurrency),
474
389
  workspaceDir: values['workspace-dir'],
475
390
  })
476
391
  }
@@ -0,0 +1,245 @@
1
+ /**
2
+ * Shared execution utilities for capture and trials commands.
3
+ *
4
+ * @remarks
5
+ * Extracts common setup logic: schema loading, prompt loading, path resolution,
6
+ * session manager creation, output initialization, and worker pool execution.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import { mkdir } from 'node:fs/promises'
12
+ import { createWriteMutex, loadPrompts, logProgress, resolvePath, runWorkerPool, writeOutput } from '../core.ts'
13
+ import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
14
+ import { createSessionManager, type SessionManager } from '../headless/headless-session-manager.ts'
15
+ import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
16
+ import type { Grader, PromptCase } from '../schemas.ts'
17
+
18
+ // ============================================================================
19
+ // Types
20
+ // ============================================================================
21
+
22
+ /** Base configuration shared by capture and trials commands */
23
+ export type BaseExecutionConfig = {
24
+ /** Path to prompts.jsonl file (required unless prompts provided) */
25
+ promptsPath?: string
26
+ /** Path to agent schema JSON file */
27
+ schemaPath: string
28
+ /** Pre-loaded prompt cases (from stdin); skips file loading when set */
29
+ prompts?: PromptCase[]
30
+ /** Output file path (undefined for stdout) */
31
+ outputPath?: string
32
+ /** Working directory for agent */
33
+ cwd?: string
34
+ /** Timeout per prompt in milliseconds (overrides schema default) */
35
+ timeout?: number
36
+ /** Show progress to stderr */
37
+ progress?: boolean
38
+ /** Append to output file instead of overwriting */
39
+ append?: boolean
40
+ /** Optional grader function */
41
+ grader?: Grader
42
+ /** Enable debug mode */
43
+ debug?: boolean
44
+ /** Number of concurrent workers (default: 1 for sequential) */
45
+ concurrency?: number
46
+ /** Base directory for per-prompt workspace isolation */
47
+ workspaceDir?: string
48
+ }
49
+
50
+ /** Prepared execution context returned by prepareExecution */
51
+ export type ExecutionContext = {
52
+ /** Parsed and validated headless adapter schema */
53
+ schema: HeadlessAdapterConfig
54
+ /** Loaded and validated prompt cases */
55
+ prompts: PromptCase[]
56
+ /** Session manager for creating/destroying agent sessions */
57
+ sessions: SessionManager
58
+ /** Resolved absolute output path (undefined for stdout) */
59
+ resolvedOutputPath?: string
60
+ /** Resolved absolute workspace directory path */
61
+ resolvedWorkspaceDir?: string
62
+ /** Effective timeout in milliseconds */
63
+ effectiveTimeout: number
64
+ /** Default working directory for agent sessions */
65
+ defaultWorkingDir: string
66
+ /** Number of concurrent workers */
67
+ concurrency: number
68
+ /** Whether to show progress output */
69
+ progress: boolean
70
+ /** Optional grader function */
71
+ grader?: Grader
72
+ /** Whether debug mode is enabled */
73
+ debug: boolean
74
+ /** Write a result object as JSONL, coordinated via mutex */
75
+ writeResult: (result: unknown) => Promise<void>
76
+ }
77
+
78
+ // ============================================================================
79
+ // Execution Setup
80
+ // ============================================================================
81
+
82
+ /**
83
+ * Prepare execution context from base configuration.
84
+ *
85
+ * @remarks
86
+ * Handles all shared setup: schema loading/validation, prompt loading,
87
+ * path resolution, session manager creation, output file initialization,
88
+ * workspace directory creation, and write mutex coordination.
89
+ *
90
+ * @param config - Base execution configuration
91
+ * @returns Prepared execution context
92
+ * @throws Error if schema file not found, invalid, or prompts missing
93
+ *
94
+ * @public
95
+ */
96
+ export const prepareExecution = async (config: BaseExecutionConfig): Promise<ExecutionContext> => {
97
+ const {
98
+ promptsPath,
99
+ schemaPath,
100
+ outputPath,
101
+ cwd,
102
+ timeout,
103
+ progress = false,
104
+ append = false,
105
+ grader,
106
+ debug = false,
107
+ concurrency = 1,
108
+ workspaceDir,
109
+ } = config
110
+
111
+ // Validate prompt source
112
+ if (!config.prompts && !promptsPath) {
113
+ throw new Error('Either promptsPath or prompts must be provided')
114
+ }
115
+
116
+ // Load and validate schema
117
+ const schemaFile = Bun.file(schemaPath)
118
+ if (!(await schemaFile.exists())) {
119
+ throw new Error(`Schema file not found: ${schemaPath}`)
120
+ }
121
+
122
+ let schema: HeadlessAdapterConfig
123
+ try {
124
+ const rawSchema = await schemaFile.json()
125
+ schema = parseHeadlessConfig(rawSchema)
126
+ } catch (error) {
127
+ throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
128
+ }
129
+
130
+ // Load prompts
131
+ const prompts = config.prompts ?? (await loadPrompts(promptsPath!))
132
+
133
+ // Resolve paths
134
+ const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
135
+ const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
136
+
137
+ // Determine effective timeout (CLI flag > schema default > harness default)
138
+ const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
139
+ const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
140
+
141
+ // Create session manager
142
+ const sessions = createSessionManager({
143
+ schema,
144
+ timeout: effectiveTimeout,
145
+ verbose: progress,
146
+ debug,
147
+ })
148
+
149
+ // Initialize output file (clear if not appending)
150
+ if (resolvedOutputPath && !append) {
151
+ await Bun.write(resolvedOutputPath, '')
152
+ }
153
+
154
+ // Create workspace base directory if specified
155
+ if (resolvedWorkspaceDir) {
156
+ await mkdir(resolvedWorkspaceDir, { recursive: true })
157
+ }
158
+
159
+ const defaultWorkingDir = cwd ?? process.cwd()
160
+
161
+ // Create write mutex with closure for coordinated result writing
162
+ const writeMutex = createWriteMutex()
163
+ let isFirstOutput = true
164
+
165
+ const writeResult = async (result: unknown) => {
166
+ await writeMutex.write(async () => {
167
+ const formatted = JSON.stringify(result)
168
+ await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
169
+ isFirstOutput = false
170
+ })
171
+ }
172
+
173
+ return {
174
+ schema,
175
+ prompts,
176
+ sessions,
177
+ resolvedOutputPath,
178
+ resolvedWorkspaceDir,
179
+ effectiveTimeout,
180
+ defaultWorkingDir,
181
+ concurrency,
182
+ progress,
183
+ grader,
184
+ debug,
185
+ writeResult,
186
+ }
187
+ }
188
+
189
+ // ============================================================================
190
+ // Worker Pool Execution
191
+ // ============================================================================
192
+
193
+ /**
194
+ * Execute prompts through a worker pool with progress logging.
195
+ *
196
+ * @remarks
197
+ * Common wrapper for the runWorkerPool pattern used by both capture and trials.
198
+ * Handles progress callbacks, error logging, and completion logging.
199
+ *
200
+ * @param ctx - Execution context from prepareExecution
201
+ * @param processFn - Function to process each prompt
202
+ * @returns Array of results
203
+ *
204
+ * @public
205
+ */
206
+ export const executePrompts = async <T>(
207
+ ctx: ExecutionContext,
208
+ processFn: (promptCase: PromptCase, index: number) => Promise<T>,
209
+ ): Promise<T[]> => {
210
+ const { results, errors } = await runWorkerPool(ctx.prompts, processFn, {
211
+ concurrency: ctx.concurrency,
212
+ onProgress: (completed, total) => {
213
+ logProgress(`Progress: ${completed}/${total} prompts completed`, ctx.progress)
214
+ },
215
+ })
216
+
217
+ if (errors.length > 0) {
218
+ logProgress(`Completed with ${errors.length} error(s)`, ctx.progress)
219
+ }
220
+
221
+ logProgress('Done!', ctx.progress)
222
+ return results
223
+ }
224
+
225
+ // ============================================================================
226
+ // CLI Helpers
227
+ // ============================================================================
228
+
229
+ /**
230
+ * Parse and validate concurrency CLI argument.
231
+ *
232
+ * @param value - Raw string value from parseArgs
233
+ * @returns Validated positive integer (default: 1)
234
+ *
235
+ * @public
236
+ */
237
+ export const parseConcurrency = (value: string | undefined): number => {
238
+ if (!value) return 1
239
+ const parsed = Number.parseInt(value, 10)
240
+ if (Number.isNaN(parsed) || parsed < 1) {
241
+ console.error('Error: --concurrency must be a positive integer')
242
+ process.exit(1)
243
+ }
244
+ return parsed
245
+ }
@@ -142,6 +142,16 @@ describe('runCapture configuration', () => {
142
142
  expect(config.concurrency).toBeUndefined()
143
143
  expect(config.workspaceDir).toBeUndefined()
144
144
  })
145
+
146
+ test('CaptureConfig accepts prompts without promptsPath', () => {
147
+ const config: CaptureConfig = {
148
+ schemaPath: './test-schema.json',
149
+ prompts: [{ id: 't1', input: 'hello' }],
150
+ }
151
+
152
+ expect(config.promptsPath).toBeUndefined()
153
+ expect(config.prompts).toHaveLength(1)
154
+ })
145
155
  })
146
156
 
147
157
  // ============================================================================
@@ -168,6 +178,23 @@ describe('capture CLI', () => {
168
178
  expect(stdout).toContain('-s, --schema')
169
179
  expect(stdout).toContain('-j, --concurrency')
170
180
  expect(stdout).toContain('--workspace-dir')
181
+ expect(stdout).toContain('--stdin')
182
+ })
183
+
184
+ test('shows error for --stdin with positional file', async () => {
185
+ const proc = Bun.spawn(
186
+ ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
187
+ {
188
+ stdout: 'pipe',
189
+ stderr: 'pipe',
190
+ },
191
+ )
192
+
193
+ const stderr = await new Response(proc.stderr).text()
194
+ const exitCode = await proc.exited
195
+
196
+ expect(exitCode).not.toBe(0)
197
+ expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
171
198
  })
172
199
 
173
200
  test('shows error for missing prompts file argument', async () => {
@@ -44,6 +44,17 @@ describe('TrialsConfig configuration', () => {
44
44
  expect(config.concurrency).toBeUndefined()
45
45
  expect(config.workspaceDir).toBeUndefined()
46
46
  })
47
+
48
+ test('TrialsConfig accepts prompts without promptsPath', () => {
49
+ const config: TrialsConfig = {
50
+ schemaPath: './test-schema.json',
51
+ k: 3,
52
+ prompts: [{ id: 't1', input: 'hello' }],
53
+ }
54
+
55
+ expect(config.promptsPath).toBeUndefined()
56
+ expect(config.prompts).toHaveLength(1)
57
+ })
47
58
  })
48
59
 
49
60
  // ============================================================================
@@ -72,6 +83,23 @@ describe('trials CLI', () => {
72
83
  expect(stdout).toContain('pass@k')
73
84
  expect(stdout).toContain('-j, --concurrency')
74
85
  expect(stdout).toContain('--workspace-dir')
86
+ expect(stdout).toContain('--stdin')
87
+ })
88
+
89
+ test('shows error for --stdin with positional file', async () => {
90
+ const proc = Bun.spawn(
91
+ ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
92
+ {
93
+ stdout: 'pipe',
94
+ stderr: 'pipe',
95
+ },
96
+ )
97
+
98
+ const stderr = await new Response(proc.stderr).text()
99
+ const exitCode = await proc.exited
100
+
101
+ expect(exitCode).not.toBe(0)
102
+ expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
75
103
  })
76
104
 
77
105
  test('shows error for missing prompts file argument', async () => {
@@ -11,25 +11,13 @@
11
11
  * @packageDocumentation
12
12
  */
13
13
 
14
- import { mkdir } from 'node:fs/promises'
15
14
  import { parseArgs } from 'node:util'
16
- import {
17
- createWorkspaceDir,
18
- createWriteMutex,
19
- extractOutput,
20
- extractTrajectory,
21
- loadPrompts,
22
- logProgress,
23
- resolvePath,
24
- runWorkerPool,
25
- writeOutput,
26
- } from '../core.ts'
27
- import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
15
+ import { createWorkspaceDir, extractOutput, extractTrajectory, logProgress, readStdinPrompts } from '../core.ts'
28
16
  import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
29
- import { createSessionManager } from '../headless/headless-session-manager.ts'
30
- import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
31
- import { loadGrader } from '../schemas/grader-loader.ts'
32
- import type { Grader, TrialEntry, TrialResult } from '../schemas.ts'
17
+ import { DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
18
+ import { loadGraderOrExit } from '../schemas/grader-loader.ts'
19
+ import type { PromptCase, TrialEntry, TrialResult } from '../schemas.ts'
20
+ import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
33
21
 
34
22
  // ============================================================================
35
23
  // Pass@k/Pass^k Calculation
@@ -85,31 +73,9 @@ export const calculatePassExpK = (passes: number, k: number): number => {
85
73
  // ============================================================================
86
74
 
87
75
  /** Configuration for trials command */
88
- export type TrialsConfig = {
89
- /** Path to prompts.jsonl file */
90
- promptsPath: string
91
- /** Path to agent schema JSON file */
92
- schemaPath: string
76
+ export type TrialsConfig = BaseExecutionConfig & {
93
77
  /** Number of trials per prompt */
94
78
  k: number
95
- /** Output file path */
96
- outputPath?: string
97
- /** Working directory for agent */
98
- cwd?: string
99
- /** Timeout per prompt in milliseconds (overrides schema default) */
100
- timeout?: number
101
- /** Show progress to stderr */
102
- progress?: boolean
103
- /** Append to output file */
104
- append?: boolean
105
- /** Optional grader function */
106
- grader?: Grader
107
- /** Enable debug mode */
108
- debug?: boolean
109
- /** Number of concurrent workers (default: 1 for sequential) */
110
- concurrency?: number
111
- /** Base directory for per-prompt workspace isolation */
112
- workspaceDir?: string
113
79
  }
114
80
 
115
81
  // ============================================================================
@@ -123,53 +89,17 @@ export type TrialsConfig = {
123
89
  * @returns Array of trial results
124
90
  */
125
91
  export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
126
- const {
127
- promptsPath,
128
- schemaPath,
129
- k,
130
- outputPath,
131
- cwd,
132
- timeout,
133
- progress = false,
134
- append = false,
135
- grader,
136
- debug = false,
137
- concurrency = 1,
138
- workspaceDir,
139
- } = config
140
-
141
- // Load and validate schema
142
- const schemaFile = Bun.file(schemaPath)
143
- if (!(await schemaFile.exists())) {
144
- throw new Error(`Schema file not found: ${schemaPath}`)
145
- }
146
-
147
- let schema: HeadlessAdapterConfig
148
- try {
149
- const rawSchema = await schemaFile.json()
150
- schema = parseHeadlessConfig(rawSchema)
151
- } catch (error) {
152
- throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
153
- }
154
-
155
- // Load prompts
156
- const prompts = await loadPrompts(promptsPath)
157
-
158
- // Resolve paths
159
- const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
160
- const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
161
-
162
- // Determine effective timeout (CLI flag > schema default > harness default)
163
- const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
164
- const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
92
+ const { k } = config
93
+ const ctx = await prepareExecution(config)
94
+ const { schema, prompts, sessions, resolvedWorkspaceDir, defaultWorkingDir, progress, grader } = ctx
165
95
 
166
96
  // Log progress info
167
- logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
97
+ logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
168
98
  logProgress(`Running ${k} trials per prompt (${prompts.length * k} total executions)`, progress)
169
- logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
170
- logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
171
- if (concurrency > 1) {
172
- logProgress(`Concurrency: ${concurrency} workers`, progress)
99
+ logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
100
+ logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
101
+ if (ctx.concurrency > 1) {
102
+ logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
173
103
  }
174
104
  if (resolvedWorkspaceDir) {
175
105
  logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
@@ -178,31 +108,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
178
108
  logProgress('Grader: enabled (will compute pass@k metrics)', progress)
179
109
  }
180
110
 
181
- // Create session manager with schema
182
- const sessions = createSessionManager({
183
- schema,
184
- timeout: effectiveTimeout,
185
- verbose: progress,
186
- debug,
187
- })
188
-
189
- // Clear output file if not appending
190
- if (resolvedOutputPath && !append) {
191
- await Bun.write(resolvedOutputPath, '')
192
- }
193
-
194
- // Create workspace base directory if specified
195
- // Uses fs.mkdir instead of shell to prevent command injection
196
- if (resolvedWorkspaceDir) {
197
- await mkdir(resolvedWorkspaceDir, { recursive: true })
198
- }
199
-
200
- const defaultWorkingDir = cwd ?? process.cwd()
201
-
202
- // Create write mutex for coordinating JSONL output
203
- const writeMutex = createWriteMutex()
204
- let isFirstOutput = true
205
-
206
111
  // Process all trials for a single prompt
207
112
  const processPromptTrials = async (promptCase: (typeof prompts)[number], index: number): Promise<TrialResult> => {
208
113
  logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
@@ -308,11 +213,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
308
213
  }
309
214
 
310
215
  // Write result immediately (coordinated via mutex for concurrent writes)
311
- await writeMutex.write(async () => {
312
- const formatted = JSON.stringify(result)
313
- await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
314
- isFirstOutput = false
315
- })
216
+ await ctx.writeResult(result)
316
217
 
317
218
  if (grader) {
318
219
  logProgress(
@@ -325,20 +226,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
325
226
  }
326
227
 
327
228
  // Run with worker pool (parallelizes across prompts, trials for each prompt run sequentially)
328
- const { results, errors } = await runWorkerPool(prompts, processPromptTrials, {
329
- concurrency,
330
- onProgress: (completed, total) => {
331
- logProgress(`Progress: ${completed}/${total} prompts completed`, progress)
332
- },
333
- })
334
-
335
- // Log any errors that occurred
336
- if (errors.length > 0) {
337
- logProgress(`Completed with ${errors.length} error(s)`, progress)
338
- }
339
-
340
- logProgress('Done!', progress)
341
- return results
229
+ return executePrompts(ctx, processPromptTrials)
342
230
  }
343
231
 
344
232
  // ============================================================================
@@ -363,6 +251,7 @@ export const trials = async (args: string[]): Promise<void> => {
363
251
  append: { type: 'boolean', default: false },
364
252
  grader: { type: 'string', short: 'g' },
365
253
  debug: { type: 'boolean', default: false },
254
+ stdin: { type: 'boolean', default: false },
366
255
  concurrency: { type: 'string', short: 'j' },
367
256
  'workspace-dir': { type: 'string' },
368
257
  help: { type: 'boolean', short: 'h' },
@@ -373,6 +262,7 @@ export const trials = async (args: string[]): Promise<void> => {
373
262
  if (values.help) {
374
263
  console.log(`
375
264
  Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]
265
+ cat prompts.jsonl | agent-eval-harness trials --stdin --schema <schema.json> [options]
376
266
 
377
267
  Arguments:
378
268
  prompts.jsonl Input file with evaluation prompts
@@ -384,6 +274,7 @@ Options:
384
274
  -c, --cwd Working directory for agent
385
275
  -t, --timeout Request timeout in ms (overrides schema default)
386
276
  -j, --concurrency Number of concurrent workers (default: 1)
277
+ --stdin Read prompts from stdin (mutually exclusive with file arg)
387
278
  --workspace-dir Base directory for per-trial workspace isolation
388
279
  --progress Show progress to stderr
389
280
  --append Append to output file
@@ -404,6 +295,11 @@ Parallelization:
404
295
  Each prompt's k trials still run sequentially (required for aggregation).
405
296
  With 151 prompts and -j 4, you get 4 prompts running trials concurrently.
406
297
 
298
+ Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
299
+ at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
300
+ In memory-constrained environments (Docker, CI) this can cause OOM kills.
301
+ Use --stdin to pipe prompts for container-level orchestration.
302
+
407
303
  Workspace Isolation:
408
304
  Use --workspace-dir to create per-trial directories.
409
305
  Each trial runs in {workspace-dir}/prompt-{id}-trial-{n}/.
@@ -422,13 +318,24 @@ Examples:
422
318
 
423
319
  # With TypeScript grader
424
320
  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl
321
+
322
+ # Read prompts from stdin (container orchestration)
323
+ cat prompts.jsonl | agent-eval-harness trials --stdin -s claude.json -k 5 -o trials.jsonl
425
324
  `)
426
325
  return
427
326
  }
428
327
 
429
328
  const promptsPath = positionals[0]
430
- if (!promptsPath) {
431
- console.error('Error: prompts.jsonl path is required')
329
+ const useStdin = values.stdin ?? false
330
+
331
+ // Mutual exclusivity: --stdin and positional file
332
+ if (useStdin && promptsPath) {
333
+ console.error('Error: --stdin and prompts file argument are mutually exclusive')
334
+ process.exit(1)
335
+ }
336
+
337
+ if (!useStdin && !promptsPath) {
338
+ console.error('Error: prompts.jsonl path is required (or use --stdin)')
432
339
  process.exit(1)
433
340
  }
434
341
 
@@ -438,30 +345,23 @@ Examples:
438
345
  process.exit(1)
439
346
  }
440
347
 
441
- // Load grader if specified
442
- let grader: Grader | undefined
443
- if (values.grader) {
444
- try {
445
- grader = await loadGrader(values.grader)
446
- } catch (error) {
447
- console.error(`Error: ${error instanceof Error ? error.message : error}`)
348
+ // Read prompts from stdin if requested
349
+ let prompts: PromptCase[] | undefined
350
+ if (useStdin) {
351
+ const stdinPrompts = await readStdinPrompts()
352
+ if (!stdinPrompts || stdinPrompts.length === 0) {
353
+ console.error('Error: no prompts received on stdin')
448
354
  process.exit(1)
449
355
  }
356
+ prompts = stdinPrompts
450
357
  }
451
358
 
452
- // Validate and parse concurrency
453
- let concurrency = 1
454
- if (values.concurrency) {
455
- const parsed = Number.parseInt(values.concurrency, 10)
456
- if (Number.isNaN(parsed) || parsed < 1) {
457
- console.error('Error: --concurrency must be a positive integer')
458
- process.exit(1)
459
- }
460
- concurrency = parsed
461
- }
359
+ // Load grader if specified
360
+ const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
462
361
 
463
362
  await runTrials({
464
- promptsPath,
363
+ promptsPath: promptsPath ?? undefined,
364
+ prompts,
465
365
  schemaPath: values.schema,
466
366
  k: Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10),
467
367
  outputPath: values.output,
@@ -471,7 +371,7 @@ Examples:
471
371
  append: values.append ?? false,
472
372
  grader,
473
373
  debug: values.debug ?? false,
474
- concurrency,
374
+ concurrency: parseConcurrency(values.concurrency),
475
375
  workspaceDir: values['workspace-dir'],
476
376
  })
477
377
  }
@@ -9,9 +9,9 @@
9
9
  */
10
10
 
11
11
  import { parseArgs } from 'node:util'
12
- import { loadGrader } from '../schemas/grader-loader.ts'
12
+ import { loadPrompts, resolvePath } from '../core.ts'
13
+ import { loadGraderOrExit } from '../schemas/grader-loader.ts'
13
14
  import type { Grader, ValidationResult } from '../schemas.ts'
14
- import { loadPrompts } from './capture.ts'
15
15
 
16
16
  // ============================================================================
17
17
  // Types
@@ -27,16 +27,6 @@ export type ValidateRefsConfig = {
27
27
  grader: Grader
28
28
  }
29
29
 
30
- // ============================================================================
31
- // Helpers
32
- // ============================================================================
33
-
34
- /** Resolve path relative to process.cwd() */
35
- const resolvePath = (path: string): string => {
36
- if (path.startsWith('/')) return path
37
- return `${process.cwd()}/${path}`
38
- }
39
-
40
30
  // ============================================================================
41
31
  // Validate-Refs Implementation
42
32
  // ============================================================================
@@ -171,13 +161,7 @@ Examples:
171
161
  }
172
162
 
173
163
  // Load grader
174
- let grader: Grader
175
- try {
176
- grader = await loadGrader(values.grader)
177
- } catch (error) {
178
- console.error(`Error: ${error instanceof Error ? error.message : error}`)
179
- process.exit(1)
180
- }
164
+ const grader = await loadGraderOrExit(values.grader)
181
165
 
182
166
  await runValidateRefs({
183
167
  promptsPath,
package/src/core/core.ts CHANGED
@@ -11,7 +11,15 @@
11
11
  */
12
12
 
13
13
  // Loading utilities
14
- export { buildResultsIndex, countLines, loadJsonl, loadPrompts, loadResults, streamResults } from './loading.ts'
14
+ export {
15
+ buildResultsIndex,
16
+ countLines,
17
+ loadJsonl,
18
+ loadPrompts,
19
+ loadResults,
20
+ readStdinPrompts,
21
+ streamResults,
22
+ } from './loading.ts'
15
23
  // Output utilities
16
24
  export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
17
25
  // Native streaming utilities
@@ -39,6 +39,44 @@ export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
39
39
  })
40
40
  }
41
41
 
42
+ /**
43
+ * Read prompts from stdin as JSONL.
44
+ *
45
+ * @remarks
46
+ * Reads all data from stdin, parses each line as JSON, and validates against
47
+ * PromptCaseSchema. Returns null when stdin is a TTY (no piped input) or when the piped input is empty.
48
+ * Uses chunked Buffer reads matching the pattern in pipeline/run.ts.
49
+ *
50
+ * @returns Parsed and validated prompt cases, or null if stdin is a TTY or the piped input is empty
51
+ * @throws Error if any line is invalid JSON or fails schema validation
52
+ *
53
+ * @public
54
+ */
55
+ export const readStdinPrompts = async (): Promise<PromptCase[] | null> => {
56
+ if (process.stdin.isTTY) {
57
+ return null
58
+ }
59
+
60
+ const chunks: Buffer[] = []
61
+ for await (const chunk of process.stdin) {
62
+ chunks.push(chunk)
63
+ }
64
+
65
+ const content = Buffer.concat(chunks).toString('utf-8').trim()
66
+ if (!content) return null
67
+
68
+ return content
69
+ .split('\n')
70
+ .filter(Boolean)
71
+ .map((line, index) => {
72
+ try {
73
+ return PromptCaseSchema.parse(JSON.parse(line))
74
+ } catch (error) {
75
+ throw new Error(`Invalid stdin prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
76
+ }
77
+ })
78
+ }
79
+
42
80
  /**
43
81
  * Load capture results from a JSONL file.
44
82
  *
package/src/core.ts CHANGED
@@ -31,6 +31,7 @@ export {
31
31
  loadResults,
32
32
  logProgress,
33
33
  type ProgressCallback,
34
+ readStdinPrompts,
34
35
  resolvePath,
35
36
  runWorkerPool,
36
37
  streamJsonl,
@@ -13,6 +13,7 @@
13
13
  * @packageDocumentation
14
14
  */
15
15
 
16
+ import { resolvePath } from '../core.ts'
16
17
  import type { Grader, TrajectoryStep } from './schemas.ts'
17
18
  import { GraderResultSchema } from './schemas.ts'
18
19
 
@@ -30,12 +31,6 @@ const JS_EXTENSIONS = ['.ts', '.js', '.mjs', '.cjs']
30
31
  /** Check if a file path is a JavaScript/TypeScript module */
31
32
  const isJsModule = (path: string): boolean => JS_EXTENSIONS.some((ext) => path.endsWith(ext))
32
33
 
33
- /** Resolve path relative to process.cwd() */
34
- const resolvePath = (path: string): string => {
35
- if (path.startsWith('/')) return path
36
- return `${process.cwd()}/${path}`
37
- }
38
-
39
34
  // ============================================================================
40
35
  // Executable Grader
41
36
  // ============================================================================
@@ -169,6 +164,28 @@ const loadModuleGrader = async (modulePath: string): Promise<Grader> => {
169
164
  * const grader = await loadGrader('./my-grader')
170
165
  * ```
171
166
  */
167
+ /**
168
+ * Load a grader from a file path, exiting on failure.
169
+ *
170
+ * @remarks
171
+ * CLI-friendly wrapper around `loadGrader` that prints the error to stderr
172
+ * and calls `process.exit(1)` on failure. Eliminates the duplicated
173
+ * try/catch pattern across CLI handlers.
174
+ *
175
+ * @param graderPath - Path to the grader (relative or absolute)
176
+ * @returns Grader function (never returns on failure)
177
+ *
178
+ * @public
179
+ */
180
+ export const loadGraderOrExit = async (graderPath: string): Promise<Grader> => {
181
+ try {
182
+ return await loadGrader(graderPath)
183
+ } catch (error) {
184
+ console.error(`Error: ${error instanceof Error ? error.message : error}`)
185
+ process.exit(1)
186
+ }
187
+ }
188
+
172
189
  export const loadGrader = async (graderPath: string): Promise<Grader> => {
173
190
  const resolvedPath = resolvePath(graderPath)
174
191
 
@@ -10,6 +10,7 @@
10
10
 
11
11
  import { parseArgs } from 'node:util'
12
12
  import { z } from 'zod'
13
+ import { resolvePath } from '../core.ts'
13
14
  import * as schemas from './schemas.ts'
14
15
 
15
16
  // ============================================================================
@@ -57,12 +58,6 @@ export type SchemasConfig = {
57
58
  // Helpers
58
59
  // ============================================================================
59
60
 
60
- /** Resolve path relative to process.cwd() */
61
- const resolvePath = (path: string): string => {
62
- if (path.startsWith('/')) return path
63
- return `${process.cwd()}/${path}`
64
- }
65
-
66
61
  /** Generate JSON Schema from Zod schema */
67
62
  const toJsonSchema = (schema: z.ZodSchema, name: string): object => {
68
63
  try {
package/src/schemas.ts CHANGED
@@ -18,7 +18,7 @@ export {
18
18
  TAIL_LINES,
19
19
  } from './schemas/constants.ts'
20
20
  // Grader loader
21
- export { loadGrader } from './schemas/grader-loader.ts'
21
+ export { loadGrader, loadGraderOrExit } from './schemas/grader-loader.ts'
22
22
  // Core session types
23
23
  // JSON-RPC types (MCP compatibility)
24
24
  // MCP server configuration