@plaited/agent-eval-harness 0.9.0 → 0.11.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -58,11 +58,21 @@ bunx @plaited/agent-eval-harness capture prompts.jsonl \
58
58
  --schema ./schemas/claude-headless.json \
59
59
  -o results.jsonl
60
60
 
61
+ # Parallel capture (4x faster with 4 workers)
62
+ bunx @plaited/agent-eval-harness capture prompts.jsonl \
63
+ --schema ./schemas/claude-headless.json \
64
+ -j 4 -o results.jsonl
65
+
61
66
  # Run trials for pass@k analysis with debug mode
62
67
  bunx @plaited/agent-eval-harness trials prompts.jsonl \
63
68
  --schema ./schemas/claude-headless.json \
64
69
  -k 5 --grader ./grader.ts --debug
65
70
 
71
+ # Parallel trials (4 prompts running trials concurrently)
72
+ bunx @plaited/agent-eval-harness trials prompts.jsonl \
73
+ --schema ./schemas/claude-headless.json \
74
+ -k 5 -j 4 --workspace-dir ./workspaces -o trials.jsonl
75
+
66
76
  # Summarize results
67
77
  bunx @plaited/agent-eval-harness summarize results.jsonl -o summary.jsonl
68
78
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.9.0",
3
+ "version": "0.11.0",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -9,8 +9,8 @@
9
9
  */
10
10
 
11
11
  import { parseArgs } from 'node:util'
12
+ import { loadPrompts, resolvePath } from '../core.ts'
12
13
  import type { BalanceAnalysis, CategoryDistribution, PromptCase } from '../schemas.ts'
13
- import { loadPrompts } from './capture.ts'
14
14
 
15
15
  // ============================================================================
16
16
  // Types
@@ -28,16 +28,6 @@ export type BalanceConfig = {
28
28
  threshold?: number
29
29
  }
30
30
 
31
- // ============================================================================
32
- // Helpers
33
- // ============================================================================
34
-
35
- /** Resolve path relative to process.cwd() */
36
- const resolvePath = (path: string): string => {
37
- if (path.startsWith('/')) return path
38
- return `${process.cwd()}/${path}`
39
- }
40
-
41
31
  /**
42
32
  * Analyze category distribution across prompts.
43
33
  *
@@ -11,7 +11,7 @@
11
11
  import { parseArgs } from 'node:util'
12
12
  import { loadResults, resolvePath } from '../core.ts'
13
13
  import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from '../schemas/constants.ts'
14
- import { loadGrader } from '../schemas/grader-loader.ts'
14
+ import { loadGraderOrExit } from '../schemas/grader-loader.ts'
15
15
  import type { CalibrationSample, Grader, GraderResult, TrajectoryStep } from '../schemas.ts'
16
16
 
17
17
  // ============================================================================
@@ -293,15 +293,7 @@ Examples:
293
293
  }
294
294
 
295
295
  // Load grader if specified
296
- let grader: Grader | undefined
297
- if (values.grader) {
298
- try {
299
- grader = await loadGrader(values.grader)
300
- } catch (error) {
301
- console.error(`Error: ${error instanceof Error ? error.message : error}`)
302
- process.exit(1)
303
- }
304
- }
296
+ const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
305
297
 
306
298
  await runCalibrate({
307
299
  resultsPath,
@@ -13,22 +13,20 @@
13
13
 
14
14
  import { parseArgs } from 'node:util'
15
15
  import {
16
+ createWorkspaceDir,
16
17
  detectTrajectoryRichness,
17
18
  extractOutput,
18
19
  extractTrajectory,
19
20
  getInputPreview,
20
21
  hasToolErrors,
21
- loadPrompts,
22
22
  logProgress,
23
- resolvePath,
24
- writeOutput,
23
+ readStdinPrompts,
25
24
  } from '../core.ts'
26
- import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
27
25
  import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
28
- import { createSessionManager, type ProcessExitInfo, type PromptResult } from '../headless/headless-session-manager.ts'
29
- import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
30
- import { loadGrader } from '../schemas/grader-loader.ts'
31
- import type { CaptureResult, Grader, TrajectoryRichness } from '../schemas.ts'
26
+ import type { ProcessExitInfo, PromptResult } from '../headless/headless-session-manager.ts'
27
+ import { loadGraderOrExit } from '../schemas/grader-loader.ts'
28
+ import type { CaptureResult, PromptCase, TrajectoryRichness } from '../schemas.ts'
29
+ import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
32
30
 
33
31
  // ============================================================================
34
32
  // Re-exports for backward compatibility
@@ -51,26 +49,7 @@ export {
51
49
  // ============================================================================
52
50
 
53
51
  /** Configuration for capture command */
54
- export type CaptureConfig = {
55
- /** Path to prompts.jsonl file */
56
- promptsPath: string
57
- /** Path to agent schema JSON file */
58
- schemaPath: string
59
- /** Output file path (undefined for stdout) */
60
- outputPath?: string
61
- /** Working directory for agent */
62
- cwd?: string
63
- /** Timeout per prompt in milliseconds (overrides schema default) */
64
- timeout?: number
65
- /** Show progress to stderr */
66
- progress?: boolean
67
- /** Append to output file instead of overwriting */
68
- append?: boolean
69
- /** Optional grader function */
70
- grader?: Grader
71
- /** Enable debug mode for detailed output */
72
- debug?: boolean
73
- }
52
+ export type CaptureConfig = BaseExecutionConfig
74
53
 
75
54
  // ============================================================================
76
55
  // Capture Implementation
@@ -87,46 +66,29 @@ export type CaptureConfig = {
87
66
  * @returns Array of capture results
88
67
  */
89
68
  export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
69
+ const ctx = await prepareExecution(config)
90
70
  const {
91
- promptsPath,
92
- schemaPath,
93
- outputPath,
94
- cwd,
95
- timeout,
96
- progress = false,
97
- append = false,
71
+ schema,
72
+ prompts,
73
+ sessions,
74
+ resolvedOutputPath,
75
+ resolvedWorkspaceDir,
76
+ defaultWorkingDir,
77
+ progress,
98
78
  grader,
99
- debug = false,
100
- } = config
79
+ debug,
80
+ } = ctx
101
81
 
102
- // Load and validate schema
103
- const schemaFile = Bun.file(schemaPath)
104
- if (!(await schemaFile.exists())) {
105
- throw new Error(`Schema file not found: ${schemaPath}`)
82
+ // Log progress info
83
+ logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
84
+ logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
85
+ logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
86
+ if (ctx.concurrency > 1) {
87
+ logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
106
88
  }
107
-
108
- let schema: HeadlessAdapterConfig
109
- try {
110
- const rawSchema = await schemaFile.json()
111
- schema = parseHeadlessConfig(rawSchema)
112
- } catch (error) {
113
- throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
89
+ if (resolvedWorkspaceDir) {
90
+ logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
114
91
  }
115
-
116
- // Load prompts
117
- const prompts = await loadPrompts(promptsPath)
118
-
119
- // Resolve output path
120
- const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
121
-
122
- // Determine effective timeout (CLI flag > schema default > harness default)
123
- const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
124
- const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
125
-
126
- // Log progress info
127
- logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
128
- logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
129
- logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
130
92
  if (resolvedOutputPath) {
131
93
  logProgress(`Output: ${resolvedOutputPath}`, progress)
132
94
  }
@@ -134,37 +96,24 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
134
96
  logProgress(`Debug mode: enabled`, progress)
135
97
  }
136
98
 
137
- // Create session manager with schema
138
- const sessions = createSessionManager({
139
- schema,
140
- timeout: effectiveTimeout,
141
- verbose: progress,
142
- debug,
143
- })
99
+ // Process a single prompt (used by worker pool)
100
+ const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise<CaptureResult> => {
101
+ // Determine working directory (per-prompt workspace or default)
102
+ const workingDir = resolvedWorkspaceDir
103
+ ? await createWorkspaceDir(resolvedWorkspaceDir, promptCase.id)
104
+ : defaultWorkingDir
144
105
 
145
- // Clear output file if not appending
146
- if (resolvedOutputPath && !append) {
147
- await Bun.write(resolvedOutputPath, '')
148
- }
149
-
150
- const workingDir = cwd ?? process.cwd()
151
- const results: CaptureResult[] = []
152
- let isFirstOutput = true
153
-
154
- // Run evaluations sequentially - fresh session per entry
155
- for (let i = 0; i < prompts.length; i++) {
156
- const promptCase = prompts[i]
157
- if (!promptCase) continue
158
-
159
- logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
106
+ logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
160
107
 
161
108
  const startTime = Date.now()
162
109
  let result: CaptureResult
110
+ let sessionId: string | undefined
163
111
 
164
112
  try {
165
113
  // Create fresh session for each entry (ensures isolation)
166
114
  const sessionStart = Date.now()
167
115
  const session = await sessions.create(workingDir)
116
+ sessionId = session.id
168
117
  const sessionCreation = Date.now() - sessionStart
169
118
  logProgress(` Session: ${session.id}`, progress)
170
119
 
@@ -177,9 +126,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
177
126
  let lastExitInfo: ProcessExitInfo | undefined
178
127
  let lastOutput = ''
179
128
 
180
- // TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
181
- // The session manager would need to accept timeout per-call to support this
182
-
183
129
  // Execute each turn sequentially in the same session
184
130
  for (const turnInput of inputs) {
185
131
  const turnResult: PromptResult = await sessions.prompt(session.id, turnInput)
@@ -198,7 +144,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
198
144
 
199
145
  result = {
200
146
  id: promptCase.id,
201
- input: promptCase.input, // Preserve original (string or array)
147
+ input: promptCase.input,
202
148
  output,
203
149
  ...(promptCase.hint && { hint: promptCase.hint }),
204
150
  trajectory,
@@ -207,6 +153,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
207
153
  agent: schema.name,
208
154
  trajectoryRichness,
209
155
  turnCount,
156
+ ...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
210
157
  ...(lastExitInfo && {
211
158
  exitCode: lastExitInfo.exitCode,
212
159
  signal: lastExitInfo.signal,
@@ -236,14 +183,10 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
236
183
 
237
184
  result.score = graderResult
238
185
 
239
- // Merge outcome from grader if present
240
186
  if (graderResult.outcome) {
241
187
  result.outcome = graderResult.outcome
242
188
  }
243
189
  }
244
-
245
- // Clean up session
246
- sessions.destroy(session.id)
247
190
  } catch (error) {
248
191
  const endTime = Date.now()
249
192
  const message = error instanceof Error ? error.message : String(error)
@@ -259,6 +202,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
259
202
  agent: schema.name,
260
203
  trajectoryRichness: 'minimal' as TrajectoryRichness,
261
204
  turnCount: inputs.length,
205
+ ...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
262
206
  },
263
207
  timing: {
264
208
  start: startTime,
@@ -269,14 +213,15 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
269
213
  toolErrors: true,
270
214
  errors: [message],
271
215
  }
216
+ } finally {
217
+ // Always clean up session if it was created
218
+ if (sessionId) {
219
+ sessions.destroy(sessionId)
220
+ }
272
221
  }
273
222
 
274
- results.push(result)
275
-
276
- // Write result immediately
277
- const formatted = JSON.stringify(result)
278
- await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
279
- isFirstOutput = false
223
+ // Write result immediately (coordinated via mutex for concurrent writes)
224
+ await ctx.writeResult(result)
280
225
 
281
226
  const statusIcon = result.toolErrors ? '!' : '✓'
282
227
  const exitInfo = result.metadata?.timedOut
@@ -284,11 +229,13 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
284
229
  : result.metadata?.exitCode && result.metadata.exitCode !== 0
285
230
  ? ` - exit ${result.metadata.exitCode}`
286
231
  : ''
287
- logProgress(` ${statusIcon} (${result.timing.total}ms)${exitInfo}`, progress)
232
+ logProgress(` ${statusIcon} ${promptCase.id} (${result.timing.total}ms)${exitInfo}`, progress)
233
+
234
+ return result
288
235
  }
289
236
 
290
- logProgress('Done!', progress)
291
- return results
237
+ // Run with worker pool
238
+ return executePrompts(ctx, processPrompt)
292
239
  }
293
240
 
294
241
  // ============================================================================
@@ -312,6 +259,9 @@ export const capture = async (args: string[]): Promise<void> => {
312
259
  append: { type: 'boolean', default: false },
313
260
  grader: { type: 'string', short: 'g' },
314
261
  debug: { type: 'boolean', default: false },
262
+ stdin: { type: 'boolean', default: false },
263
+ concurrency: { type: 'string', short: 'j' },
264
+ 'workspace-dir': { type: 'string' },
315
265
  help: { type: 'boolean', short: 'h' },
316
266
  },
317
267
  allowPositionals: true,
@@ -320,6 +270,7 @@ export const capture = async (args: string[]): Promise<void> => {
320
270
  if (values.help) {
321
271
  console.log(`
322
272
  Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]
273
+ cat prompts.jsonl | agent-eval-harness capture --stdin --schema <schema.json> [options]
323
274
 
324
275
  Arguments:
325
276
  prompts.jsonl Input file with evaluation prompts
@@ -329,6 +280,9 @@ Options:
329
280
  -o, --output Output file (default: stdout)
330
281
  -c, --cwd Working directory for agent
331
282
  -t, --timeout Request timeout in ms (overrides schema default)
283
+ -j, --concurrency Number of concurrent workers (default: 1)
284
+ --stdin Read prompts from stdin (mutually exclusive with file arg)
285
+ --workspace-dir Base directory for per-prompt workspace isolation
332
286
  --progress Show progress to stderr
333
287
  --append Append to output file instead of overwriting
334
288
  -g, --grader Path to grader (.ts/.js module or executable script)
@@ -348,25 +302,55 @@ Graders:
348
302
  TS/JS modules must export a 'grade' function.
349
303
  Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
350
304
 
305
+ Parallelization:
306
+ Use -j/--concurrency to run multiple prompts in parallel.
307
+ Each prompt gets its own agent session for isolation.
308
+ Results are written as they complete (order may differ from input).
309
+
310
+ Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
311
+ at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
312
+ In memory-constrained environments (Docker, CI) this can cause OOM kills.
313
+ Use --stdin to pipe prompts for container-level orchestration.
314
+
315
+ Workspace Isolation:
316
+ Use --workspace-dir to create per-prompt directories.
317
+ Each prompt runs in {workspace-dir}/prompt-{id}/.
318
+ Useful for code generation tasks requiring filesystem isolation.
319
+
351
320
  Examples:
352
321
  # Basic capture with schema
353
322
  agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl
354
323
 
324
+ # Run 4 prompts in parallel
325
+ agent-eval-harness capture prompts.jsonl -s claude.json -j 4 -o results.jsonl
326
+
327
+ # With workspace isolation for code generation
328
+ agent-eval-harness capture prompts.jsonl -s claude.json -j 4 \\
329
+ --workspace-dir ./workspaces -o results.jsonl
330
+
355
331
  # With TypeScript grader
356
332
  agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl
357
333
 
358
334
  # With debug mode
359
335
  agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
360
336
 
361
- # With per-prompt timeout override (in prompts.jsonl):
362
- {"id": "slow-task", "input": "...", "timeout": 180000}
337
+ # Read prompts from stdin (container orchestration)
338
+ cat prompts.jsonl | agent-eval-harness capture --stdin -s claude.json -o results.jsonl
363
339
  `)
364
340
  return
365
341
  }
366
342
 
367
343
  const promptsPath = positionals[0]
368
- if (!promptsPath) {
369
- console.error('Error: prompts.jsonl path is required')
344
+ const useStdin = values.stdin ?? false
345
+
346
+ // Mutual exclusivity: --stdin and positional file
347
+ if (useStdin && promptsPath) {
348
+ console.error('Error: --stdin and prompts file argument are mutually exclusive')
349
+ process.exit(1)
350
+ }
351
+
352
+ if (!useStdin && !promptsPath) {
353
+ console.error('Error: prompts.jsonl path is required (or use --stdin)')
370
354
  process.exit(1)
371
355
  }
372
356
 
@@ -376,19 +360,23 @@ Examples:
376
360
  process.exit(1)
377
361
  }
378
362
 
379
- // Load grader if specified
380
- let grader: Grader | undefined
381
- if (values.grader) {
382
- try {
383
- grader = await loadGrader(values.grader)
384
- } catch (error) {
385
- console.error(`Error: ${error instanceof Error ? error.message : error}`)
363
+ // Read prompts from stdin if requested
364
+ let prompts: PromptCase[] | undefined
365
+ if (useStdin) {
366
+ const stdinPrompts = await readStdinPrompts()
367
+ if (!stdinPrompts || stdinPrompts.length === 0) {
368
+ console.error('Error: no prompts received on stdin')
386
369
  process.exit(1)
387
370
  }
371
+ prompts = stdinPrompts
388
372
  }
389
373
 
374
+ // Load grader if specified
375
+ const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
376
+
390
377
  await runCapture({
391
- promptsPath,
378
+ promptsPath: promptsPath ?? undefined,
379
+ prompts,
392
380
  schemaPath: values.schema,
393
381
  outputPath: values.output,
394
382
  cwd: values.cwd,
@@ -397,5 +385,7 @@ Examples:
397
385
  append: values.append ?? false,
398
386
  grader,
399
387
  debug: values.debug ?? false,
388
+ concurrency: parseConcurrency(values.concurrency),
389
+ workspaceDir: values['workspace-dir'],
400
390
  })
401
391
  }