@plaited/agent-eval-harness 0.9.0 → 0.10.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -58,11 +58,21 @@ bunx @plaited/agent-eval-harness capture prompts.jsonl \
58
58
  --schema ./schemas/claude-headless.json \
59
59
  -o results.jsonl
60
60
 
61
+ # Parallel capture (4x faster with 4 workers)
62
+ bunx @plaited/agent-eval-harness capture prompts.jsonl \
63
+ --schema ./schemas/claude-headless.json \
64
+ -j 4 -o results.jsonl
65
+
61
66
  # Run trials for pass@k analysis with debug mode
62
67
  bunx @plaited/agent-eval-harness trials prompts.jsonl \
63
68
  --schema ./schemas/claude-headless.json \
64
69
  -k 5 --grader ./grader.ts --debug
65
70
 
71
+ # Parallel trials (4 prompts running trials concurrently)
72
+ bunx @plaited/agent-eval-harness trials prompts.jsonl \
73
+ --schema ./schemas/claude-headless.json \
74
+ -k 5 -j 4 --workspace-dir ./workspaces -o trials.jsonl
75
+
66
76
  # Summarize results
67
77
  bunx @plaited/agent-eval-harness summarize results.jsonl -o summary.jsonl
68
78
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.9.0",
3
+ "version": "0.10.0",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -11,8 +11,11 @@
11
11
  * @packageDocumentation
12
12
  */
13
13
 
14
+ import { mkdir } from 'node:fs/promises'
14
15
  import { parseArgs } from 'node:util'
15
16
  import {
17
+ createWorkspaceDir,
18
+ createWriteMutex,
16
19
  detectTrajectoryRichness,
17
20
  extractOutput,
18
21
  extractTrajectory,
@@ -21,6 +24,7 @@ import {
21
24
  loadPrompts,
22
25
  logProgress,
23
26
  resolvePath,
27
+ runWorkerPool,
24
28
  writeOutput,
25
29
  } from '../core.ts'
26
30
  import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
@@ -70,6 +74,10 @@ export type CaptureConfig = {
70
74
  grader?: Grader
71
75
  /** Enable debug mode for detailed output */
72
76
  debug?: boolean
77
+ /** Number of concurrent workers (default: 1 for sequential) */
78
+ concurrency?: number
79
+ /** Base directory for per-prompt workspace isolation */
80
+ workspaceDir?: string
73
81
  }
74
82
 
75
83
  // ============================================================================
@@ -97,6 +105,8 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
97
105
  append = false,
98
106
  grader,
99
107
  debug = false,
108
+ concurrency = 1,
109
+ workspaceDir,
100
110
  } = config
101
111
 
102
112
  // Load and validate schema
@@ -116,8 +126,9 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
116
126
  // Load prompts
117
127
  const prompts = await loadPrompts(promptsPath)
118
128
 
119
- // Resolve output path
129
+ // Resolve paths
120
130
  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
131
+ const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
121
132
 
122
133
  // Determine effective timeout (CLI flag > schema default > harness default)
123
134
  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
@@ -127,6 +138,12 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
127
138
  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
128
139
  logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
129
140
  logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
141
+ if (concurrency > 1) {
142
+ logProgress(`Concurrency: ${concurrency} workers`, progress)
143
+ }
144
+ if (resolvedWorkspaceDir) {
145
+ logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
146
+ }
130
147
  if (resolvedOutputPath) {
131
148
  logProgress(`Output: ${resolvedOutputPath}`, progress)
132
149
  }
@@ -147,24 +164,36 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
147
164
  await Bun.write(resolvedOutputPath, '')
148
165
  }
149
166
 
150
- const workingDir = cwd ?? process.cwd()
151
- const results: CaptureResult[] = []
167
+ // Create workspace base directory if specified
168
+ // Uses fs.mkdir instead of shell to prevent command injection
169
+ if (resolvedWorkspaceDir) {
170
+ await mkdir(resolvedWorkspaceDir, { recursive: true })
171
+ }
172
+
173
+ const defaultWorkingDir = cwd ?? process.cwd()
174
+
175
+ // Create write mutex for coordinating JSONL output
176
+ const writeMutex = createWriteMutex()
152
177
  let isFirstOutput = true
153
178
 
154
- // Run evaluations sequentially - fresh session per entry
155
- for (let i = 0; i < prompts.length; i++) {
156
- const promptCase = prompts[i]
157
- if (!promptCase) continue
179
+ // Process a single prompt (used by worker pool)
180
+ const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise<CaptureResult> => {
181
+ // Determine working directory (per-prompt workspace or default)
182
+ const workingDir = resolvedWorkspaceDir
183
+ ? await createWorkspaceDir(resolvedWorkspaceDir, promptCase.id)
184
+ : defaultWorkingDir
158
185
 
159
- logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
186
+ logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
160
187
 
161
188
  const startTime = Date.now()
162
189
  let result: CaptureResult
190
+ let sessionId: string | undefined
163
191
 
164
192
  try {
165
193
  // Create fresh session for each entry (ensures isolation)
166
194
  const sessionStart = Date.now()
167
195
  const session = await sessions.create(workingDir)
196
+ sessionId = session.id
168
197
  const sessionCreation = Date.now() - sessionStart
169
198
  logProgress(` Session: ${session.id}`, progress)
170
199
 
@@ -177,9 +206,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
177
206
  let lastExitInfo: ProcessExitInfo | undefined
178
207
  let lastOutput = ''
179
208
 
180
- // TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
181
- // The session manager would need to accept timeout per-call to support this
182
-
183
209
  // Execute each turn sequentially in the same session
184
210
  for (const turnInput of inputs) {
185
211
  const turnResult: PromptResult = await sessions.prompt(session.id, turnInput)
@@ -198,7 +224,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
198
224
 
199
225
  result = {
200
226
  id: promptCase.id,
201
- input: promptCase.input, // Preserve original (string or array)
227
+ input: promptCase.input,
202
228
  output,
203
229
  ...(promptCase.hint && { hint: promptCase.hint }),
204
230
  trajectory,
@@ -207,6 +233,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
207
233
  agent: schema.name,
208
234
  trajectoryRichness,
209
235
  turnCount,
236
+ ...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
210
237
  ...(lastExitInfo && {
211
238
  exitCode: lastExitInfo.exitCode,
212
239
  signal: lastExitInfo.signal,
@@ -236,14 +263,10 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
236
263
 
237
264
  result.score = graderResult
238
265
 
239
- // Merge outcome from grader if present
240
266
  if (graderResult.outcome) {
241
267
  result.outcome = graderResult.outcome
242
268
  }
243
269
  }
244
-
245
- // Clean up session
246
- sessions.destroy(session.id)
247
270
  } catch (error) {
248
271
  const endTime = Date.now()
249
272
  const message = error instanceof Error ? error.message : String(error)
@@ -259,6 +282,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
259
282
  agent: schema.name,
260
283
  trajectoryRichness: 'minimal' as TrajectoryRichness,
261
284
  turnCount: inputs.length,
285
+ ...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
262
286
  },
263
287
  timing: {
264
288
  start: startTime,
@@ -269,14 +293,19 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
269
293
  toolErrors: true,
270
294
  errors: [message],
271
295
  }
296
+ } finally {
297
+ // Always clean up session if it was created
298
+ if (sessionId) {
299
+ sessions.destroy(sessionId)
300
+ }
272
301
  }
273
302
 
274
- results.push(result)
275
-
276
- // Write result immediately
277
- const formatted = JSON.stringify(result)
278
- await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
279
- isFirstOutput = false
303
+ // Write result immediately (coordinated via mutex for concurrent writes)
304
+ await writeMutex.write(async () => {
305
+ const formatted = JSON.stringify(result)
306
+ await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
307
+ isFirstOutput = false
308
+ })
280
309
 
281
310
  const statusIcon = result.toolErrors ? '!' : '✓'
282
311
  const exitInfo = result.metadata?.timedOut
@@ -284,7 +313,22 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
284
313
  : result.metadata?.exitCode && result.metadata.exitCode !== 0
285
314
  ? ` - exit ${result.metadata.exitCode}`
286
315
  : ''
287
- logProgress(` ${statusIcon} (${result.timing.total}ms)${exitInfo}`, progress)
316
+ logProgress(` ${statusIcon} ${promptCase.id} (${result.timing.total}ms)${exitInfo}`, progress)
317
+
318
+ return result
319
+ }
320
+
321
+ // Run with worker pool
322
+ const { results, errors } = await runWorkerPool(prompts, processPrompt, {
323
+ concurrency,
324
+ onProgress: (completed, total) => {
325
+ logProgress(`Progress: ${completed}/${total} prompts completed`, progress)
326
+ },
327
+ })
328
+
329
+ // Log any errors that occurred
330
+ if (errors.length > 0) {
331
+ logProgress(`Completed with ${errors.length} error(s)`, progress)
288
332
  }
289
333
 
290
334
  logProgress('Done!', progress)
@@ -312,6 +356,8 @@ export const capture = async (args: string[]): Promise<void> => {
312
356
  append: { type: 'boolean', default: false },
313
357
  grader: { type: 'string', short: 'g' },
314
358
  debug: { type: 'boolean', default: false },
359
+ concurrency: { type: 'string', short: 'j' },
360
+ 'workspace-dir': { type: 'string' },
315
361
  help: { type: 'boolean', short: 'h' },
316
362
  },
317
363
  allowPositionals: true,
@@ -329,6 +375,8 @@ Options:
329
375
  -o, --output Output file (default: stdout)
330
376
  -c, --cwd Working directory for agent
331
377
  -t, --timeout Request timeout in ms (overrides schema default)
378
+ -j, --concurrency Number of concurrent workers (default: 1)
379
+ --workspace-dir Base directory for per-prompt workspace isolation
332
380
  --progress Show progress to stderr
333
381
  --append Append to output file instead of overwriting
334
382
  -g, --grader Path to grader (.ts/.js module or executable script)
@@ -348,18 +396,32 @@ Graders:
348
396
  TS/JS modules must export a 'grade' function.
349
397
  Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
350
398
 
399
+ Parallelization:
400
+ Use -j/--concurrency to run multiple prompts in parallel.
401
+ Each prompt gets its own agent session for isolation.
402
+ Results are written as they complete (order may differ from input).
403
+
404
+ Workspace Isolation:
405
+ Use --workspace-dir to create per-prompt directories.
406
+ Each prompt runs in {workspace-dir}/prompt-{id}/.
407
+ Useful for code generation tasks requiring filesystem isolation.
408
+
351
409
  Examples:
352
410
  # Basic capture with schema
353
411
  agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl
354
412
 
413
+ # Run 4 prompts in parallel
414
+ agent-eval-harness capture prompts.jsonl -s claude.json -j 4 -o results.jsonl
415
+
416
+ # With workspace isolation for code generation
417
+ agent-eval-harness capture prompts.jsonl -s claude.json -j 4 \\
418
+ --workspace-dir ./workspaces -o results.jsonl
419
+
355
420
  # With TypeScript grader
356
421
  agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl
357
422
 
358
423
  # With debug mode
359
424
  agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
360
-
361
- # With per-prompt timeout override (in prompts.jsonl):
362
- {"id": "slow-task", "input": "...", "timeout": 180000}
363
425
  `)
364
426
  return
365
427
  }
@@ -387,6 +449,17 @@ Examples:
387
449
  }
388
450
  }
389
451
 
452
+ // Validate and parse concurrency
453
+ let concurrency = 1
454
+ if (values.concurrency) {
455
+ const parsed = Number.parseInt(values.concurrency, 10)
456
+ if (Number.isNaN(parsed) || parsed < 1) {
457
+ console.error('Error: --concurrency must be a positive integer')
458
+ process.exit(1)
459
+ }
460
+ concurrency = parsed
461
+ }
462
+
390
463
  await runCapture({
391
464
  promptsPath,
392
465
  schemaPath: values.schema,
@@ -397,5 +470,7 @@ Examples:
397
470
  append: values.append ?? false,
398
471
  grader,
399
472
  debug: values.debug ?? false,
473
+ concurrency,
474
+ workspaceDir: values['workspace-dir'],
400
475
  })
401
476
  }
@@ -117,10 +117,14 @@ describe('runCapture configuration', () => {
117
117
  progress: true,
118
118
  append: false,
119
119
  debug: false,
120
+ concurrency: 4,
121
+ workspaceDir: '/tmp/workspaces',
120
122
  }
121
123
 
122
124
  expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
123
125
  expect(config.schemaPath).toBe('./schemas/claude-headless.json')
126
+ expect(config.concurrency).toBe(4)
127
+ expect(config.workspaceDir).toBe('/tmp/workspaces')
124
128
  })
125
129
 
126
130
  test('CaptureConfig allows minimal configuration', () => {
@@ -135,6 +139,8 @@ describe('runCapture configuration', () => {
135
139
  expect(config.progress).toBeUndefined()
136
140
  expect(config.append).toBeUndefined()
137
141
  expect(config.grader).toBeUndefined()
142
+ expect(config.concurrency).toBeUndefined()
143
+ expect(config.workspaceDir).toBeUndefined()
138
144
  })
139
145
  })
140
146
 
@@ -160,6 +166,8 @@ describe('capture CLI', () => {
160
166
  expect(stdout).toContain('--progress')
161
167
  expect(stdout).toContain('-g, --grader')
162
168
  expect(stdout).toContain('-s, --schema')
169
+ expect(stdout).toContain('-j, --concurrency')
170
+ expect(stdout).toContain('--workspace-dir')
163
171
  })
164
172
 
165
173
  test('shows error for missing prompts file argument', async () => {
@@ -187,4 +195,53 @@ describe('capture CLI', () => {
187
195
  expect(exitCode).not.toBe(0)
188
196
  expect(stderr).toContain('--schema is required')
189
197
  })
198
+
199
+ test('shows error for invalid concurrency value', async () => {
200
+ const proc = Bun.spawn(
201
+ ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
202
+ {
203
+ stdout: 'pipe',
204
+ stderr: 'pipe',
205
+ },
206
+ )
207
+
208
+ const stderr = await new Response(proc.stderr).text()
209
+ const exitCode = await proc.exited
210
+
211
+ expect(exitCode).not.toBe(0)
212
+ expect(stderr).toContain('--concurrency must be a positive integer')
213
+ })
214
+
215
+ test('shows error for zero concurrency', async () => {
216
+ const proc = Bun.spawn(
217
+ ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
218
+ {
219
+ stdout: 'pipe',
220
+ stderr: 'pipe',
221
+ },
222
+ )
223
+
224
+ const stderr = await new Response(proc.stderr).text()
225
+ const exitCode = await proc.exited
226
+
227
+ expect(exitCode).not.toBe(0)
228
+ expect(stderr).toContain('--concurrency must be a positive integer')
229
+ })
230
+
231
+ test('shows error for negative concurrency', async () => {
232
+ // Note: Using --concurrency=-1 format because -j -1 is ambiguous to parseArgs
233
+ const proc = Bun.spawn(
234
+ ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '--concurrency=-1'],
235
+ {
236
+ stdout: 'pipe',
237
+ stderr: 'pipe',
238
+ },
239
+ )
240
+
241
+ const stderr = await new Response(proc.stderr).text()
242
+ const exitCode = await proc.exited
243
+
244
+ expect(exitCode).not.toBe(0)
245
+ expect(stderr).toContain('--concurrency must be a positive integer')
246
+ })
190
247
  })
@@ -17,11 +17,15 @@ describe('TrialsConfig configuration', () => {
17
17
  progress: true,
18
18
  append: false,
19
19
  debug: false,
20
+ concurrency: 4,
21
+ workspaceDir: '/tmp/workspaces',
20
22
  }
21
23
 
22
24
  expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
23
25
  expect(config.schemaPath).toBe('./schemas/claude-headless.json')
24
26
  expect(config.k).toBe(5)
27
+ expect(config.concurrency).toBe(4)
28
+ expect(config.workspaceDir).toBe('/tmp/workspaces')
25
29
  })
26
30
 
27
31
  test('TrialsConfig allows minimal configuration', () => {
@@ -37,6 +41,8 @@ describe('TrialsConfig configuration', () => {
37
41
  expect(config.progress).toBeUndefined()
38
42
  expect(config.append).toBeUndefined()
39
43
  expect(config.grader).toBeUndefined()
44
+ expect(config.concurrency).toBeUndefined()
45
+ expect(config.workspaceDir).toBeUndefined()
40
46
  })
41
47
  })
42
48
 
@@ -64,6 +70,8 @@ describe('trials CLI', () => {
64
70
  expect(stdout).toContain('-g, --grader')
65
71
  expect(stdout).toContain('-s, --schema')
66
72
  expect(stdout).toContain('pass@k')
73
+ expect(stdout).toContain('-j, --concurrency')
74
+ expect(stdout).toContain('--workspace-dir')
67
75
  })
68
76
 
69
77
  test('shows error for missing prompts file argument', async () => {
@@ -91,6 +99,38 @@ describe('trials CLI', () => {
91
99
  expect(exitCode).not.toBe(0)
92
100
  expect(stderr).toContain('--schema is required')
93
101
  })
102
+
103
+ test('shows error for invalid concurrency value', async () => {
104
+ const proc = Bun.spawn(
105
+ ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
106
+ {
107
+ stdout: 'pipe',
108
+ stderr: 'pipe',
109
+ },
110
+ )
111
+
112
+ const stderr = await new Response(proc.stderr).text()
113
+ const exitCode = await proc.exited
114
+
115
+ expect(exitCode).not.toBe(0)
116
+ expect(stderr).toContain('--concurrency must be a positive integer')
117
+ })
118
+
119
+ test('shows error for zero concurrency', async () => {
120
+ const proc = Bun.spawn(
121
+ ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
122
+ {
123
+ stdout: 'pipe',
124
+ stderr: 'pipe',
125
+ },
126
+ )
127
+
128
+ const stderr = await new Response(proc.stderr).text()
129
+ const exitCode = await proc.exited
130
+
131
+ expect(exitCode).not.toBe(0)
132
+ expect(stderr).toContain('--concurrency must be a positive integer')
133
+ })
94
134
  })
95
135
 
96
136
  // ============================================================================