@plaited/agent-eval-harness 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,13 +12,12 @@
12
12
  */
13
13
 
14
14
  import { parseArgs } from 'node:util'
15
- import { extractOutput, extractTrajectory, loadPrompts, logProgress, resolvePath, writeOutput } from '../core.ts'
16
- import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
15
+ import { createWorkspaceDir, extractOutput, extractTrajectory, logProgress, readStdinPrompts } from '../core.ts'
17
16
  import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
18
- import { createSessionManager } from '../headless/headless-session-manager.ts'
19
- import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
20
- import { loadGrader } from '../schemas/grader-loader.ts'
21
- import type { Grader, TrialEntry, TrialResult } from '../schemas.ts'
17
+ import { DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
18
+ import { loadGraderOrExit } from '../schemas/grader-loader.ts'
19
+ import type { PromptCase, TrialEntry, TrialResult } from '../schemas.ts'
20
+ import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
22
21
 
23
22
  // ============================================================================
24
23
  // Pass@k/Pass^k Calculation
@@ -74,27 +73,9 @@ export const calculatePassExpK = (passes: number, k: number): number => {
74
73
  // ============================================================================
75
74
 
76
75
  /** Configuration for trials command */
77
- export type TrialsConfig = {
78
- /** Path to prompts.jsonl file */
79
- promptsPath: string
80
- /** Path to agent schema JSON file */
81
- schemaPath: string
76
+ export type TrialsConfig = BaseExecutionConfig & {
82
77
  /** Number of trials per prompt */
83
78
  k: number
84
- /** Output file path */
85
- outputPath?: string
86
- /** Working directory for agent */
87
- cwd?: string
88
- /** Timeout per prompt in milliseconds (overrides schema default) */
89
- timeout?: number
90
- /** Show progress to stderr */
91
- progress?: boolean
92
- /** Append to output file */
93
- append?: boolean
94
- /** Optional grader function */
95
- grader?: Grader
96
- /** Enable debug mode */
97
- debug?: boolean
98
79
  }
99
80
 
100
81
  // ============================================================================
@@ -108,79 +89,38 @@ export type TrialsConfig = {
108
89
  * @returns Array of trial results
109
90
  */
110
91
  export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
111
- const {
112
- promptsPath,
113
- schemaPath,
114
- k,
115
- outputPath,
116
- cwd,
117
- timeout,
118
- progress = false,
119
- append = false,
120
- grader,
121
- debug = false,
122
- } = config
92
+ const { k } = config
93
+ const ctx = await prepareExecution(config)
94
+ const { schema, prompts, sessions, resolvedWorkspaceDir, defaultWorkingDir, progress, grader } = ctx
123
95
 
124
- // Load and validate schema
125
- const schemaFile = Bun.file(schemaPath)
126
- if (!(await schemaFile.exists())) {
127
- throw new Error(`Schema file not found: ${schemaPath}`)
96
+ // Log progress info
97
+ logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
98
+ logProgress(`Running ${k} trials per prompt (${prompts.length * k} total executions)`, progress)
99
+ logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
100
+ logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
101
+ if (ctx.concurrency > 1) {
102
+ logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
128
103
  }
129
-
130
- let schema: HeadlessAdapterConfig
131
- try {
132
- const rawSchema = await schemaFile.json()
133
- schema = parseHeadlessConfig(rawSchema)
134
- } catch (error) {
135
- throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
104
+ if (resolvedWorkspaceDir) {
105
+ logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
136
106
  }
137
-
138
- // Load prompts
139
- const prompts = await loadPrompts(promptsPath)
140
-
141
- // Resolve output path
142
- const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
143
-
144
- // Determine effective timeout (CLI flag > schema default > harness default)
145
- const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
146
- const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
147
-
148
- // Log progress info
149
- logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
150
- logProgress(`Running ${k} trials per prompt`, progress)
151
- logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
152
- logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
153
107
  if (grader) {
154
108
  logProgress('Grader: enabled (will compute pass@k metrics)', progress)
155
109
  }
156
110
 
157
- // Create session manager with schema
158
- const sessions = createSessionManager({
159
- schema,
160
- timeout: effectiveTimeout,
161
- verbose: progress,
162
- debug,
163
- })
164
-
165
- // Clear output file if not appending
166
- if (resolvedOutputPath && !append) {
167
- await Bun.write(resolvedOutputPath, '')
168
- }
169
-
170
- const workingDir = cwd ?? process.cwd()
171
- const results: TrialResult[] = []
172
- let isFirstOutput = true
173
-
174
- // Run evaluations
175
- for (let i = 0; i < prompts.length; i++) {
176
- const promptCase = prompts[i]
177
- if (!promptCase) continue
178
-
179
- logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
111
+ // Process all trials for a single prompt
112
+ const processPromptTrials = async (promptCase: (typeof prompts)[number], index: number): Promise<TrialResult> => {
113
+ logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
180
114
 
181
115
  const trialEntries: TrialEntry[] = []
182
116
 
183
117
  for (let trialNum = 1; trialNum <= k; trialNum++) {
118
+ // Determine working directory (per-prompt workspace or default)
119
+ // For trials, include trial number in workspace path for isolation
120
+ const workingDir = resolvedWorkspaceDir
121
+ ? await createWorkspaceDir(resolvedWorkspaceDir, `${promptCase.id}-trial-${trialNum}`)
122
+ : defaultWorkingDir
123
+
184
124
  // Create fresh session for each trial
185
125
  const session = await sessions.create(workingDir)
186
126
  const startTime = Date.now()
@@ -190,8 +130,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
190
130
  const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
191
131
  const allUpdates: ParsedUpdate[] = []
192
132
 
193
- // TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
194
-
195
133
  // Execute each turn sequentially
196
134
  for (const turnInput of inputs) {
197
135
  const turnResult = await sessions.prompt(session.id, turnInput)
@@ -223,7 +161,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
223
161
  entry.score = graderResult.score
224
162
  entry.reasoning = graderResult.reasoning
225
163
 
226
- // Merge outcome from grader if present
227
164
  if (graderResult.outcome) {
228
165
  entry.outcome = graderResult.outcome
229
166
  }
@@ -234,9 +171,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
234
171
  ` Trial ${trialNum}/${k}: ${entry.pass !== undefined ? (entry.pass ? '✓' : '✗') : '?'}`,
235
172
  progress,
236
173
  )
237
-
238
- // Clean up session
239
- sessions.destroy(session.id)
240
174
  } catch (error) {
241
175
  const endTime = Date.now()
242
176
  const message = error instanceof Error ? error.message : String(error)
@@ -250,6 +184,9 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
250
184
  reasoning: `Error: ${message}`,
251
185
  })
252
186
  logProgress(` Trial ${trialNum}/${k}: ! (error)`, progress)
187
+ } finally {
188
+ // Always clean up session
189
+ sessions.destroy(session.id)
253
190
  }
254
191
  }
255
192
 
@@ -260,6 +197,11 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
260
197
  ...(promptCase.hint && { hint: promptCase.hint }),
261
198
  k,
262
199
  trials: trialEntries,
200
+ metadata: {
201
+ ...promptCase.metadata,
202
+ agent: schema.name,
203
+ ...(resolvedWorkspaceDir && { workspaceDir: resolvedWorkspaceDir }),
204
+ },
263
205
  }
264
206
 
265
207
  // Calculate metrics if grader was used
@@ -270,23 +212,21 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
270
212
  result.passExpK = calculatePassExpK(passes, k)
271
213
  }
272
214
 
273
- results.push(result)
274
-
275
- // Write result immediately
276
- const formatted = JSON.stringify(result)
277
- await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
278
- isFirstOutput = false
215
+ // Write result immediately (coordinated via mutex for concurrent writes)
216
+ await ctx.writeResult(result)
279
217
 
280
218
  if (grader) {
281
219
  logProgress(
282
- ` → passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
220
+ ` → ${promptCase.id}: passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
283
221
  progress,
284
222
  )
285
223
  }
224
+
225
+ return result
286
226
  }
287
227
 
288
- logProgress('Done!', progress)
289
- return results
228
+ // Run with worker pool (parallelizes across prompts, trials for each prompt run sequentially)
229
+ return executePrompts(ctx, processPromptTrials)
290
230
  }
291
231
 
292
232
  // ============================================================================
@@ -311,6 +251,9 @@ export const trials = async (args: string[]): Promise<void> => {
311
251
  append: { type: 'boolean', default: false },
312
252
  grader: { type: 'string', short: 'g' },
313
253
  debug: { type: 'boolean', default: false },
254
+ stdin: { type: 'boolean', default: false },
255
+ concurrency: { type: 'string', short: 'j' },
256
+ 'workspace-dir': { type: 'string' },
314
257
  help: { type: 'boolean', short: 'h' },
315
258
  },
316
259
  allowPositionals: true,
@@ -319,6 +262,7 @@ export const trials = async (args: string[]): Promise<void> => {
319
262
  if (values.help) {
320
263
  console.log(`
321
264
  Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]
265
+ cat prompts.jsonl | agent-eval-harness trials --stdin --schema <schema.json> [options]
322
266
 
323
267
  Arguments:
324
268
  prompts.jsonl Input file with evaluation prompts
@@ -329,6 +273,9 @@ Options:
329
273
  -k Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
330
274
  -c, --cwd Working directory for agent
331
275
  -t, --timeout Request timeout in ms (overrides schema default)
276
+ -j, --concurrency Number of concurrent workers (default: 1)
277
+ --stdin Read prompts from stdin (mutually exclusive with file arg)
278
+ --workspace-dir Base directory for per-trial workspace isolation
332
279
  --progress Show progress to stderr
333
280
  --append Append to output file
334
281
  -g, --grader Path to grader (.ts/.js module or executable script)
@@ -343,22 +290,52 @@ Graders:
343
290
  TS/JS modules must export a 'grade' function.
344
291
  Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
345
292
 
293
+ Parallelization:
294
+ Use -j/--concurrency to run multiple prompts' trials in parallel.
295
+ Each prompt's k trials still run sequentially (required for aggregation).
296
+ With 151 prompts and -j 4, you get 4 prompts running trials concurrently.
297
+
298
+ Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
299
+ at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
300
+ In memory-constrained environments (Docker, CI) this can cause OOM kills.
301
+ Use --stdin to pipe prompts for container-level orchestration.
302
+
303
+ Workspace Isolation:
304
+ Use --workspace-dir to create per-trial directories.
305
+ Each trial runs in {workspace-dir}/prompt-{id}-trial-{n}/.
306
+ Useful for code generation tasks requiring filesystem isolation.
307
+
346
308
  Examples:
347
- # Capture only
309
+ # Basic trials
348
310
  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -o trials.jsonl
349
311
 
312
+ # Run 4 prompts' trials in parallel (4x faster for 151 prompts)
313
+ agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -j 4 -o trials.jsonl
314
+
315
+ # With workspace isolation for code generation
316
+ agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -j 4 \\
317
+ --workspace-dir ./workspaces -o trials.jsonl
318
+
350
319
  # With TypeScript grader
351
320
  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl
352
321
 
353
- # With Python grader
354
- agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.py -o trials.jsonl
322
+ # Read prompts from stdin (container orchestration)
323
+ cat prompts.jsonl | agent-eval-harness trials --stdin -s claude.json -k 5 -o trials.jsonl
355
324
  `)
356
325
  return
357
326
  }
358
327
 
359
328
  const promptsPath = positionals[0]
360
- if (!promptsPath) {
361
- console.error('Error: prompts.jsonl path is required')
329
+ const useStdin = values.stdin ?? false
330
+
331
+ // Mutual exclusivity: --stdin and positional file
332
+ if (useStdin && promptsPath) {
333
+ console.error('Error: --stdin and prompts file argument are mutually exclusive')
334
+ process.exit(1)
335
+ }
336
+
337
+ if (!useStdin && !promptsPath) {
338
+ console.error('Error: prompts.jsonl path is required (or use --stdin)')
362
339
  process.exit(1)
363
340
  }
364
341
 
@@ -368,19 +345,23 @@ Examples:
368
345
  process.exit(1)
369
346
  }
370
347
 
371
- // Load grader if specified
372
- let grader: Grader | undefined
373
- if (values.grader) {
374
- try {
375
- grader = await loadGrader(values.grader)
376
- } catch (error) {
377
- console.error(`Error: ${error instanceof Error ? error.message : error}`)
348
+ // Read prompts from stdin if requested
349
+ let prompts: PromptCase[] | undefined
350
+ if (useStdin) {
351
+ const stdinPrompts = await readStdinPrompts()
352
+ if (!stdinPrompts || stdinPrompts.length === 0) {
353
+ console.error('Error: no prompts received on stdin')
378
354
  process.exit(1)
379
355
  }
356
+ prompts = stdinPrompts
380
357
  }
381
358
 
359
+ // Load grader if specified
360
+ const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
361
+
382
362
  await runTrials({
383
- promptsPath,
363
+ promptsPath: promptsPath ?? undefined,
364
+ prompts,
384
365
  schemaPath: values.schema,
385
366
  k: Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10),
386
367
  outputPath: values.output,
@@ -390,5 +371,7 @@ Examples:
390
371
  append: values.append ?? false,
391
372
  grader,
392
373
  debug: values.debug ?? false,
374
+ concurrency: parseConcurrency(values.concurrency),
375
+ workspaceDir: values['workspace-dir'],
393
376
  })
394
377
  }
@@ -9,9 +9,9 @@
9
9
  */
10
10
 
11
11
  import { parseArgs } from 'node:util'
12
- import { loadGrader } from '../schemas/grader-loader.ts'
12
+ import { loadPrompts, resolvePath } from '../core.ts'
13
+ import { loadGraderOrExit } from '../schemas/grader-loader.ts'
13
14
  import type { Grader, ValidationResult } from '../schemas.ts'
14
- import { loadPrompts } from './capture.ts'
15
15
 
16
16
  // ============================================================================
17
17
  // Types
@@ -27,16 +27,6 @@ export type ValidateRefsConfig = {
27
27
  grader: Grader
28
28
  }
29
29
 
30
- // ============================================================================
31
- // Helpers
32
- // ============================================================================
33
-
34
- /** Resolve path relative to process.cwd() */
35
- const resolvePath = (path: string): string => {
36
- if (path.startsWith('/')) return path
37
- return `${process.cwd()}/${path}`
38
- }
39
-
40
30
  // ============================================================================
41
31
  // Validate-Refs Implementation
42
32
  // ============================================================================
@@ -171,13 +161,7 @@ Examples:
171
161
  }
172
162
 
173
163
  // Load grader
174
- let grader: Grader
175
- try {
176
- grader = await loadGrader(values.grader)
177
- } catch (error) {
178
- console.error(`Error: ${error instanceof Error ? error.message : error}`)
179
- process.exit(1)
180
- }
164
+ const grader = await loadGraderOrExit(values.grader)
181
165
 
182
166
  await runValidateRefs({
183
167
  promptsPath,
package/src/core/core.ts CHANGED
@@ -11,9 +11,25 @@
11
11
  */
12
12
 
13
13
  // Loading utilities
14
- export { buildResultsIndex, countLines, loadJsonl, loadPrompts, loadResults, streamResults } from './loading.ts'
14
+ export {
15
+ buildResultsIndex,
16
+ countLines,
17
+ loadJsonl,
18
+ loadPrompts,
19
+ loadResults,
20
+ readStdinPrompts,
21
+ streamResults,
22
+ } from './loading.ts'
15
23
  // Output utilities
16
24
  export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
25
+ // Native streaming utilities
26
+ export {
27
+ countLinesStreaming,
28
+ streamJsonl,
29
+ streamPrompts,
30
+ streamResultsNative,
31
+ streamTrialResults,
32
+ } from './streaming.ts'
17
33
  // Trajectory utilities
18
34
  export {
19
35
  detectTrajectoryRichness,
@@ -23,3 +39,13 @@ export {
23
39
  extractTrajectory,
24
40
  hasToolErrors,
25
41
  } from './trajectory.ts'
42
+ // Worker pool utilities
43
+ export {
44
+ createWorkspaceDir,
45
+ createWriteMutex,
46
+ type ProgressCallback,
47
+ runWorkerPool,
48
+ type WorkerPoolOptions,
49
+ type WorkerPoolResult,
50
+ type WriteMutex,
51
+ } from './worker-pool.ts'
@@ -39,6 +39,44 @@ export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
39
39
  })
40
40
  }
41
41
 
42
+ /**
43
+ * Read prompts from stdin as JSONL.
44
+ *
45
+ * @remarks
46
+ * Reads all data from stdin, parses each line as JSON, and validates against
47
+ * PromptCaseSchema. Returns null when stdin is a TTY (no piped input) or when the piped input is empty or whitespace-only.
48
+ * Uses chunked Buffer reads matching the pattern in pipeline/run.ts.
49
+ *
50
+ * @returns Parsed and validated prompt cases, or null if stdin is a TTY or the piped input is empty
51
+ * @throws Error if any line is invalid JSON or fails schema validation
52
+ *
53
+ * @public
54
+ */
55
+ export const readStdinPrompts = async (): Promise<PromptCase[] | null> => {
56
+ if (process.stdin.isTTY) {
57
+ return null
58
+ }
59
+
60
+ const chunks: Buffer[] = []
61
+ for await (const chunk of process.stdin) {
62
+ chunks.push(chunk)
63
+ }
64
+
65
+ const content = Buffer.concat(chunks).toString('utf-8').trim()
66
+ if (!content) return null
67
+
68
+ return content
69
+ .split('\n')
70
+ .filter(Boolean)
71
+ .map((line, index) => {
72
+ try {
73
+ return PromptCaseSchema.parse(JSON.parse(line))
74
+ } catch (error) {
75
+ throw new Error(`Invalid stdin prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
76
+ }
77
+ })
78
+ }
79
+
42
80
  /**
43
81
  * Load capture results from a JSONL file.
44
82
  *
@@ -99,12 +137,21 @@ export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
99
137
  // Streaming Loading
100
138
  // ============================================================================
101
139
 
140
+ // Re-export native streaming functions for backward compatibility
141
+ export {
142
+ countLinesStreaming,
143
+ streamJsonl,
144
+ streamPrompts,
145
+ streamResultsNative,
146
+ streamTrialResults,
147
+ } from './streaming.ts'
148
+
102
149
  /**
103
150
  * Stream capture results from a JSONL file.
104
151
  *
105
152
  * @remarks
106
153
  * Memory-efficient alternative to loadResults for large files.
107
- * Yields results one at a time using an async generator.
154
+ * Uses native streaming via Bun.file().stream() for O(1) memory usage.
108
155
  *
109
156
  * @param path - Path to the results.jsonl file
110
157
  * @yields Parsed and validated capture results
@@ -113,20 +160,8 @@ export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
113
160
  * @public
114
161
  */
115
162
  export async function* streamResults(path: string): AsyncGenerator<CaptureResult, void, unknown> {
116
- const file = Bun.file(path)
117
- const text = await file.text()
118
- const lines = text.split('\n')
119
-
120
- for (let i = 0; i < lines.length; i++) {
121
- const line = lines[i]?.trim()
122
- if (!line) continue
123
-
124
- try {
125
- yield CaptureResultSchema.parse(JSON.parse(line))
126
- } catch (error) {
127
- throw new Error(`Invalid result at line ${i + 1}: ${error instanceof Error ? error.message : error}`)
128
- }
129
- }
163
+ const { streamResultsNative } = await import('./streaming.ts')
164
+ yield* streamResultsNative(path)
130
165
  }
131
166
 
132
167
  /**
@@ -159,7 +194,7 @@ export const buildResultsIndex = async (path: string): Promise<Map<string, Captu
159
194
  *
160
195
  * @remarks
161
196
  * Useful for detecting large files that should use streaming mode.
162
- * Uses byte-level scanning for efficiency.
197
+ * Uses native streaming for O(1) memory usage.
163
198
  *
164
199
  * @param path - Path to the JSONL file
165
200
  * @returns Number of non-empty lines
@@ -167,7 +202,6 @@ export const buildResultsIndex = async (path: string): Promise<Map<string, Captu
167
202
  * @public
168
203
  */
169
204
  export const countLines = async (path: string): Promise<number> => {
170
- const file = Bun.file(path)
171
- const text = await file.text()
172
- return text.split('\n').filter((line) => line.trim()).length
205
+ const { countLinesStreaming } = await import('./streaming.ts')
206
+ return countLinesStreaming(path)
173
207
  }