npm - @plaited/agent-eval-harness - Versions diffs - 0.9.0 → 0.11.0 - Mend

@plaited/agent-eval-harness 0.9.0 → 0.11.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/README.md +10 -0
package/package.json +1 -1
package/src/commands/balance.ts +1 -11
package/src/commands/calibrate.ts +2 -10
package/src/commands/capture.ts +104 -114
package/src/commands/execution.ts +245 -0
package/src/commands/tests/capture-cli.spec.ts +84 -0
package/src/commands/tests/trials-cli.spec.ts +68 -0
package/src/commands/trials.ts +98 -115
package/src/commands/validate-refs.ts +3 -19
package/src/core/core.ts +27 -1
package/src/core/loading.ts +53 -19
package/src/core/streaming.ts +172 -0
package/src/core/tests/streaming.spec.ts +399 -0
package/src/core/tests/worker-pool.spec.ts +377 -0
package/src/core/worker-pool.ts +220 -0
package/src/core.ts +15 -0
package/src/schemas/grader-loader.ts +23 -6
package/src/schemas/schemas-cli.ts +1 -6
package/src/schemas/schemas.ts +2 -0
package/src/schemas.ts +1 -1

package/README.md CHANGED Viewed

@@ -58,11 +58,21 @@ bunx @plaited/agent-eval-harness capture prompts.jsonl \
   --schema ./schemas/claude-headless.json \
   -o results.jsonl
+# Parallel capture (4x faster with 4 workers)
+bunx @plaited/agent-eval-harness capture prompts.jsonl \
+  --schema ./schemas/claude-headless.json \
+  -j 4 -o results.jsonl
 # Run trials for pass@k analysis with debug mode
 bunx @plaited/agent-eval-harness trials prompts.jsonl \
   --schema ./schemas/claude-headless.json \
   -k 5 --grader ./grader.ts --debug
+# Parallel trials (4 prompts running trials concurrently)
+bunx @plaited/agent-eval-harness trials prompts.jsonl \
+  --schema ./schemas/claude-headless.json \
+  -k 5 -j 4 --workspace-dir ./workspaces -o trials.jsonl
 # Summarize results
 bunx @plaited/agent-eval-harness summarize results.jsonl -o summary.jsonl

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.9.0",
+  "version": "0.11.0",
   "description": "CLI tool for capturing agent trajectories from headless CLI agents",
   "license": "ISC",
   "engines": {

package/src/commands/balance.ts CHANGED Viewed

@@ -9,8 +9,8 @@
  */
 import { parseArgs } from 'node:util'
+import { loadPrompts, resolvePath } from '../core.ts'
 import type { BalanceAnalysis, CategoryDistribution, PromptCase } from '../schemas.ts'
-import { loadPrompts } from './capture.ts'
 // ============================================================================
 // Types
@@ -28,16 +28,6 @@ export type BalanceConfig = {
   threshold?: number
 }
-// ============================================================================
-// Helpers
-// ============================================================================
-/** Resolve path relative to process.cwd() */
-const resolvePath = (path: string): string => {
-  if (path.startsWith('/')) return path
-  return `${process.cwd()}/${path}`
-}
 /**
  * Analyze category distribution across prompts.
  *

package/src/commands/calibrate.ts CHANGED Viewed

@@ -11,7 +11,7 @@
 import { parseArgs } from 'node:util'
 import { loadResults, resolvePath } from '../core.ts'
 import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from '../schemas/constants.ts'
-import { loadGrader } from '../schemas/grader-loader.ts'
+import { loadGraderOrExit } from '../schemas/grader-loader.ts'
 import type { CalibrationSample, Grader, GraderResult, TrajectoryStep } from '../schemas.ts'
 // ============================================================================
@@ -293,15 +293,7 @@ Examples:
   }
   // Load grader if specified
-  let grader: Grader | undefined
-  if (values.grader) {
-    try {
-      grader = await loadGrader(values.grader)
-    } catch (error) {
-      console.error(`Error: ${error instanceof Error ? error.message : error}`)
-      process.exit(1)
-    }
-  }
+  const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
   await runCalibrate({
     resultsPath,

package/src/commands/capture.ts CHANGED Viewed

@@ -13,22 +13,20 @@
 import { parseArgs } from 'node:util'
 import {
+  createWorkspaceDir,
   detectTrajectoryRichness,
   extractOutput,
   extractTrajectory,
   getInputPreview,
   hasToolErrors,
-  loadPrompts,
   logProgress,
-  resolvePath,
-  writeOutput,
+  readStdinPrompts,
 } from '../core.ts'
-import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
 import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
-import { createSessionManager, type ProcessExitInfo, type PromptResult } from '../headless/headless-session-manager.ts'
-import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
-import { loadGrader } from '../schemas/grader-loader.ts'
-import type { CaptureResult, Grader, TrajectoryRichness } from '../schemas.ts'
+import type { ProcessExitInfo, PromptResult } from '../headless/headless-session-manager.ts'
+import { loadGraderOrExit } from '../schemas/grader-loader.ts'
+import type { CaptureResult, PromptCase, TrajectoryRichness } from '../schemas.ts'
+import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
 // ============================================================================
 // Re-exports for backward compatibility
@@ -51,26 +49,7 @@ export {
 // ============================================================================
 /** Configuration for capture command */
-export type CaptureConfig = {
-  /** Path to prompts.jsonl file */
-  promptsPath: string
-  /** Path to agent schema JSON file */
-  schemaPath: string
-  /** Output file path (undefined for stdout) */
-  outputPath?: string
-  /** Working directory for agent */
-  cwd?: string
-  /** Timeout per prompt in milliseconds (overrides schema default) */
-  timeout?: number
-  /** Show progress to stderr */
-  progress?: boolean
-  /** Append to output file instead of overwriting */
-  append?: boolean
-  /** Optional grader function */
-  grader?: Grader
-  /** Enable debug mode for detailed output */
-  debug?: boolean
-}
+export type CaptureConfig = BaseExecutionConfig
 // ============================================================================
 // Capture Implementation
@@ -87,46 +66,29 @@ export type CaptureConfig = {
  * @returns Array of capture results
  */
 export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
+  const ctx = await prepareExecution(config)
   const {
-    promptsPath,
-    schemaPath,
-    outputPath,
-    cwd,
-    timeout,
-    progress = false,
-    append = false,
+    schema,
+    prompts,
+    sessions,
+    resolvedOutputPath,
+    resolvedWorkspaceDir,
+    defaultWorkingDir,
+    progress,
     grader,
-    debug = false,
-  } = config
+    debug,
+  } = ctx
-  // Load and validate schema
-  const schemaFile = Bun.file(schemaPath)
-  if (!(await schemaFile.exists())) {
-    throw new Error(`Schema file not found: ${schemaPath}`)
+  // Log progress info
+  logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
+  logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
+  logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
+  if (ctx.concurrency > 1) {
+    logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
   }
-  let schema: HeadlessAdapterConfig
-  try {
-    const rawSchema = await schemaFile.json()
-    schema = parseHeadlessConfig(rawSchema)
-  } catch (error) {
-    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
+  if (resolvedWorkspaceDir) {
+    logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
   }
-  // Load prompts
-  const prompts = await loadPrompts(promptsPath)
-  // Resolve output path
-  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
-  // Determine effective timeout (CLI flag > schema default > harness default)
-  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
-  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
-  // Log progress info
-  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
-  logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
-  logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
   if (resolvedOutputPath) {
     logProgress(`Output: ${resolvedOutputPath}`, progress)
   }
@@ -134,37 +96,24 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
     logProgress(`Debug mode: enabled`, progress)
   }
-  // Create session manager with schema
-  const sessions = createSessionManager({
-    schema,
-    timeout: effectiveTimeout,
-    verbose: progress,
-    debug,
-  })
+  // Process a single prompt (used by worker pool)
+  const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise<CaptureResult> => {
+    // Determine working directory (per-prompt workspace or default)
+    const workingDir = resolvedWorkspaceDir
+      ? await createWorkspaceDir(resolvedWorkspaceDir, promptCase.id)
+      : defaultWorkingDir
-  // Clear output file if not appending
-  if (resolvedOutputPath && !append) {
-    await Bun.write(resolvedOutputPath, '')
-  }
-  const workingDir = cwd ?? process.cwd()
-  const results: CaptureResult[] = []
-  let isFirstOutput = true
-  // Run evaluations sequentially - fresh session per entry
-  for (let i = 0; i < prompts.length; i++) {
-    const promptCase = prompts[i]
-    if (!promptCase) continue
-    logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
+    logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
     const startTime = Date.now()
     let result: CaptureResult
+    let sessionId: string | undefined
     try {
       // Create fresh session for each entry (ensures isolation)
       const sessionStart = Date.now()
       const session = await sessions.create(workingDir)
+      sessionId = session.id
       const sessionCreation = Date.now() - sessionStart
       logProgress(`  Session: ${session.id}`, progress)
@@ -177,9 +126,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
       let lastExitInfo: ProcessExitInfo | undefined
       let lastOutput = ''
-      // TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
-      // The session manager would need to accept timeout per-call to support this
       // Execute each turn sequentially in the same session
       for (const turnInput of inputs) {
         const turnResult: PromptResult = await sessions.prompt(session.id, turnInput)
@@ -198,7 +144,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
       result = {
         id: promptCase.id,
-        input: promptCase.input, // Preserve original (string or array)
+        input: promptCase.input,
         output,
         ...(promptCase.hint && { hint: promptCase.hint }),
         trajectory,
@@ -207,6 +153,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
           agent: schema.name,
           trajectoryRichness,
           turnCount,
+          ...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
           ...(lastExitInfo && {
             exitCode: lastExitInfo.exitCode,
             signal: lastExitInfo.signal,
@@ -236,14 +183,10 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
         result.score = graderResult
-        // Merge outcome from grader if present
         if (graderResult.outcome) {
           result.outcome = graderResult.outcome
         }
       }
-      // Clean up session
-      sessions.destroy(session.id)
     } catch (error) {
       const endTime = Date.now()
       const message = error instanceof Error ? error.message : String(error)
@@ -259,6 +202,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
           agent: schema.name,
           trajectoryRichness: 'minimal' as TrajectoryRichness,
           turnCount: inputs.length,
+          ...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
         },
         timing: {
           start: startTime,
@@ -269,14 +213,15 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
         toolErrors: true,
         errors: [message],
       }
+    } finally {
+      // Always clean up session if it was created
+      if (sessionId) {
+        sessions.destroy(sessionId)
+      }
     }
-    results.push(result)
-    // Write result immediately
-    const formatted = JSON.stringify(result)
-    await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
-    isFirstOutput = false
+    // Write result immediately (coordinated via mutex for concurrent writes)
+    await ctx.writeResult(result)
     const statusIcon = result.toolErrors ? '!' : '✓'
     const exitInfo = result.metadata?.timedOut
@@ -284,11 +229,13 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
       : result.metadata?.exitCode && result.metadata.exitCode !== 0
         ? ` - exit ${result.metadata.exitCode}`
         : ''
-    logProgress(`  ${statusIcon} (${result.timing.total}ms)${exitInfo}`, progress)
+    logProgress(`  ${statusIcon} ${promptCase.id} (${result.timing.total}ms)${exitInfo}`, progress)
+    return result
   }
-  logProgress('Done!', progress)
-  return results
+  // Run with worker pool
+  return executePrompts(ctx, processPrompt)
 }
 // ============================================================================
@@ -312,6 +259,9 @@ export const capture = async (args: string[]): Promise<void> => {
       append: { type: 'boolean', default: false },
       grader: { type: 'string', short: 'g' },
       debug: { type: 'boolean', default: false },
+      stdin: { type: 'boolean', default: false },
+      concurrency: { type: 'string', short: 'j' },
+      'workspace-dir': { type: 'string' },
       help: { type: 'boolean', short: 'h' },
     },
     allowPositionals: true,
@@ -320,6 +270,7 @@ export const capture = async (args: string[]): Promise<void> => {
   if (values.help) {
     console.log(`
 Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]
+       cat prompts.jsonl | agent-eval-harness capture --stdin --schema <schema.json> [options]
 Arguments:
   prompts.jsonl     Input file with evaluation prompts
@@ -329,6 +280,9 @@ Options:
   -o, --output      Output file (default: stdout)
   -c, --cwd         Working directory for agent
   -t, --timeout     Request timeout in ms (overrides schema default)
+  -j, --concurrency Number of concurrent workers (default: 1)
+  --stdin           Read prompts from stdin (mutually exclusive with file arg)
+  --workspace-dir   Base directory for per-prompt workspace isolation
   --progress        Show progress to stderr
   --append          Append to output file instead of overwriting
   -g, --grader      Path to grader (.ts/.js module or executable script)
@@ -348,25 +302,55 @@ Graders:
   TS/JS modules must export a 'grade' function.
   Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
+Parallelization:
+  Use -j/--concurrency to run multiple prompts in parallel.
+  Each prompt gets its own agent session for isolation.
+  Results are written as they complete (order may differ from input).
+  Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
+  at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
+  In memory-constrained environments (Docker, CI) this can cause OOM kills.
+  Use --stdin to pipe prompts for container-level orchestration.
+Workspace Isolation:
+  Use --workspace-dir to create per-prompt directories.
+  Each prompt runs in {workspace-dir}/prompt-{id}/.
+  Useful for code generation tasks requiring filesystem isolation.
 Examples:
   # Basic capture with schema
   agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl
+  # Run 4 prompts in parallel
+  agent-eval-harness capture prompts.jsonl -s claude.json -j 4 -o results.jsonl
+  # With workspace isolation for code generation
+  agent-eval-harness capture prompts.jsonl -s claude.json -j 4 \\
+    --workspace-dir ./workspaces -o results.jsonl
   # With TypeScript grader
   agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl
   # With debug mode
   agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
-  # With per-prompt timeout override (in prompts.jsonl):
-  {"id": "slow-task", "input": "...", "timeout": 180000}
+  # Read prompts from stdin (container orchestration)
+  cat prompts.jsonl | agent-eval-harness capture --stdin -s claude.json -o results.jsonl
 `)
     return
   }
   const promptsPath = positionals[0]
-  if (!promptsPath) {
-    console.error('Error: prompts.jsonl path is required')
+  const useStdin = values.stdin ?? false
+  // Mutual exclusivity: --stdin and positional file
+  if (useStdin && promptsPath) {
+    console.error('Error: --stdin and prompts file argument are mutually exclusive')
+    process.exit(1)
+  }
+  if (!useStdin && !promptsPath) {
+    console.error('Error: prompts.jsonl path is required (or use --stdin)')
     process.exit(1)
   }
@@ -376,19 +360,23 @@ Examples:
     process.exit(1)
   }
-  // Load grader if specified
-  let grader: Grader | undefined
-  if (values.grader) {
-    try {
-      grader = await loadGrader(values.grader)
-    } catch (error) {
-      console.error(`Error: ${error instanceof Error ? error.message : error}`)
+  // Read prompts from stdin if requested
+  let prompts: PromptCase[] | undefined
+  if (useStdin) {
+    const stdinPrompts = await readStdinPrompts()
+    if (!stdinPrompts || stdinPrompts.length === 0) {
+      console.error('Error: no prompts received on stdin')
       process.exit(1)
     }
+    prompts = stdinPrompts
   }
+  // Load grader if specified
+  const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
   await runCapture({
-    promptsPath,
+    promptsPath: promptsPath ?? undefined,
+    prompts,
     schemaPath: values.schema,
     outputPath: values.output,
     cwd: values.cwd,
@@ -397,5 +385,7 @@ Examples:
     append: values.append ?? false,
     grader,
     debug: values.debug ?? false,
+    concurrency: parseConcurrency(values.concurrency),
+    workspaceDir: values['workspace-dir'],
   })
 }