npm - @plaited/agent-eval-harness - Versions diffs - 0.10.0 → 0.12.0 - Mend

@plaited/agent-eval-harness 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/README.md +5 -5
package/package.json +2 -2
package/src/commands/balance.ts +1 -11
package/src/commands/calibrate.ts +2 -10
package/src/commands/capture.ts +56 -141
package/src/commands/execution.ts +245 -0
package/src/commands/tests/capture-cli.spec.ts +27 -0
package/src/commands/tests/trials-cli.spec.ts +28 -0
package/src/commands/trials.ts +49 -149
package/src/commands/validate-refs.ts +3 -19
package/src/core/core.ts +9 -1
package/src/core/loading.ts +38 -0
package/src/core.ts +1 -0
package/src/schemas/grader-loader.ts +23 -6
package/src/schemas/schemas-cli.ts +1 -6
package/src/schemas.ts +1 -1

package/README.md CHANGED Viewed

@@ -25,7 +25,7 @@ export ANTHROPIC_API_KEY=sk-...   # For Claude
 export GEMINI_API_KEY=...         # For Gemini
 ```
-Pre-built schemas are available in `.plaited/skills/headless-adapters/schemas/` for Claude and Gemini.
+Pre-built schemas are available in `.agents/skills/headless-adapters/schemas/` for Claude and Gemini.
 ### Core Commands
@@ -98,11 +98,11 @@ bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparis
 **Install skills** for use with AI coding agents:
 ```bash
-curl -fsSL https://raw.githubusercontent.com/plaited/skills-installer/main/install.sh | bash -s -- --agents <agent-name> --project agent-eval-harness
+npx skills add plaited/agent-eval-harness
+# or
+bunx skills add plaited/agent-eval-harness
 ```
-Replace `<agent-name>` with your agent: `claude`, `cursor`, `copilot`, `opencode`, `amp`, `goose`, `factory`
 ### Available Skills
 #### Agent Eval Harness
@@ -416,7 +416,7 @@ ANTHROPIC_API_KEY=sk-... GEMINI_API_KEY=... \
 ## Requirements
 - **Runtime:** Bun >= 1.2.9
-- **Schema:** JSON schema describing CLI agent interaction (see `.plaited/skills/headless-adapters/schemas/`)
+- **Schema:** JSON schema describing CLI agent interaction (see `.agents/skills/headless-adapters/schemas/`)
 - **API Key:** `ANTHROPIC_API_KEY` for Claude, `GEMINI_API_KEY` for Gemini
 ## License

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.10.0",
+  "version": "0.12.0",
   "description": "CLI tool for capturing agent trajectories from headless CLI agents",
   "license": "ISC",
   "engines": {
@@ -56,7 +56,7 @@
     ]
   },
   "dependencies": {
-    "@plaited/development-skills": "0.6.5",
+    "@plaited/development-skills": "0.7.0",
     "zod": "^4.3.6"
   },
   "devDependencies": {

package/src/commands/balance.ts CHANGED Viewed

@@ -9,8 +9,8 @@
  */
 import { parseArgs } from 'node:util'
+import { loadPrompts, resolvePath } from '../core.ts'
 import type { BalanceAnalysis, CategoryDistribution, PromptCase } from '../schemas.ts'
-import { loadPrompts } from './capture.ts'
 // ============================================================================
 // Types
@@ -28,16 +28,6 @@ export type BalanceConfig = {
   threshold?: number
 }
-// ============================================================================
-// Helpers
-// ============================================================================
-/** Resolve path relative to process.cwd() */
-const resolvePath = (path: string): string => {
-  if (path.startsWith('/')) return path
-  return `${process.cwd()}/${path}`
-}
 /**
  * Analyze category distribution across prompts.
  *

package/src/commands/calibrate.ts CHANGED Viewed

@@ -11,7 +11,7 @@
 import { parseArgs } from 'node:util'
 import { loadResults, resolvePath } from '../core.ts'
 import { DEFAULT_CALIBRATION_SAMPLE_SIZE } from '../schemas/constants.ts'
-import { loadGrader } from '../schemas/grader-loader.ts'
+import { loadGraderOrExit } from '../schemas/grader-loader.ts'
 import type { CalibrationSample, Grader, GraderResult, TrajectoryStep } from '../schemas.ts'
 // ============================================================================
@@ -293,15 +293,7 @@ Examples:
   }
   // Load grader if specified
-  let grader: Grader | undefined
-  if (values.grader) {
-    try {
-      grader = await loadGrader(values.grader)
-    } catch (error) {
-      console.error(`Error: ${error instanceof Error ? error.message : error}`)
-      process.exit(1)
-    }
-  }
+  const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
   await runCalibrate({
     resultsPath,

package/src/commands/capture.ts CHANGED Viewed

@@ -11,28 +11,22 @@
  * @packageDocumentation
  */
-import { mkdir } from 'node:fs/promises'
 import { parseArgs } from 'node:util'
 import {
   createWorkspaceDir,
-  createWriteMutex,
   detectTrajectoryRichness,
   extractOutput,
   extractTrajectory,
   getInputPreview,
   hasToolErrors,
-  loadPrompts,
   logProgress,
-  resolvePath,
-  runWorkerPool,
-  writeOutput,
+  readStdinPrompts,
 } from '../core.ts'
-import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
 import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
-import { createSessionManager, type ProcessExitInfo, type PromptResult } from '../headless/headless-session-manager.ts'
-import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
-import { loadGrader } from '../schemas/grader-loader.ts'
-import type { CaptureResult, Grader, TrajectoryRichness } from '../schemas.ts'
+import type { ProcessExitInfo, PromptResult } from '../headless/headless-session-manager.ts'
+import { loadGraderOrExit } from '../schemas/grader-loader.ts'
+import type { CaptureResult, PromptCase, TrajectoryRichness } from '../schemas.ts'
+import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
 // ============================================================================
 // Re-exports for backward compatibility
@@ -55,30 +49,7 @@ export {
 // ============================================================================
 /** Configuration for capture command */
-export type CaptureConfig = {
-  /** Path to prompts.jsonl file */
-  promptsPath: string
-  /** Path to agent schema JSON file */
-  schemaPath: string
-  /** Output file path (undefined for stdout) */
-  outputPath?: string
-  /** Working directory for agent */
-  cwd?: string
-  /** Timeout per prompt in milliseconds (overrides schema default) */
-  timeout?: number
-  /** Show progress to stderr */
-  progress?: boolean
-  /** Append to output file instead of overwriting */
-  append?: boolean
-  /** Optional grader function */
-  grader?: Grader
-  /** Enable debug mode for detailed output */
-  debug?: boolean
-  /** Number of concurrent workers (default: 1 for sequential) */
-  concurrency?: number
-  /** Base directory for per-prompt workspace isolation */
-  workspaceDir?: string
-}
+export type CaptureConfig = BaseExecutionConfig
 // ============================================================================
 // Capture Implementation
@@ -95,51 +66,25 @@ export type CaptureConfig = {
  * @returns Array of capture results
  */
 export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
+  const ctx = await prepareExecution(config)
   const {
-    promptsPath,
-    schemaPath,
-    outputPath,
-    cwd,
-    timeout,
-    progress = false,
-    append = false,
+    schema,
+    prompts,
+    sessions,
+    resolvedOutputPath,
+    resolvedWorkspaceDir,
+    defaultWorkingDir,
+    progress,
     grader,
-    debug = false,
-    concurrency = 1,
-    workspaceDir,
-  } = config
-  // Load and validate schema
-  const schemaFile = Bun.file(schemaPath)
-  if (!(await schemaFile.exists())) {
-    throw new Error(`Schema file not found: ${schemaPath}`)
-  }
-  let schema: HeadlessAdapterConfig
-  try {
-    const rawSchema = await schemaFile.json()
-    schema = parseHeadlessConfig(rawSchema)
-  } catch (error) {
-    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
-  }
-  // Load prompts
-  const prompts = await loadPrompts(promptsPath)
-  // Resolve paths
-  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
-  const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
-  // Determine effective timeout (CLI flag > schema default > harness default)
-  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
-  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
+    debug,
+  } = ctx
   // Log progress info
-  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
-  logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
-  logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
-  if (concurrency > 1) {
-    logProgress(`Concurrency: ${concurrency} workers`, progress)
+  logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
+  logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
+  logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
+  if (ctx.concurrency > 1) {
+    logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
   }
   if (resolvedWorkspaceDir) {
     logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
@@ -151,31 +96,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
     logProgress(`Debug mode: enabled`, progress)
   }
-  // Create session manager with schema
-  const sessions = createSessionManager({
-    schema,
-    timeout: effectiveTimeout,
-    verbose: progress,
-    debug,
-  })
-  // Clear output file if not appending
-  if (resolvedOutputPath && !append) {
-    await Bun.write(resolvedOutputPath, '')
-  }
-  // Create workspace base directory if specified
-  // Uses fs.mkdir instead of shell to prevent command injection
-  if (resolvedWorkspaceDir) {
-    await mkdir(resolvedWorkspaceDir, { recursive: true })
-  }
-  const defaultWorkingDir = cwd ?? process.cwd()
-  // Create write mutex for coordinating JSONL output
-  const writeMutex = createWriteMutex()
-  let isFirstOutput = true
   // Process a single prompt (used by worker pool)
   const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise<CaptureResult> => {
     // Determine working directory (per-prompt workspace or default)
@@ -301,11 +221,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
     }
     // Write result immediately (coordinated via mutex for concurrent writes)
-    await writeMutex.write(async () => {
-      const formatted = JSON.stringify(result)
-      await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
-      isFirstOutput = false
-    })
+    await ctx.writeResult(result)
     const statusIcon = result.toolErrors ? '!' : '✓'
     const exitInfo = result.metadata?.timedOut
@@ -319,20 +235,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
   }
   // Run with worker pool
-  const { results, errors } = await runWorkerPool(prompts, processPrompt, {
-    concurrency,
-    onProgress: (completed, total) => {
-      logProgress(`Progress: ${completed}/${total} prompts completed`, progress)
-    },
-  })
-  // Log any errors that occurred
-  if (errors.length > 0) {
-    logProgress(`Completed with ${errors.length} error(s)`, progress)
-  }
-  logProgress('Done!', progress)
-  return results
+  return executePrompts(ctx, processPrompt)
 }
 // ============================================================================
@@ -356,6 +259,7 @@ export const capture = async (args: string[]): Promise<void> => {
       append: { type: 'boolean', default: false },
       grader: { type: 'string', short: 'g' },
       debug: { type: 'boolean', default: false },
+      stdin: { type: 'boolean', default: false },
       concurrency: { type: 'string', short: 'j' },
       'workspace-dir': { type: 'string' },
       help: { type: 'boolean', short: 'h' },
@@ -366,6 +270,7 @@ export const capture = async (args: string[]): Promise<void> => {
   if (values.help) {
     console.log(`
 Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]
+       cat prompts.jsonl | agent-eval-harness capture --stdin --schema <schema.json> [options]
 Arguments:
   prompts.jsonl     Input file with evaluation prompts
@@ -376,6 +281,7 @@ Options:
   -c, --cwd         Working directory for agent
   -t, --timeout     Request timeout in ms (overrides schema default)
   -j, --concurrency Number of concurrent workers (default: 1)
+  --stdin           Read prompts from stdin (mutually exclusive with file arg)
   --workspace-dir   Base directory for per-prompt workspace isolation
   --progress        Show progress to stderr
   --append          Append to output file instead of overwriting
@@ -401,6 +307,11 @@ Parallelization:
   Each prompt gets its own agent session for isolation.
   Results are written as they complete (order may differ from input).
+  Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
+  at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
+  In memory-constrained environments (Docker, CI) this can cause OOM kills.
+  Use --stdin to pipe prompts for container-level orchestration.
 Workspace Isolation:
   Use --workspace-dir to create per-prompt directories.
   Each prompt runs in {workspace-dir}/prompt-{id}/.
@@ -422,13 +333,24 @@ Examples:
   # With debug mode
   agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
+  # Read prompts from stdin (container orchestration)
+  cat prompts.jsonl | agent-eval-harness capture --stdin -s claude.json -o results.jsonl
 `)
     return
   }
   const promptsPath = positionals[0]
-  if (!promptsPath) {
-    console.error('Error: prompts.jsonl path is required')
+  const useStdin = values.stdin ?? false
+  // Mutual exclusivity: --stdin and positional file
+  if (useStdin && promptsPath) {
+    console.error('Error: --stdin and prompts file argument are mutually exclusive')
+    process.exit(1)
+  }
+  if (!useStdin && !promptsPath) {
+    console.error('Error: prompts.jsonl path is required (or use --stdin)')
     process.exit(1)
   }
@@ -438,30 +360,23 @@ Examples:
     process.exit(1)
   }
-  // Load grader if specified
-  let grader: Grader | undefined
-  if (values.grader) {
-    try {
-      grader = await loadGrader(values.grader)
-    } catch (error) {
-      console.error(`Error: ${error instanceof Error ? error.message : error}`)
+  // Read prompts from stdin if requested
+  let prompts: PromptCase[] | undefined
+  if (useStdin) {
+    const stdinPrompts = await readStdinPrompts()
+    if (!stdinPrompts || stdinPrompts.length === 0) {
+      console.error('Error: no prompts received on stdin')
       process.exit(1)
     }
+    prompts = stdinPrompts
   }
-  // Validate and parse concurrency
-  let concurrency = 1
-  if (values.concurrency) {
-    const parsed = Number.parseInt(values.concurrency, 10)
-    if (Number.isNaN(parsed) || parsed < 1) {
-      console.error('Error: --concurrency must be a positive integer')
-      process.exit(1)
-    }
-    concurrency = parsed
-  }
+  // Load grader if specified
+  const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
   await runCapture({
-    promptsPath,
+    promptsPath: promptsPath ?? undefined,
+    prompts,
     schemaPath: values.schema,
     outputPath: values.output,
     cwd: values.cwd,
@@ -470,7 +385,7 @@ Examples:
     append: values.append ?? false,
     grader,
     debug: values.debug ?? false,
-    concurrency,
+    concurrency: parseConcurrency(values.concurrency),
     workspaceDir: values['workspace-dir'],
   })
 }

package/src/commands/execution.ts ADDED Viewed

@@ -0,0 +1,245 @@
+/**
+ * Shared execution utilities for capture and trials commands.
+ *
+ * @remarks
+ * Extracts common setup logic: schema loading, prompt loading, path resolution,
+ * session manager creation, output initialization, and worker pool execution.
+ *
+ * @packageDocumentation
+ */
+import { mkdir } from 'node:fs/promises'
+import { createWriteMutex, loadPrompts, logProgress, resolvePath, runWorkerPool, writeOutput } from '../core.ts'
+import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
+import { createSessionManager, type SessionManager } from '../headless/headless-session-manager.ts'
+import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
+import type { Grader, PromptCase } from '../schemas.ts'
+// ============================================================================
+// Types
+// ============================================================================
+/**
+ * Base configuration shared by capture and trials commands.
+ *
+ * @remarks
+ * Consumed by `prepareExecution`, which applies the defaults noted on each
+ * field. At least one of `promptsPath` or `prompts` must be set, otherwise
+ * `prepareExecution` throws.
+ */
+export type BaseExecutionConfig = {
+  /** Path to prompts.jsonl file; only read when `prompts` is not provided */
+  promptsPath?: string
+  /** Path to agent schema JSON file */
+  schemaPath: string
+  /** Pre-loaded prompt cases (from stdin); skips file loading when set */
+  prompts?: PromptCase[]
+  /** Output file path (undefined for stdout) */
+  outputPath?: string
+  /** Working directory for agent (defaults to process.cwd()) */
+  cwd?: string
+  /** Timeout per prompt in milliseconds (overrides schema default) */
+  timeout?: number
+  /** Show progress to stderr (default: false) */
+  progress?: boolean
+  /** Append to output file instead of overwriting (default: false) */
+  append?: boolean
+  /** Optional grader function */
+  grader?: Grader
+  /** Enable debug mode (default: false) */
+  debug?: boolean
+  /** Number of concurrent workers (default: 1 for sequential) */
+  concurrency?: number
+  /** Base directory for per-prompt workspace isolation; created if missing */
+  workspaceDir?: string
+}
+/**
+ * Prepared execution context returned by prepareExecution.
+ *
+ * @remarks
+ * Carries the values derived from a `BaseExecutionConfig` after defaults have
+ * been applied, together with the shared session manager and the serialized
+ * result writer used by the per-prompt processing functions.
+ */
+export type ExecutionContext = {
+  /** Parsed and validated headless adapter schema */
+  schema: HeadlessAdapterConfig
+  /** Prompt cases, either passed in pre-loaded or loaded from the prompts file */
+  prompts: PromptCase[]
+  /** Session manager for creating/destroying agent sessions */
+  sessions: SessionManager
+  /** Resolved absolute output path (undefined for stdout) */
+  resolvedOutputPath?: string
+  /** Resolved absolute workspace directory path (undefined when not configured) */
+  resolvedWorkspaceDir?: string
+  /** Effective timeout in milliseconds (config value, else schema value, else harness default) */
+  effectiveTimeout: number
+  /** Default working directory for agent sessions (config `cwd`, else process.cwd()) */
+  defaultWorkingDir: string
+  /** Number of concurrent workers */
+  concurrency: number
+  /** Whether to show progress output */
+  progress: boolean
+  /** Optional grader function */
+  grader?: Grader
+  /** Whether debug mode is enabled */
+  debug: boolean
+  /** Serialize a result with JSON.stringify and write it as one output line, coordinated via mutex */
+  writeResult: (result: unknown) => Promise<void>
+}
+// ============================================================================
+// Execution Setup
+// ============================================================================
+/**
+ * Prepare execution context from base configuration.
+ *
+ * @remarks
+ * Handles all shared setup: schema loading/validation, prompt loading,
+ * path resolution, session manager creation, output file initialization,
+ * workspace directory creation, and write mutex coordination.
+ *
+ * The output file is truncated up front only when `append` is false. When
+ * `append` is true every result, including the first one, is written in
+ * append mode so that existing file content is preserved.
+ *
+ * @param config - Base execution configuration
+ * @returns Prepared execution context
+ * @throws Error if neither `prompts` nor `promptsPath` is provided, if the
+ * schema file does not exist, or if the schema cannot be read or parsed
+ *
+ * @public
+ */
+export const prepareExecution = async (config: BaseExecutionConfig): Promise<ExecutionContext> => {
+  const {
+    promptsPath,
+    schemaPath,
+    outputPath,
+    cwd,
+    timeout,
+    progress = false,
+    append = false,
+    grader,
+    debug = false,
+    concurrency = 1,
+    workspaceDir,
+  } = config
+  // Validate prompt source before touching the filesystem
+  if (!config.prompts && !promptsPath) {
+    throw new Error('Either promptsPath or prompts must be provided')
+  }
+  // Load and validate schema
+  const schemaFile = Bun.file(schemaPath)
+  if (!(await schemaFile.exists())) {
+    throw new Error(`Schema file not found: ${schemaPath}`)
+  }
+  let schema: HeadlessAdapterConfig
+  try {
+    const rawSchema = await schemaFile.json()
+    schema = parseHeadlessConfig(rawSchema)
+  } catch (error) {
+    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
+  }
+  // Load prompts (pre-loaded cases win; the guard above guarantees promptsPath is set otherwise)
+  const prompts = config.prompts ?? (await loadPrompts(promptsPath!))
+  // Resolve paths
+  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
+  const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
+  // Determine effective timeout (CLI flag > schema default > harness default)
+  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
+  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
+  // Create session manager
+  const sessions = createSessionManager({
+    schema,
+    timeout: effectiveTimeout,
+    verbose: progress,
+    debug,
+  })
+  // Initialize output file (clear if not appending)
+  if (resolvedOutputPath && !append) {
+    await Bun.write(resolvedOutputPath, '')
+  }
+  // Create workspace base directory if specified
+  if (resolvedWorkspaceDir) {
+    await mkdir(resolvedWorkspaceDir, { recursive: true })
+  }
+  const defaultWorkingDir = cwd ?? process.cwd()
+  // Create write mutex with closure for coordinated result writing
+  const writeMutex = createWriteMutex()
+  // Seeded from `append` so the first write in append mode does not replace
+  // existing content; flips to true after the first write in overwrite mode.
+  // NOTE(review): assumes writeOutput's third argument is its append flag - confirm.
+  let appendNextWrite = append
+  const writeResult = async (result: unknown): Promise<void> => {
+    await writeMutex.write(async () => {
+      const formatted = JSON.stringify(result)
+      await writeOutput(formatted, resolvedOutputPath, appendNextWrite)
+      appendNextWrite = true
+    })
+  }
+  return {
+    schema,
+    prompts,
+    sessions,
+    resolvedOutputPath,
+    resolvedWorkspaceDir,
+    effectiveTimeout,
+    defaultWorkingDir,
+    concurrency,
+    progress,
+    grader,
+    debug,
+    writeResult,
+  }
+}
+// ============================================================================
+// Worker Pool Execution
+// ============================================================================
+/**
+ * Run every prompt in the context through the worker pool.
+ *
+ * @remarks
+ * Shared by capture and trials. Reports per-prompt completion through
+ * `logProgress`, logs the number of errors the pool collected (if any),
+ * logs a final completion message, and hands back the pool's results.
+ *
+ * @param ctx - Execution context from prepareExecution
+ * @param processFn - Function to process each prompt
+ * @returns Array of results
+ *
+ * @public
+ */
+export const executePrompts = async <T>(
+  ctx: ExecutionContext,
+  processFn: (promptCase: PromptCase, index: number) => Promise<T>,
+): Promise<T[]> => {
+  const outcome = await runWorkerPool(ctx.prompts, processFn, {
+    concurrency: ctx.concurrency,
+    onProgress: (completed, total) => {
+      const message = `Progress: ${completed}/${total} prompts completed`
+      logProgress(message, ctx.progress)
+    },
+  })
+  const failureCount = outcome.errors.length
+  if (failureCount > 0) {
+    logProgress(`Completed with ${failureCount} error(s)`, ctx.progress)
+  }
+  logProgress('Done!', ctx.progress)
+  return outcome.results
+}
+// ============================================================================
+// CLI Helpers
+// ============================================================================
+/**
+ * Parse and validate concurrency CLI argument.
+ *
+ * @remarks
+ * A missing or empty value yields the sequential default of 1. Any other
+ * value must consist solely of decimal digits (surrounding whitespace is
+ * tolerated) and be at least 1; otherwise an error is printed to stderr and
+ * the process exits with code 1.
+ *
+ * @param value - Raw string value from parseArgs
+ * @returns Validated positive integer (default: 1)
+ *
+ * @public
+ */
+export const parseConcurrency = (value: string | undefined): number => {
+  if (!value) return 1
+  // Number.parseInt alone accepts trailing garbage ("4abc" -> 4) and truncates
+  // decimals ("2.5" -> 2), so require the whole string to be decimal digits.
+  const trimmed = value.trim()
+  const parsed = /^\d+$/.test(trimmed) ? Number.parseInt(trimmed, 10) : Number.NaN
+  if (!Number.isSafeInteger(parsed) || parsed < 1) {
+    console.error('Error: --concurrency must be a positive integer')
+    process.exit(1)
+  }
+  return parsed
+}

package/src/commands/tests/capture-cli.spec.ts CHANGED Viewed

@@ -142,6 +142,16 @@ describe('runCapture configuration', () => {
     expect(config.concurrency).toBeUndefined()
     expect(config.workspaceDir).toBeUndefined()
   })
+  test('CaptureConfig accepts prompts without promptsPath', () => {
+    const config: CaptureConfig = {
+      schemaPath: './test-schema.json',
+      prompts: [{ id: 't1', input: 'hello' }],
+    }
+    expect(config.promptsPath).toBeUndefined()
+    expect(config.prompts).toHaveLength(1)
+  })
 })
 // ============================================================================
@@ -168,6 +178,23 @@ describe('capture CLI', () => {
     expect(stdout).toContain('-s, --schema')
     expect(stdout).toContain('-j, --concurrency')
     expect(stdout).toContain('--workspace-dir')
+    expect(stdout).toContain('--stdin')
+  })
+  test('shows error for --stdin with positional file', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
   })
   test('shows error for missing prompts file argument', async () => {

package/src/commands/tests/trials-cli.spec.ts CHANGED Viewed

@@ -44,6 +44,17 @@ describe('TrialsConfig configuration', () => {
     expect(config.concurrency).toBeUndefined()
     expect(config.workspaceDir).toBeUndefined()
   })
+  test('TrialsConfig accepts prompts without promptsPath', () => {
+    const config: TrialsConfig = {
+      schemaPath: './test-schema.json',
+      k: 3,
+      prompts: [{ id: 't1', input: 'hello' }],
+    }
+    expect(config.promptsPath).toBeUndefined()
+    expect(config.prompts).toHaveLength(1)
+  })
 })
 // ============================================================================
@@ -72,6 +83,23 @@ describe('trials CLI', () => {
     expect(stdout).toContain('pass@k')
     expect(stdout).toContain('-j, --concurrency')
     expect(stdout).toContain('--workspace-dir')
+    expect(stdout).toContain('--stdin')
+  })
+  test('shows error for --stdin with positional file', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
   })
   test('shows error for missing prompts file argument', async () => {

package/src/commands/trials.ts CHANGED Viewed

@@ -11,25 +11,13 @@
  * @packageDocumentation
  */
-import { mkdir } from 'node:fs/promises'
 import { parseArgs } from 'node:util'
-import {
-  createWorkspaceDir,
-  createWriteMutex,
-  extractOutput,
-  extractTrajectory,
-  loadPrompts,
-  logProgress,
-  resolvePath,
-  runWorkerPool,
-  writeOutput,
-} from '../core.ts'
-import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
+import { createWorkspaceDir, extractOutput, extractTrajectory, logProgress, readStdinPrompts } from '../core.ts'
 import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
-import { createSessionManager } from '../headless/headless-session-manager.ts'
-import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
-import { loadGrader } from '../schemas/grader-loader.ts'
-import type { Grader, TrialEntry, TrialResult } from '../schemas.ts'
+import { DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
+import { loadGraderOrExit } from '../schemas/grader-loader.ts'
+import type { PromptCase, TrialEntry, TrialResult } from '../schemas.ts'
+import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
 // ============================================================================
 // Pass@k/Pass^k Calculation
@@ -85,31 +73,9 @@ export const calculatePassExpK = (passes: number, k: number): number => {
 // ============================================================================
 /** Configuration for trials command */
-export type TrialsConfig = {
-  /** Path to prompts.jsonl file */
-  promptsPath: string
-  /** Path to agent schema JSON file */
-  schemaPath: string
+export type TrialsConfig = BaseExecutionConfig & {
   /** Number of trials per prompt */
   k: number
-  /** Output file path */
-  outputPath?: string
-  /** Working directory for agent */
-  cwd?: string
-  /** Timeout per prompt in milliseconds (overrides schema default) */
-  timeout?: number
-  /** Show progress to stderr */
-  progress?: boolean
-  /** Append to output file */
-  append?: boolean
-  /** Optional grader function */
-  grader?: Grader
-  /** Enable debug mode */
-  debug?: boolean
-  /** Number of concurrent workers (default: 1 for sequential) */
-  concurrency?: number
-  /** Base directory for per-prompt workspace isolation */
-  workspaceDir?: string
 }
 // ============================================================================
@@ -123,53 +89,17 @@ export type TrialsConfig = {
  * @returns Array of trial results
  */
 export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
-  const {
-    promptsPath,
-    schemaPath,
-    k,
-    outputPath,
-    cwd,
-    timeout,
-    progress = false,
-    append = false,
-    grader,
-    debug = false,
-    concurrency = 1,
-    workspaceDir,
-  } = config
-  // Load and validate schema
-  const schemaFile = Bun.file(schemaPath)
-  if (!(await schemaFile.exists())) {
-    throw new Error(`Schema file not found: ${schemaPath}`)
-  }
-  let schema: HeadlessAdapterConfig
-  try {
-    const rawSchema = await schemaFile.json()
-    schema = parseHeadlessConfig(rawSchema)
-  } catch (error) {
-    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
-  }
-  // Load prompts
-  const prompts = await loadPrompts(promptsPath)
-  // Resolve paths
-  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
-  const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
-  // Determine effective timeout (CLI flag > schema default > harness default)
-  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
-  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
+  const { k } = config
+  const ctx = await prepareExecution(config)
+  const { schema, prompts, sessions, resolvedWorkspaceDir, defaultWorkingDir, progress, grader } = ctx
   // Log progress info
-  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
+  logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
   logProgress(`Running ${k} trials per prompt (${prompts.length * k} total executions)`, progress)
-  logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
-  logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
-  if (concurrency > 1) {
-    logProgress(`Concurrency: ${concurrency} workers`, progress)
+  logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
+  logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
+  if (ctx.concurrency > 1) {
+    logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
   }
   if (resolvedWorkspaceDir) {
     logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
@@ -178,31 +108,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
     logProgress('Grader: enabled (will compute pass@k metrics)', progress)
   }
-  // Create session manager with schema
-  const sessions = createSessionManager({
-    schema,
-    timeout: effectiveTimeout,
-    verbose: progress,
-    debug,
-  })
-  // Clear output file if not appending
-  if (resolvedOutputPath && !append) {
-    await Bun.write(resolvedOutputPath, '')
-  }
-  // Create workspace base directory if specified
-  // Uses fs.mkdir instead of shell to prevent command injection
-  if (resolvedWorkspaceDir) {
-    await mkdir(resolvedWorkspaceDir, { recursive: true })
-  }
-  const defaultWorkingDir = cwd ?? process.cwd()
-  // Create write mutex for coordinating JSONL output
-  const writeMutex = createWriteMutex()
-  let isFirstOutput = true
   // Process all trials for a single prompt
   const processPromptTrials = async (promptCase: (typeof prompts)[number], index: number): Promise<TrialResult> => {
     logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
@@ -308,11 +213,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
     }
     // Write result immediately (coordinated via mutex for concurrent writes)
-    await writeMutex.write(async () => {
-      const formatted = JSON.stringify(result)
-      await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
-      isFirstOutput = false
-    })
+    await ctx.writeResult(result)
     if (grader) {
       logProgress(
@@ -325,20 +226,7 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
   }
   // Run with worker pool (parallelizes across prompts, trials for each prompt run sequentially)
-  const { results, errors } = await runWorkerPool(prompts, processPromptTrials, {
-    concurrency,
-    onProgress: (completed, total) => {
-      logProgress(`Progress: ${completed}/${total} prompts completed`, progress)
-    },
-  })
-  // Log any errors that occurred
-  if (errors.length > 0) {
-    logProgress(`Completed with ${errors.length} error(s)`, progress)
-  }
-  logProgress('Done!', progress)
-  return results
+  return executePrompts(ctx, processPromptTrials)
 }
 // ============================================================================
@@ -363,6 +251,7 @@ export const trials = async (args: string[]): Promise<void> => {
       append: { type: 'boolean', default: false },
       grader: { type: 'string', short: 'g' },
       debug: { type: 'boolean', default: false },
+      stdin: { type: 'boolean', default: false },
       concurrency: { type: 'string', short: 'j' },
       'workspace-dir': { type: 'string' },
       help: { type: 'boolean', short: 'h' },
@@ -373,6 +262,7 @@ export const trials = async (args: string[]): Promise<void> => {
   if (values.help) {
     console.log(`
 Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]
+       cat prompts.jsonl | agent-eval-harness trials --stdin --schema <schema.json> [options]
 Arguments:
   prompts.jsonl     Input file with evaluation prompts
@@ -384,6 +274,7 @@ Options:
   -c, --cwd         Working directory for agent
   -t, --timeout     Request timeout in ms (overrides schema default)
   -j, --concurrency Number of concurrent workers (default: 1)
+  --stdin           Read prompts from stdin (mutually exclusive with file arg)
   --workspace-dir   Base directory for per-trial workspace isolation
   --progress        Show progress to stderr
   --append          Append to output file
@@ -404,6 +295,11 @@ Parallelization:
   Each prompt's k trials still run sequentially (required for aggregation).
   With 151 prompts and -j 4, you get 4 prompts running trials concurrently.
+  Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
+  at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
+  In memory-constrained environments (Docker, CI) this can cause OOM kills.
+  Use --stdin to pipe prompts for container-level orchestration.
 Workspace Isolation:
   Use --workspace-dir to create per-trial directories.
   Each trial runs in {workspace-dir}/prompt-{id}-trial-{n}/.
@@ -422,13 +318,24 @@ Examples:
   # With TypeScript grader
   agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl
+  # Read prompts from stdin (container orchestration)
+  cat prompts.jsonl | agent-eval-harness trials --stdin -s claude.json -k 5 -o trials.jsonl
 `)
     return
   }
   const promptsPath = positionals[0]
-  if (!promptsPath) {
-    console.error('Error: prompts.jsonl path is required')
+  const useStdin = values.stdin ?? false
+  // Mutual exclusivity: --stdin and positional file
+  if (useStdin && promptsPath) {
+    console.error('Error: --stdin and prompts file argument are mutually exclusive')
+    process.exit(1)
+  }
+  if (!useStdin && !promptsPath) {
+    console.error('Error: prompts.jsonl path is required (or use --stdin)')
     process.exit(1)
   }
@@ -438,30 +345,23 @@ Examples:
     process.exit(1)
   }
-  // Load grader if specified
-  let grader: Grader | undefined
-  if (values.grader) {
-    try {
-      grader = await loadGrader(values.grader)
-    } catch (error) {
-      console.error(`Error: ${error instanceof Error ? error.message : error}`)
+  // Read prompts from stdin if requested
+  let prompts: PromptCase[] | undefined
+  if (useStdin) {
+    const stdinPrompts = await readStdinPrompts()
+    if (!stdinPrompts || stdinPrompts.length === 0) {
+      console.error('Error: no prompts received on stdin')
       process.exit(1)
     }
+    prompts = stdinPrompts
   }
-  // Validate and parse concurrency
-  let concurrency = 1
-  if (values.concurrency) {
-    const parsed = Number.parseInt(values.concurrency, 10)
-    if (Number.isNaN(parsed) || parsed < 1) {
-      console.error('Error: --concurrency must be a positive integer')
-      process.exit(1)
-    }
-    concurrency = parsed
-  }
+  // Load grader if specified
+  const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
   await runTrials({
-    promptsPath,
+    promptsPath: promptsPath ?? undefined,
+    prompts,
     schemaPath: values.schema,
     k: Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10),
     outputPath: values.output,
@@ -471,7 +371,7 @@ Examples:
     append: values.append ?? false,
     grader,
     debug: values.debug ?? false,
-    concurrency,
+    concurrency: parseConcurrency(values.concurrency),
     workspaceDir: values['workspace-dir'],
   })
 }

package/src/commands/validate-refs.ts CHANGED Viewed

@@ -9,9 +9,9 @@
  */
 import { parseArgs } from 'node:util'
-import { loadGrader } from '../schemas/grader-loader.ts'
+import { loadPrompts, resolvePath } from '../core.ts'
+import { loadGraderOrExit } from '../schemas/grader-loader.ts'
 import type { Grader, ValidationResult } from '../schemas.ts'
-import { loadPrompts } from './capture.ts'
 // ============================================================================
 // Types
@@ -27,16 +27,6 @@ export type ValidateRefsConfig = {
   grader: Grader
 }
-// ============================================================================
-// Helpers
-// ============================================================================
-/** Resolve path relative to process.cwd() */
-const resolvePath = (path: string): string => {
-  if (path.startsWith('/')) return path
-  return `${process.cwd()}/${path}`
-}
 // ============================================================================
 // Validate-Refs Implementation
 // ============================================================================
@@ -171,13 +161,7 @@ Examples:
   }
   // Load grader
-  let grader: Grader
-  try {
-    grader = await loadGrader(values.grader)
-  } catch (error) {
-    console.error(`Error: ${error instanceof Error ? error.message : error}`)
-    process.exit(1)
-  }
+  const grader = await loadGraderOrExit(values.grader)
   await runValidateRefs({
     promptsPath,

package/src/core/core.ts CHANGED Viewed

@@ -11,7 +11,15 @@
  */
 // Loading utilities
-export { buildResultsIndex, countLines, loadJsonl, loadPrompts, loadResults, streamResults } from './loading.ts'
+export {
+  buildResultsIndex,
+  countLines,
+  loadJsonl,
+  loadPrompts,
+  loadResults,
+  readStdinPrompts,
+  streamResults,
+} from './loading.ts'
 // Output utilities
 export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
 // Native streaming utilities

package/src/core/loading.ts CHANGED Viewed

@@ -39,6 +39,44 @@ export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
     })
 }
+/**
+ * Read prompts from stdin as JSONL.
+ *
+ * @remarks
+ * Drains stdin, then parses every non-blank line as JSON and validates it
+ * against PromptCaseSchema. Blank and whitespace-only lines (including the
+ * lone `\r` left by CRLF blank lines) are skipped, but they still count
+ * toward the line number reported in error messages, so the number matches
+ * the position of the offending line in the piped input.
+ * Uses chunked Buffer reads matching the pattern in pipeline/run.ts.
+ *
+ * @returns Parsed and validated prompt cases, or null if stdin is a TTY
+ * (no piped input) or the input contains no non-blank lines
+ * @throws Error if any non-blank line is invalid JSON or fails schema validation
+ *
+ * @public
+ */
+export const readStdinPrompts = async (): Promise<PromptCase[] | null> => {
+  if (process.stdin.isTTY) {
+    return null
+  }
+  const chunks: Buffer[] = []
+  for await (const chunk of process.stdin) {
+    chunks.push(chunk)
+  }
+  // Decode once after concatenation so multi-byte characters split across chunks stay intact
+  const lines = Buffer.concat(chunks).toString('utf-8').split('\n')
+  const prompts: PromptCase[] = []
+  for (const [index, rawLine] of lines.entries()) {
+    const line = rawLine.trim()
+    // Skip blank lines without renumbering the lines that follow them
+    if (!line) continue
+    try {
+      prompts.push(PromptCaseSchema.parse(JSON.parse(line)))
+    } catch (error) {
+      throw new Error(`Invalid stdin prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
+    }
+  }
+  // Preserve the original contract: empty or whitespace-only input yields null
+  return prompts.length > 0 ? prompts : null
+}
 /**
  * Load capture results from a JSONL file.
  *

package/src/core.ts CHANGED Viewed

@@ -31,6 +31,7 @@ export {
   loadResults,
   logProgress,
   type ProgressCallback,
+  readStdinPrompts,
   resolvePath,
   runWorkerPool,
   streamJsonl,

package/src/schemas/grader-loader.ts CHANGED Viewed

@@ -13,6 +13,7 @@
  * @packageDocumentation
  */
+import { resolvePath } from '../core.ts'
 import type { Grader, TrajectoryStep } from './schemas.ts'
 import { GraderResultSchema } from './schemas.ts'
@@ -30,12 +31,6 @@ const JS_EXTENSIONS = ['.ts', '.js', '.mjs', '.cjs']
 /** Check if a file path is a JavaScript/TypeScript module */
 const isJsModule = (path: string): boolean => JS_EXTENSIONS.some((ext) => path.endsWith(ext))
-/** Resolve path relative to process.cwd() */
-const resolvePath = (path: string): string => {
-  if (path.startsWith('/')) return path
-  return `${process.cwd()}/${path}`
-}
 // ============================================================================
 // Executable Grader
 // ============================================================================
@@ -169,6 +164,28 @@ const loadModuleGrader = async (modulePath: string): Promise<Grader> => {
  * const grader = await loadGrader('./my-grader')
  * ```
  */
+/**
+ * CLI convenience wrapper: load a grader or terminate the process.
+ *
+ * @remarks
+ * Delegates to `loadGrader`. If loading rejects, the failure is written to
+ * stderr as `Error: <message>` and the process exits with status 1, so CLI
+ * handlers do not each need their own try/catch around grader loading.
+ *
+ * @param graderPath - Path to the grader (relative or absolute)
+ * @returns The loaded grader; on failure the process exits instead of returning
+ *
+ * @public
+ */
+export const loadGraderOrExit = async (graderPath: string): Promise<Grader> =>
+  loadGrader(graderPath).catch((cause: unknown) => {
+    const detail = cause instanceof Error ? cause.message : cause
+    console.error(`Error: ${detail}`)
+    return process.exit(1)
+  })
 export const loadGrader = async (graderPath: string): Promise<Grader> => {
   const resolvedPath = resolvePath(graderPath)

package/src/schemas/schemas-cli.ts CHANGED Viewed

@@ -10,6 +10,7 @@
 import { parseArgs } from 'node:util'
 import { z } from 'zod'
+import { resolvePath } from '../core.ts'
 import * as schemas from './schemas.ts'
 // ============================================================================
@@ -57,12 +58,6 @@ export type SchemasConfig = {
 // Helpers
 // ============================================================================
-/** Resolve path relative to process.cwd() */
-const resolvePath = (path: string): string => {
-  if (path.startsWith('/')) return path
-  return `${process.cwd()}/${path}`
-}
 /** Generate JSON Schema from Zod schema */
 const toJsonSchema = (schema: z.ZodSchema, name: string): object => {
   try {

package/src/schemas.ts CHANGED Viewed

@@ -18,7 +18,7 @@ export {
   TAIL_LINES,
 } from './schemas/constants.ts'
 // Grader loader
-export { loadGrader } from './schemas/grader-loader.ts'
+export { loadGrader, loadGraderOrExit } from './schemas/grader-loader.ts'
 // Core session types
 // JSON-RPC types (MCP compatibility)
 // MCP server configuration