npm - @plaited/agent-eval-harness - Versions diffs - 0.9.0 → 0.11.0 - Mend

@plaited/agent-eval-harness 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

package/README.md +10 -0
package/package.json +1 -1
package/src/commands/balance.ts +1 -11
package/src/commands/calibrate.ts +2 -10
package/src/commands/capture.ts +104 -114
package/src/commands/execution.ts +245 -0
package/src/commands/tests/capture-cli.spec.ts +84 -0
package/src/commands/tests/trials-cli.spec.ts +68 -0
package/src/commands/trials.ts +98 -115
package/src/commands/validate-refs.ts +3 -19
package/src/core/core.ts +27 -1
package/src/core/loading.ts +53 -19
package/src/core/streaming.ts +172 -0
package/src/core/tests/streaming.spec.ts +399 -0
package/src/core/tests/worker-pool.spec.ts +377 -0
package/src/core/worker-pool.ts +220 -0
package/src/core.ts +15 -0
package/src/schemas/grader-loader.ts +23 -6
package/src/schemas/schemas-cli.ts +1 -6
package/src/schemas/schemas.ts +2 -0
package/src/schemas.ts +1 -1

package/src/commands/trials.ts CHANGED Viewed

@@ -12,13 +12,12 @@
  */
 import { parseArgs } from 'node:util'
-import { extractOutput, extractTrajectory, loadPrompts, logProgress, resolvePath, writeOutput } from '../core.ts'
-import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
+import { createWorkspaceDir, extractOutput, extractTrajectory, logProgress, readStdinPrompts } from '../core.ts'
 import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
-import { createSessionManager } from '../headless/headless-session-manager.ts'
-import { DEFAULT_HARNESS_TIMEOUT, DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
-import { loadGrader } from '../schemas/grader-loader.ts'
-import type { Grader, TrialEntry, TrialResult } from '../schemas.ts'
+import { DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
+import { loadGraderOrExit } from '../schemas/grader-loader.ts'
+import type { PromptCase, TrialEntry, TrialResult } from '../schemas.ts'
+import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
 // ============================================================================
 // Pass@k/Pass^k Calculation
@@ -74,27 +73,9 @@ export const calculatePassExpK = (passes: number, k: number): number => {
 // ============================================================================
 /** Configuration for trials command */
-export type TrialsConfig = {
-  /** Path to prompts.jsonl file */
-  promptsPath: string
-  /** Path to agent schema JSON file */
-  schemaPath: string
+export type TrialsConfig = BaseExecutionConfig & {
   /** Number of trials per prompt */
   k: number
-  /** Output file path */
-  outputPath?: string
-  /** Working directory for agent */
-  cwd?: string
-  /** Timeout per prompt in milliseconds (overrides schema default) */
-  timeout?: number
-  /** Show progress to stderr */
-  progress?: boolean
-  /** Append to output file */
-  append?: boolean
-  /** Optional grader function */
-  grader?: Grader
-  /** Enable debug mode */
-  debug?: boolean
 }
 // ============================================================================
@@ -108,79 +89,38 @@ export type TrialsConfig = {
  * @returns Array of trial results
  */
 export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
-  const {
-    promptsPath,
-    schemaPath,
-    k,
-    outputPath,
-    cwd,
-    timeout,
-    progress = false,
-    append = false,
-    grader,
-    debug = false,
-  } = config
+  const { k } = config
+  const ctx = await prepareExecution(config)
+  const { schema, prompts, sessions, resolvedWorkspaceDir, defaultWorkingDir, progress, grader } = ctx
-  // Load and validate schema
-  const schemaFile = Bun.file(schemaPath)
-  if (!(await schemaFile.exists())) {
-    throw new Error(`Schema file not found: ${schemaPath}`)
+  // Log progress info
+  logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
+  logProgress(`Running ${k} trials per prompt (${prompts.length * k} total executions)`, progress)
+  logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
+  logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
+  if (ctx.concurrency > 1) {
+    logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
   }
-  let schema: HeadlessAdapterConfig
-  try {
-    const rawSchema = await schemaFile.json()
-    schema = parseHeadlessConfig(rawSchema)
-  } catch (error) {
-    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
+  if (resolvedWorkspaceDir) {
+    logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
   }
-  // Load prompts
-  const prompts = await loadPrompts(promptsPath)
-  // Resolve output path
-  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
-  // Determine effective timeout (CLI flag > schema default > harness default)
-  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
-  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
-  // Log progress info
-  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
-  logProgress(`Running ${k} trials per prompt`, progress)
-  logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
-  logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
   if (grader) {
     logProgress('Grader: enabled (will compute pass@k metrics)', progress)
   }
-  // Create session manager with schema
-  const sessions = createSessionManager({
-    schema,
-    timeout: effectiveTimeout,
-    verbose: progress,
-    debug,
-  })
-  // Clear output file if not appending
-  if (resolvedOutputPath && !append) {
-    await Bun.write(resolvedOutputPath, '')
-  }
-  const workingDir = cwd ?? process.cwd()
-  const results: TrialResult[] = []
-  let isFirstOutput = true
-  // Run evaluations
-  for (let i = 0; i < prompts.length; i++) {
-    const promptCase = prompts[i]
-    if (!promptCase) continue
-    logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
+  // Process all trials for a single prompt
+  const processPromptTrials = async (promptCase: (typeof prompts)[number], index: number): Promise<TrialResult> => {
+    logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
     const trialEntries: TrialEntry[] = []
     for (let trialNum = 1; trialNum <= k; trialNum++) {
+      // Determine working directory (per-prompt workspace or default)
+      // For trials, include trial number in workspace path for isolation
+      const workingDir = resolvedWorkspaceDir
+        ? await createWorkspaceDir(resolvedWorkspaceDir, `${promptCase.id}-trial-${trialNum}`)
+        : defaultWorkingDir
       // Create fresh session for each trial
       const session = await sessions.create(workingDir)
       const startTime = Date.now()
@@ -190,8 +130,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
         const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
         const allUpdates: ParsedUpdate[] = []
-        // TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
         // Execute each turn sequentially
         for (const turnInput of inputs) {
           const turnResult = await sessions.prompt(session.id, turnInput)
@@ -223,7 +161,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
           entry.score = graderResult.score
           entry.reasoning = graderResult.reasoning
-          // Merge outcome from grader if present
           if (graderResult.outcome) {
             entry.outcome = graderResult.outcome
           }
@@ -234,9 +171,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
           `    Trial ${trialNum}/${k}: ${entry.pass !== undefined ? (entry.pass ? '✓' : '✗') : '?'}`,
           progress,
         )
-        // Clean up session
-        sessions.destroy(session.id)
       } catch (error) {
         const endTime = Date.now()
         const message = error instanceof Error ? error.message : String(error)
@@ -250,6 +184,9 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
           reasoning: `Error: ${message}`,
         })
         logProgress(`    Trial ${trialNum}/${k}: ! (error)`, progress)
+      } finally {
+        // Always clean up session
+        sessions.destroy(session.id)
       }
     }
@@ -260,6 +197,11 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
       ...(promptCase.hint && { hint: promptCase.hint }),
       k,
       trials: trialEntries,
+      metadata: {
+        ...promptCase.metadata,
+        agent: schema.name,
+        ...(resolvedWorkspaceDir && { workspaceDir: resolvedWorkspaceDir }),
+      },
     }
     // Calculate metrics if grader was used
@@ -270,23 +212,21 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
       result.passExpK = calculatePassExpK(passes, k)
     }
-    results.push(result)
-    // Write result immediately
-    const formatted = JSON.stringify(result)
-    await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
-    isFirstOutput = false
+    // Write result immediately (coordinated via mutex for concurrent writes)
+    await ctx.writeResult(result)
     if (grader) {
       logProgress(
-        `  → passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
+        `  → ${promptCase.id}: passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
         progress,
       )
     }
+    return result
   }
-  logProgress('Done!', progress)
-  return results
+  // Run with worker pool (parallelizes across prompts, trials for each prompt run sequentially)
+  return executePrompts(ctx, processPromptTrials)
 }
 // ============================================================================
@@ -311,6 +251,9 @@ export const trials = async (args: string[]): Promise<void> => {
       append: { type: 'boolean', default: false },
       grader: { type: 'string', short: 'g' },
       debug: { type: 'boolean', default: false },
+      stdin: { type: 'boolean', default: false },
+      concurrency: { type: 'string', short: 'j' },
+      'workspace-dir': { type: 'string' },
       help: { type: 'boolean', short: 'h' },
     },
     allowPositionals: true,
@@ -319,6 +262,7 @@ export const trials = async (args: string[]): Promise<void> => {
   if (values.help) {
     console.log(`
 Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]
+       cat prompts.jsonl | agent-eval-harness trials --stdin --schema <schema.json> [options]
 Arguments:
   prompts.jsonl     Input file with evaluation prompts
@@ -329,6 +273,9 @@ Options:
   -k                Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
   -c, --cwd         Working directory for agent
   -t, --timeout     Request timeout in ms (overrides schema default)
+  -j, --concurrency Number of concurrent workers (default: 1)
+  --stdin           Read prompts from stdin (mutually exclusive with file arg)
+  --workspace-dir   Base directory for per-trial workspace isolation
   --progress        Show progress to stderr
   --append          Append to output file
   -g, --grader      Path to grader (.ts/.js module or executable script)
@@ -343,22 +290,52 @@ Graders:
   TS/JS modules must export a 'grade' function.
   Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
+Parallelization:
+  Use -j/--concurrency to run multiple prompts' trials in parallel.
+  Each prompt's k trials still run sequentially (required for aggregation).
+  With 151 prompts and -j 4, you get 4 prompts running trials concurrently.
+  Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
+  at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
+  In memory-constrained environments (Docker, CI) this can cause OOM kills.
+  Use --stdin to pipe prompts for container-level orchestration.
+Workspace Isolation:
+  Use --workspace-dir to create per-trial directories.
+  Each trial runs in {workspace-dir}/prompt-{id}-trial-{n}/.
+  Useful for code generation tasks requiring filesystem isolation.
 Examples:
-  # Capture only
+  # Basic trials
   agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -o trials.jsonl
+  # Run 4 prompts' trials in parallel (4x faster for 151 prompts)
+  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -j 4 -o trials.jsonl
+  # With workspace isolation for code generation
+  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -j 4 \\
+    --workspace-dir ./workspaces -o trials.jsonl
   # With TypeScript grader
   agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl
-  # With Python grader
-  agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.py -o trials.jsonl
+  # Read prompts from stdin (container orchestration)
+  cat prompts.jsonl | agent-eval-harness trials --stdin -s claude.json -k 5 -o trials.jsonl
 `)
     return
   }
   const promptsPath = positionals[0]
-  if (!promptsPath) {
-    console.error('Error: prompts.jsonl path is required')
+  const useStdin = values.stdin ?? false
+  // Mutual exclusivity: --stdin and positional file
+  if (useStdin && promptsPath) {
+    console.error('Error: --stdin and prompts file argument are mutually exclusive')
+    process.exit(1)
+  }
+  if (!useStdin && !promptsPath) {
+    console.error('Error: prompts.jsonl path is required (or use --stdin)')
     process.exit(1)
   }
@@ -368,19 +345,23 @@ Examples:
     process.exit(1)
   }
-  // Load grader if specified
-  let grader: Grader | undefined
-  if (values.grader) {
-    try {
-      grader = await loadGrader(values.grader)
-    } catch (error) {
-      console.error(`Error: ${error instanceof Error ? error.message : error}`)
+  // Read prompts from stdin if requested
+  let prompts: PromptCase[] | undefined
+  if (useStdin) {
+    const stdinPrompts = await readStdinPrompts()
+    if (!stdinPrompts || stdinPrompts.length === 0) {
+      console.error('Error: no prompts received on stdin')
       process.exit(1)
     }
+    prompts = stdinPrompts
   }
+  // Load grader if specified
+  const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
   await runTrials({
-    promptsPath,
+    promptsPath: promptsPath ?? undefined,
+    prompts,
     schemaPath: values.schema,
     k: Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10),
     outputPath: values.output,
@@ -390,5 +371,7 @@ Examples:
     append: values.append ?? false,
     grader,
     debug: values.debug ?? false,
+    concurrency: parseConcurrency(values.concurrency),
+    workspaceDir: values['workspace-dir'],
   })
 }

package/src/commands/validate-refs.ts CHANGED Viewed

@@ -9,9 +9,9 @@
  */
 import { parseArgs } from 'node:util'
-import { loadGrader } from '../schemas/grader-loader.ts'
+import { loadPrompts, resolvePath } from '../core.ts'
+import { loadGraderOrExit } from '../schemas/grader-loader.ts'
 import type { Grader, ValidationResult } from '../schemas.ts'
-import { loadPrompts } from './capture.ts'
 // ============================================================================
 // Types
@@ -27,16 +27,6 @@ export type ValidateRefsConfig = {
   grader: Grader
 }
-// ============================================================================
-// Helpers
-// ============================================================================
-/** Resolve path relative to process.cwd() */
-const resolvePath = (path: string): string => {
-  if (path.startsWith('/')) return path
-  return `${process.cwd()}/${path}`
-}
 // ============================================================================
 // Validate-Refs Implementation
 // ============================================================================
@@ -171,13 +161,7 @@ Examples:
   }
   // Load grader
-  let grader: Grader
-  try {
-    grader = await loadGrader(values.grader)
-  } catch (error) {
-    console.error(`Error: ${error instanceof Error ? error.message : error}`)
-    process.exit(1)
-  }
+  const grader = await loadGraderOrExit(values.grader)
   await runValidateRefs({
     promptsPath,

package/src/core/core.ts CHANGED Viewed

@@ -11,9 +11,25 @@
  */
 // Loading utilities
-export { buildResultsIndex, countLines, loadJsonl, loadPrompts, loadResults, streamResults } from './loading.ts'
+export {
+  buildResultsIndex,
+  countLines,
+  loadJsonl,
+  loadPrompts,
+  loadResults,
+  readStdinPrompts,
+  streamResults,
+} from './loading.ts'
 // Output utilities
 export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
+// Native streaming utilities
+export {
+  countLinesStreaming,
+  streamJsonl,
+  streamPrompts,
+  streamResultsNative,
+  streamTrialResults,
+} from './streaming.ts'
 // Trajectory utilities
 export {
   detectTrajectoryRichness,
@@ -23,3 +39,13 @@ export {
   extractTrajectory,
   hasToolErrors,
 } from './trajectory.ts'
+// Worker pool utilities
+export {
+  createWorkspaceDir,
+  createWriteMutex,
+  type ProgressCallback,
+  runWorkerPool,
+  type WorkerPoolOptions,
+  type WorkerPoolResult,
+  type WriteMutex,
+} from './worker-pool.ts'

package/src/core/loading.ts CHANGED Viewed

@@ -39,6 +39,44 @@ export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
     })
 }
+/**
+ * Read prompts from stdin as JSONL.
+ *
+ * @remarks
+ * Reads all data from stdin, parses each line as JSON, and validates against
+ * PromptCaseSchema. Returns null when stdin is a TTY (no piped input).
+ * Uses chunked Buffer reads matching the pattern in pipeline/run.ts.
+ *
+ * @returns Parsed and validated prompt cases, or null if stdin is a TTY
+ * @throws Error if any line is invalid JSON or fails schema validation
+ *
+ * @public
+ */
+export const readStdinPrompts = async (): Promise<PromptCase[] | null> => {
+  if (process.stdin.isTTY) {
+    return null
+  }
+  const chunks: Buffer[] = []
+  for await (const chunk of process.stdin) {
+    chunks.push(chunk)
+  }
+  const content = Buffer.concat(chunks).toString('utf-8').trim()
+  if (!content) return null
+  return content
+    .split('\n')
+    .filter(Boolean)
+    .map((line, index) => {
+      try {
+        return PromptCaseSchema.parse(JSON.parse(line))
+      } catch (error) {
+        throw new Error(`Invalid stdin prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
+      }
+    })
+}
 /**
  * Load capture results from a JSONL file.
  *
@@ -99,12 +137,21 @@ export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
 // Streaming Loading
 // ============================================================================
+// Re-export native streaming functions for backward compatibility
+export {
+  countLinesStreaming,
+  streamJsonl,
+  streamPrompts,
+  streamResultsNative,
+  streamTrialResults,
+} from './streaming.ts'
 /**
  * Stream capture results from a JSONL file.
  *
  * @remarks
  * Memory-efficient alternative to loadResults for large files.
- * Yields results one at a time using an async generator.
+ * Uses native streaming via Bun.file().stream() for O(1) memory usage.
  *
  * @param path - Path to the results.jsonl file
  * @yields Parsed and validated capture results
@@ -113,20 +160,8 @@ export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
  * @public
  */
 export async function* streamResults(path: string): AsyncGenerator<CaptureResult, void, unknown> {
-  const file = Bun.file(path)
-  const text = await file.text()
-  const lines = text.split('\n')
-  for (let i = 0; i < lines.length; i++) {
-    const line = lines[i]?.trim()
-    if (!line) continue
-    try {
-      yield CaptureResultSchema.parse(JSON.parse(line))
-    } catch (error) {
-      throw new Error(`Invalid result at line ${i + 1}: ${error instanceof Error ? error.message : error}`)
-    }
-  }
+  const { streamResultsNative } = await import('./streaming.ts')
+  yield* streamResultsNative(path)
 }
 /**
@@ -159,7 +194,7 @@ export const buildResultsIndex = async (path: string): Promise<Map<string, Captu
  *
  * @remarks
  * Useful for detecting large files that should use streaming mode.
- * Uses byte-level scanning for efficiency.
+ * Uses native streaming for O(1) memory usage.
  *
  * @param path - Path to the JSONL file
  * @returns Number of non-empty lines
@@ -167,7 +202,6 @@ export const buildResultsIndex = async (path: string): Promise<Map<string, Captu
  * @public
  */
 export const countLines = async (path: string): Promise<number> => {
-  const file = Bun.file(path)
-  const text = await file.text()
-  return text.split('\n').filter((line) => line.trim()).length
+  const { countLinesStreaming } = await import('./streaming.ts')
+  return countLinesStreaming(path)
 }