npm - @plaited/agent-eval-harness - Versions diffs - 0.9.0 → 0.10.0 - Mend

@plaited/agent-eval-harness 0.9.0 → 0.10.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Files changed (14)

package/README.md +10 -0
package/package.json +1 -1
package/src/commands/capture.ts +101 -26
package/src/commands/tests/capture-cli.spec.ts +57 -0
package/src/commands/tests/trials-cli.spec.ts +40 -0
package/src/commands/trials.ts +111 -28
package/src/core/core.ts +18 -0
package/src/core/loading.ts +15 -19
package/src/core/streaming.ts +172 -0
package/src/core/tests/streaming.spec.ts +399 -0
package/src/core/tests/worker-pool.spec.ts +377 -0
package/src/core/worker-pool.ts +220 -0
package/src/core.ts +14 -0
package/src/schemas/schemas.ts +2 -0

package/README.md CHANGED Viewed

@@ -58,11 +58,21 @@ bunx @plaited/agent-eval-harness capture prompts.jsonl \
   --schema ./schemas/claude-headless.json \
   -o results.jsonl
+# Parallel capture (4x faster with 4 workers)
+bunx @plaited/agent-eval-harness capture prompts.jsonl \
+  --schema ./schemas/claude-headless.json \
+  -j 4 -o results.jsonl
 # Run trials for pass@k analysis with debug mode
 bunx @plaited/agent-eval-harness trials prompts.jsonl \
   --schema ./schemas/claude-headless.json \
   -k 5 --grader ./grader.ts --debug
+# Parallel trials (4 prompts running trials concurrently)
+bunx @plaited/agent-eval-harness trials prompts.jsonl \
+  --schema ./schemas/claude-headless.json \
+  -k 5 -j 4 --workspace-dir ./workspaces -o trials.jsonl
 # Summarize results
 bunx @plaited/agent-eval-harness summarize results.jsonl -o summary.jsonl

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.9.0",
+  "version": "0.10.0",
   "description": "CLI tool for capturing agent trajectories from headless CLI agents",
   "license": "ISC",
   "engines": {

package/src/commands/capture.ts CHANGED Viewed

@@ -11,8 +11,11 @@
  * @packageDocumentation
  */
+import { mkdir } from 'node:fs/promises'
 import { parseArgs } from 'node:util'
 import {
+  createWorkspaceDir,
+  createWriteMutex,
   detectTrajectoryRichness,
   extractOutput,
   extractTrajectory,
@@ -21,6 +24,7 @@ import {
   loadPrompts,
   logProgress,
   resolvePath,
+  runWorkerPool,
   writeOutput,
 } from '../core.ts'
 import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
@@ -70,6 +74,10 @@ export type CaptureConfig = {
   grader?: Grader
   /** Enable debug mode for detailed output */
   debug?: boolean
+  /** Number of concurrent workers (default: 1 for sequential) */
+  concurrency?: number
+  /** Base directory for per-prompt workspace isolation */
+  workspaceDir?: string
 }
 // ============================================================================
@@ -97,6 +105,8 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
     append = false,
     grader,
     debug = false,
+    concurrency = 1,
+    workspaceDir,
   } = config
   // Load and validate schema
@@ -116,8 +126,9 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
   // Load prompts
   const prompts = await loadPrompts(promptsPath)
-  // Resolve output path
+  // Resolve paths
   const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
+  const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
   // Determine effective timeout (CLI flag > schema default > harness default)
   const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
@@ -127,6 +138,12 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
   logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
   logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
   logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
+  if (concurrency > 1) {
+    logProgress(`Concurrency: ${concurrency} workers`, progress)
+  }
+  if (resolvedWorkspaceDir) {
+    logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
+  }
   if (resolvedOutputPath) {
     logProgress(`Output: ${resolvedOutputPath}`, progress)
   }
@@ -147,24 +164,36 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
     await Bun.write(resolvedOutputPath, '')
   }
-  const workingDir = cwd ?? process.cwd()
-  const results: CaptureResult[] = []
+  // Create workspace base directory if specified
+  // Uses fs.mkdir instead of shell to prevent command injection
+  if (resolvedWorkspaceDir) {
+    await mkdir(resolvedWorkspaceDir, { recursive: true })
+  }
+  const defaultWorkingDir = cwd ?? process.cwd()
+  // Create write mutex for coordinating JSONL output
+  const writeMutex = createWriteMutex()
   let isFirstOutput = true
-  // Run evaluations sequentially - fresh session per entry
-  for (let i = 0; i < prompts.length; i++) {
-    const promptCase = prompts[i]
-    if (!promptCase) continue
+  // Process a single prompt (used by worker pool)
+  const processPrompt = async (promptCase: (typeof prompts)[number], index: number): Promise<CaptureResult> => {
+    // Determine working directory (per-prompt workspace or default)
+    const workingDir = resolvedWorkspaceDir
+      ? await createWorkspaceDir(resolvedWorkspaceDir, promptCase.id)
+      : defaultWorkingDir
-    logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
+    logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
     const startTime = Date.now()
     let result: CaptureResult
+    let sessionId: string | undefined
     try {
       // Create fresh session for each entry (ensures isolation)
       const sessionStart = Date.now()
       const session = await sessions.create(workingDir)
+      sessionId = session.id
       const sessionCreation = Date.now() - sessionStart
       logProgress(`  Session: ${session.id}`, progress)
@@ -177,9 +206,6 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
       let lastExitInfo: ProcessExitInfo | undefined
       let lastOutput = ''
-      // TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
-      // The session manager would need to accept timeout per-call to support this
       // Execute each turn sequentially in the same session
       for (const turnInput of inputs) {
         const turnResult: PromptResult = await sessions.prompt(session.id, turnInput)
@@ -198,7 +224,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
       result = {
         id: promptCase.id,
-        input: promptCase.input, // Preserve original (string or array)
+        input: promptCase.input,
         output,
         ...(promptCase.hint && { hint: promptCase.hint }),
         trajectory,
@@ -207,6 +233,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
           agent: schema.name,
           trajectoryRichness,
           turnCount,
+          ...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
           ...(lastExitInfo && {
             exitCode: lastExitInfo.exitCode,
             signal: lastExitInfo.signal,
@@ -236,14 +263,10 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
         result.score = graderResult
-        // Merge outcome from grader if present
         if (graderResult.outcome) {
           result.outcome = graderResult.outcome
         }
       }
-      // Clean up session
-      sessions.destroy(session.id)
     } catch (error) {
       const endTime = Date.now()
       const message = error instanceof Error ? error.message : String(error)
@@ -259,6 +282,7 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
           agent: schema.name,
           trajectoryRichness: 'minimal' as TrajectoryRichness,
           turnCount: inputs.length,
+          ...(resolvedWorkspaceDir && { workspaceDir: workingDir }),
         },
         timing: {
           start: startTime,
@@ -269,14 +293,19 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
         toolErrors: true,
         errors: [message],
       }
+    } finally {
+      // Always clean up session if it was created
+      if (sessionId) {
+        sessions.destroy(sessionId)
+      }
     }
-    results.push(result)
-    // Write result immediately
-    const formatted = JSON.stringify(result)
-    await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
-    isFirstOutput = false
+    // Write result immediately (coordinated via mutex for concurrent writes)
+    await writeMutex.write(async () => {
+      const formatted = JSON.stringify(result)
+      await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
+      isFirstOutput = false
+    })
     const statusIcon = result.toolErrors ? '!' : '✓'
     const exitInfo = result.metadata?.timedOut
@@ -284,7 +313,22 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
       : result.metadata?.exitCode && result.metadata.exitCode !== 0
         ? ` - exit ${result.metadata.exitCode}`
         : ''
-    logProgress(`  ${statusIcon} (${result.timing.total}ms)${exitInfo}`, progress)
+    logProgress(`  ${statusIcon} ${promptCase.id} (${result.timing.total}ms)${exitInfo}`, progress)
+    return result
+  }
+  // Run with worker pool
+  const { results, errors } = await runWorkerPool(prompts, processPrompt, {
+    concurrency,
+    onProgress: (completed, total) => {
+      logProgress(`Progress: ${completed}/${total} prompts completed`, progress)
+    },
+  })
+  // Log any errors that occurred
+  if (errors.length > 0) {
+    logProgress(`Completed with ${errors.length} error(s)`, progress)
   }
   logProgress('Done!', progress)
@@ -312,6 +356,8 @@ export const capture = async (args: string[]): Promise<void> => {
       append: { type: 'boolean', default: false },
       grader: { type: 'string', short: 'g' },
       debug: { type: 'boolean', default: false },
+      concurrency: { type: 'string', short: 'j' },
+      'workspace-dir': { type: 'string' },
       help: { type: 'boolean', short: 'h' },
     },
     allowPositionals: true,
@@ -329,6 +375,8 @@ Options:
   -o, --output      Output file (default: stdout)
   -c, --cwd         Working directory for agent
   -t, --timeout     Request timeout in ms (overrides schema default)
+  -j, --concurrency Number of concurrent workers (default: 1)
+  --workspace-dir   Base directory for per-prompt workspace isolation
   --progress        Show progress to stderr
   --append          Append to output file instead of overwriting
   -g, --grader      Path to grader (.ts/.js module or executable script)
@@ -348,18 +396,32 @@ Graders:
   TS/JS modules must export a 'grade' function.
   Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
+Parallelization:
+  Use -j/--concurrency to run multiple prompts in parallel.
+  Each prompt gets its own agent session for isolation.
+  Results are written as they complete (order may differ from input).
+Workspace Isolation:
+  Use --workspace-dir to create per-prompt directories.
+  Each prompt runs in {workspace-dir}/prompt-{id}/.
+  Useful for code generation tasks requiring filesystem isolation.
 Examples:
   # Basic capture with schema
   agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl
+  # Run 4 prompts in parallel
+  agent-eval-harness capture prompts.jsonl -s claude.json -j 4 -o results.jsonl
+  # With workspace isolation for code generation
+  agent-eval-harness capture prompts.jsonl -s claude.json -j 4 \\
+    --workspace-dir ./workspaces -o results.jsonl
   # With TypeScript grader
   agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl
   # With debug mode
   agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
-  # With per-prompt timeout override (in prompts.jsonl):
-  {"id": "slow-task", "input": "...", "timeout": 180000}
 `)
     return
   }
@@ -387,6 +449,17 @@ Examples:
     }
   }
+  // Validate and parse concurrency
+  let concurrency = 1
+  if (values.concurrency) {
+    const parsed = Number.parseInt(values.concurrency, 10)
+    if (Number.isNaN(parsed) || parsed < 1) {
+      console.error('Error: --concurrency must be a positive integer')
+      process.exit(1)
+    }
+    concurrency = parsed
+  }
   await runCapture({
     promptsPath,
     schemaPath: values.schema,
@@ -397,5 +470,7 @@ Examples:
     append: values.append ?? false,
     grader,
     debug: values.debug ?? false,
+    concurrency,
+    workspaceDir: values['workspace-dir'],
   })
 }

package/src/commands/tests/capture-cli.spec.ts CHANGED Viewed

@@ -117,10 +117,14 @@ describe('runCapture configuration', () => {
       progress: true,
       append: false,
       debug: false,
+      concurrency: 4,
+      workspaceDir: '/tmp/workspaces',
     }
     expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
     expect(config.schemaPath).toBe('./schemas/claude-headless.json')
+    expect(config.concurrency).toBe(4)
+    expect(config.workspaceDir).toBe('/tmp/workspaces')
   })
   test('CaptureConfig allows minimal configuration', () => {
@@ -135,6 +139,8 @@ describe('runCapture configuration', () => {
     expect(config.progress).toBeUndefined()
     expect(config.append).toBeUndefined()
     expect(config.grader).toBeUndefined()
+    expect(config.concurrency).toBeUndefined()
+    expect(config.workspaceDir).toBeUndefined()
   })
 })
@@ -160,6 +166,8 @@ describe('capture CLI', () => {
     expect(stdout).toContain('--progress')
     expect(stdout).toContain('-g, --grader')
     expect(stdout).toContain('-s, --schema')
+    expect(stdout).toContain('-j, --concurrency')
+    expect(stdout).toContain('--workspace-dir')
   })
   test('shows error for missing prompts file argument', async () => {
@@ -187,4 +195,53 @@ describe('capture CLI', () => {
     expect(exitCode).not.toBe(0)
     expect(stderr).toContain('--schema is required')
   })
+  test('shows error for invalid concurrency value', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--concurrency must be a positive integer')
+  })
+  test('shows error for zero concurrency', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--concurrency must be a positive integer')
+  })
+  test('shows error for negative concurrency', async () => {
+    // Note: Using --concurrency=-1 format because -j -1 is ambiguous to parseArgs
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '--concurrency=-1'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--concurrency must be a positive integer')
+  })
 })

package/src/commands/tests/trials-cli.spec.ts CHANGED Viewed

@@ -17,11 +17,15 @@ describe('TrialsConfig configuration', () => {
       progress: true,
       append: false,
       debug: false,
+      concurrency: 4,
+      workspaceDir: '/tmp/workspaces',
     }
     expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
     expect(config.schemaPath).toBe('./schemas/claude-headless.json')
     expect(config.k).toBe(5)
+    expect(config.concurrency).toBe(4)
+    expect(config.workspaceDir).toBe('/tmp/workspaces')
   })
   test('TrialsConfig allows minimal configuration', () => {
@@ -37,6 +41,8 @@ describe('TrialsConfig configuration', () => {
     expect(config.progress).toBeUndefined()
     expect(config.append).toBeUndefined()
     expect(config.grader).toBeUndefined()
+    expect(config.concurrency).toBeUndefined()
+    expect(config.workspaceDir).toBeUndefined()
   })
 })
@@ -64,6 +70,8 @@ describe('trials CLI', () => {
     expect(stdout).toContain('-g, --grader')
     expect(stdout).toContain('-s, --schema')
     expect(stdout).toContain('pass@k')
+    expect(stdout).toContain('-j, --concurrency')
+    expect(stdout).toContain('--workspace-dir')
   })
   test('shows error for missing prompts file argument', async () => {
@@ -91,6 +99,38 @@ describe('trials CLI', () => {
     expect(exitCode).not.toBe(0)
     expect(stderr).toContain('--schema is required')
   })
+  test('shows error for invalid concurrency value', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--concurrency must be a positive integer')
+  })
+  test('shows error for zero concurrency', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--concurrency must be a positive integer')
+  })
 })
 // ============================================================================