npm - @plaited/agent-eval-harness - Versions diffs - 0.9.0 → 0.11.0 - Mend

@plaited/agent-eval-harness 0.9.0 → 0.11.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (21)

package/README.md +10 -0
package/package.json +1 -1
package/src/commands/balance.ts +1 -11
package/src/commands/calibrate.ts +2 -10
package/src/commands/capture.ts +104 -114
package/src/commands/execution.ts +245 -0
package/src/commands/tests/capture-cli.spec.ts +84 -0
package/src/commands/tests/trials-cli.spec.ts +68 -0
package/src/commands/trials.ts +98 -115
package/src/commands/validate-refs.ts +3 -19
package/src/core/core.ts +27 -1
package/src/core/loading.ts +53 -19
package/src/core/streaming.ts +172 -0
package/src/core/tests/streaming.spec.ts +399 -0
package/src/core/tests/worker-pool.spec.ts +377 -0
package/src/core/worker-pool.ts +220 -0
package/src/core.ts +15 -0
package/src/schemas/grader-loader.ts +23 -6
package/src/schemas/schemas-cli.ts +1 -6
package/src/schemas/schemas.ts +2 -0
package/src/schemas.ts +1 -1

package/src/commands/execution.ts ADDED Viewed

@@ -0,0 +1,245 @@
+/**
+ * Shared execution utilities for capture and trials commands.
+ *
+ * @remarks
+ * Extracts common setup logic: schema loading, prompt loading, path resolution,
+ * session manager creation, output initialization, and worker pool execution.
+ *
+ * @packageDocumentation
+ */
+import { mkdir } from 'node:fs/promises'
+import { createWriteMutex, loadPrompts, logProgress, resolvePath, runWorkerPool, writeOutput } from '../core.ts'
+import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
+import { createSessionManager, type SessionManager } from '../headless/headless-session-manager.ts'
+import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
+import type { Grader, PromptCase } from '../schemas.ts'
+// ============================================================================
+// Types
+// ============================================================================
+/**
+ * Options accepted by both the capture and trials commands.
+ *
+ * @remarks
+ * Only `schemaPath` is required by the type. `prepareExecution` additionally
+ * rejects a config that supplies neither `prompts` nor `promptsPath`.
+ */
+export type BaseExecutionConfig = {
+  // --- Inputs -------------------------------------------------------------
+  /** Location of the agent schema JSON file */
+  schemaPath: string
+  /** Location of the prompts.jsonl file; needed only when `prompts` is absent */
+  promptsPath?: string
+  /** Already-loaded prompt cases (e.g. read from stdin); when present, no file is loaded */
+  prompts?: PromptCase[]
+  // --- Output -------------------------------------------------------------
+  /** Destination file for results; leave undefined to write to stdout */
+  outputPath?: string
+  /** Keep existing output file content instead of clearing it first (defaults to false) */
+  append?: boolean
+  /** Emit progress messages on stderr (defaults to false) */
+  progress?: boolean
+  // --- Agent execution ----------------------------------------------------
+  /** Working directory handed to the agent */
+  cwd?: string
+  /** Per-prompt timeout in milliseconds; takes precedence over the schema's timeout */
+  timeout?: number
+  /** How many workers run concurrently (defaults to 1, i.e. sequential) */
+  concurrency?: number
+  /** Root directory under which per-prompt isolated workspaces live */
+  workspaceDir?: string
+  // --- Extras -------------------------------------------------------------
+  /** Grader function applied to results, if any */
+  grader?: Grader
+  /** Turn on debug mode (defaults to false) */
+  debug?: boolean
+}
+/**
+ * Fully resolved run state produced by `prepareExecution`.
+ *
+ * @remarks
+ * Every optional flag from the base configuration has been replaced by a
+ * concrete value here; only the output path, workspace directory, and grader
+ * may still be undefined.
+ */
+export type ExecutionContext = {
+  // --- Loaded inputs ------------------------------------------------------
+  /** Headless adapter schema after parsing and validation */
+  schema: HeadlessAdapterConfig
+  /** Prompt cases to run */
+  prompts: PromptCase[]
+  /** Grader function, when one was configured */
+  grader?: Grader
+  // --- Runtime services ---------------------------------------------------
+  /** Manager used to create and destroy agent sessions */
+  sessions: SessionManager
+  /** Serializes a result to one JSONL line and writes it under the write mutex */
+  writeResult: (result: unknown) => Promise<void>
+  // --- Resolved locations -------------------------------------------------
+  /** Resolved output file path; undefined means stdout */
+  resolvedOutputPath?: string
+  /** Resolved workspace base directory, when one was configured */
+  resolvedWorkspaceDir?: string
+  /** Working directory agent sessions use by default */
+  defaultWorkingDir: string
+  // --- Effective settings -------------------------------------------------
+  /** Timeout in milliseconds after applying the override chain */
+  effectiveTimeout: number
+  /** Worker count for the pool */
+  concurrency: number
+  /** True when progress messages should be emitted */
+  progress: boolean
+  /** True when debug mode is on */
+  debug: boolean
+}
+// ============================================================================
+// Execution Setup
+// ============================================================================
+/**
+ * Prepare execution context from base configuration.
+ *
+ * @remarks
+ * Performs the shared setup in this order: prompt-source validation, schema
+ * file existence check and parsing, prompt loading (skipped when
+ * `config.prompts` is supplied), path resolution, timeout selection, session
+ * manager creation, output file truncation (skipped when appending), workspace
+ * directory creation, and construction of a mutex-guarded JSONL writer.
+ *
+ * In append mode every write, including the first one, is issued as an append
+ * so that results already present in the output file are preserved.
+ *
+ * @param config - Base execution configuration
+ * @returns Prepared execution context
+ * @throws Error if neither `prompts` nor `promptsPath` is provided
+ * @throws Error if the schema file does not exist, or cannot be read/validated
+ *
+ * @public
+ */
+export const prepareExecution = async (config: BaseExecutionConfig): Promise<ExecutionContext> => {
+  const {
+    promptsPath,
+    schemaPath,
+    outputPath,
+    cwd,
+    timeout,
+    progress = false,
+    append = false,
+    grader,
+    debug = false,
+    concurrency = 1,
+    workspaceDir,
+  } = config
+  // Validate prompt source before touching the filesystem
+  if (!config.prompts && !promptsPath) {
+    throw new Error('Either promptsPath or prompts must be provided')
+  }
+  // Load and validate schema
+  const schemaFile = Bun.file(schemaPath)
+  if (!(await schemaFile.exists())) {
+    throw new Error(`Schema file not found: ${schemaPath}`)
+  }
+  let schema: HeadlessAdapterConfig
+  try {
+    const rawSchema = await schemaFile.json()
+    schema = parseHeadlessConfig(rawSchema)
+  } catch (error) {
+    // JSON read failures and schema validation failures surface under one message
+    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
+  }
+  // Load prompts; the guard above guarantees promptsPath is set when prompts is absent
+  const prompts = config.prompts ?? (await loadPrompts(promptsPath!))
+  // Resolve paths
+  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
+  const resolvedWorkspaceDir = workspaceDir ? resolvePath(workspaceDir) : undefined
+  // Determine effective timeout (CLI flag > schema default > harness default)
+  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
+  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
+  // Create session manager
+  const sessions = createSessionManager({
+    schema,
+    timeout: effectiveTimeout,
+    verbose: progress,
+    debug,
+  })
+  // Initialize output file (clear if not appending)
+  if (resolvedOutputPath && !append) {
+    await Bun.write(resolvedOutputPath, '')
+  }
+  // Create workspace base directory if specified
+  if (resolvedWorkspaceDir) {
+    await mkdir(resolvedWorkspaceDir, { recursive: true })
+  }
+  const defaultWorkingDir = cwd ?? process.cwd()
+  // Create write mutex with closure for coordinated result writing
+  const writeMutex = createWriteMutex()
+  let isFirstOutput = true
+  const writeResult = async (result: unknown) => {
+    await writeMutex.write(async () => {
+      const formatted = JSON.stringify(result)
+      // In append mode the file was intentionally not cleared above, so the very
+      // first write must also append; otherwise it would replace prior results.
+      // NOTE(review): assumes writeOutput's third argument means "append to the
+      // file rather than overwrite" — confirm against its definition in core.
+      await writeOutput(formatted, resolvedOutputPath, append || !isFirstOutput)
+      isFirstOutput = false
+    })
+  }
+  return {
+    schema,
+    prompts,
+    sessions,
+    resolvedOutputPath,
+    resolvedWorkspaceDir,
+    effectiveTimeout,
+    defaultWorkingDir,
+    concurrency,
+    progress,
+    grader,
+    debug,
+    writeResult,
+  }
+}
+// ============================================================================
+// Worker Pool Execution
+// ============================================================================
+/**
+ * Run every prompt in the context through the worker pool.
+ *
+ * @remarks
+ * Thin wrapper around `runWorkerPool` shared by capture and trials. It reports
+ * progress after each completion, logs the number of errors when there are
+ * any, and logs a final completion message. Only the results are returned to
+ * the caller; the pool's `errors` array is used solely for its length.
+ *
+ * @param ctx - Execution context from prepareExecution
+ * @param processFn - Function to process each prompt
+ * @returns Array of results
+ *
+ * @public
+ */
+export const executePrompts = async <T>(
+  ctx: ExecutionContext,
+  processFn: (promptCase: PromptCase, index: number) => Promise<T>,
+): Promise<T[]> => {
+  // Single place that forwards messages together with the context's progress flag
+  const report = (message: string) => logProgress(message, ctx.progress)
+  const outcome = await runWorkerPool(ctx.prompts, processFn, {
+    concurrency: ctx.concurrency,
+    onProgress: (completed, total) => {
+      report(`Progress: ${completed}/${total} prompts completed`)
+    },
+  })
+  const failureCount = outcome.errors.length
+  if (failureCount > 0) {
+    report(`Completed with ${failureCount} error(s)`)
+  }
+  report('Done!')
+  return outcome.results
+}
+// ============================================================================
+// CLI Helpers
+// ============================================================================
+/**
+ * Parse and validate concurrency CLI argument.
+ *
+ * @remarks
+ * The value must consist solely of decimal digits (an optional leading `+` and
+ * surrounding whitespace are tolerated). Partially numeric input such as
+ * `"4abc"`, `"2.5"`, or `"1e3"` is rejected rather than silently truncated,
+ * which is what a bare `Number.parseInt` would do. On invalid input an error
+ * is printed to stderr and the process exits with code 1.
+ *
+ * @param value - Raw string value from parseArgs
+ * @returns Validated positive integer (default: 1 when the value is undefined or empty)
+ *
+ * @public
+ */
+export const parseConcurrency = (value: string | undefined): number => {
+  if (!value) return 1
+  const trimmed = value.trim()
+  // Check the whole string up front: parseInt stops at the first non-digit
+  // and would otherwise accept trailing garbage.
+  const parsed = /^\+?\d+$/.test(trimmed) ? Number.parseInt(trimmed, 10) : Number.NaN
+  // isSafeInteger also rejects NaN and digit strings too large to be exact
+  if (!Number.isSafeInteger(parsed) || parsed < 1) {
+    console.error('Error: --concurrency must be a positive integer')
+    process.exit(1)
+  }
+  return parsed
+}

package/src/commands/tests/capture-cli.spec.ts CHANGED Viewed

@@ -117,10 +117,14 @@ describe('runCapture configuration', () => {
       progress: true,
       append: false,
       debug: false,
+      concurrency: 4,
+      workspaceDir: '/tmp/workspaces',
     }
     expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
     expect(config.schemaPath).toBe('./schemas/claude-headless.json')
+    expect(config.concurrency).toBe(4)
+    expect(config.workspaceDir).toBe('/tmp/workspaces')
   })
   test('CaptureConfig allows minimal configuration', () => {
@@ -135,6 +139,18 @@ describe('runCapture configuration', () => {
     expect(config.progress).toBeUndefined()
     expect(config.append).toBeUndefined()
     expect(config.grader).toBeUndefined()
+    expect(config.concurrency).toBeUndefined()
+    expect(config.workspaceDir).toBeUndefined()
+  })
+  test('CaptureConfig accepts prompts without promptsPath', () => {
+    const config: CaptureConfig = {
+      schemaPath: './test-schema.json',
+      prompts: [{ id: 't1', input: 'hello' }],
+    }
+    expect(config.promptsPath).toBeUndefined()
+    expect(config.prompts).toHaveLength(1)
   })
 })
@@ -160,6 +176,25 @@ describe('capture CLI', () => {
     expect(stdout).toContain('--progress')
     expect(stdout).toContain('-g, --grader')
     expect(stdout).toContain('-s, --schema')
+    expect(stdout).toContain('-j, --concurrency')
+    expect(stdout).toContain('--workspace-dir')
+    expect(stdout).toContain('--stdin')
+  })
+  test('shows error for --stdin with positional file', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
   })
   test('shows error for missing prompts file argument', async () => {
@@ -187,4 +222,53 @@ describe('capture CLI', () => {
     expect(exitCode).not.toBe(0)
     expect(stderr).toContain('--schema is required')
   })
+  test('shows error for invalid concurrency value', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--concurrency must be a positive integer')
+  })
+  test('shows error for zero concurrency', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--concurrency must be a positive integer')
+  })
+  test('shows error for negative concurrency', async () => {
+    // Note: Using --concurrency=-1 format because -j -1 is ambiguous to parseArgs
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'capture', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '--concurrency=-1'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--concurrency must be a positive integer')
+  })
 })

package/src/commands/tests/trials-cli.spec.ts CHANGED Viewed

@@ -17,11 +17,15 @@ describe('TrialsConfig configuration', () => {
       progress: true,
       append: false,
       debug: false,
+      concurrency: 4,
+      workspaceDir: '/tmp/workspaces',
     }
     expect(config.promptsPath).toBe('/tmp/prompts.jsonl')
     expect(config.schemaPath).toBe('./schemas/claude-headless.json')
     expect(config.k).toBe(5)
+    expect(config.concurrency).toBe(4)
+    expect(config.workspaceDir).toBe('/tmp/workspaces')
   })
   test('TrialsConfig allows minimal configuration', () => {
@@ -37,6 +41,19 @@ describe('TrialsConfig configuration', () => {
     expect(config.progress).toBeUndefined()
     expect(config.append).toBeUndefined()
     expect(config.grader).toBeUndefined()
+    expect(config.concurrency).toBeUndefined()
+    expect(config.workspaceDir).toBeUndefined()
+  })
+  test('TrialsConfig accepts prompts without promptsPath', () => {
+    const config: TrialsConfig = {
+      schemaPath: './test-schema.json',
+      k: 3,
+      prompts: [{ id: 't1', input: 'hello' }],
+    }
+    expect(config.promptsPath).toBeUndefined()
+    expect(config.prompts).toHaveLength(1)
   })
 })
@@ -64,6 +81,25 @@ describe('trials CLI', () => {
     expect(stdout).toContain('-g, --grader')
     expect(stdout).toContain('-s, --schema')
     expect(stdout).toContain('pass@k')
+    expect(stdout).toContain('-j, --concurrency')
+    expect(stdout).toContain('--workspace-dir')
+    expect(stdout).toContain('--stdin')
+  })
+  test('shows error for --stdin with positional file', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '--stdin', '-s', '/tmp/schema.json'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--stdin and prompts file argument are mutually exclusive')
   })
   test('shows error for missing prompts file argument', async () => {
@@ -91,6 +127,38 @@ describe('trials CLI', () => {
     expect(exitCode).not.toBe(0)
     expect(stderr).toContain('--schema is required')
   })
+  test('shows error for invalid concurrency value', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', 'abc'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--concurrency must be a positive integer')
+  })
+  test('shows error for zero concurrency', async () => {
+    const proc = Bun.spawn(
+      ['bun', './bin/cli.ts', 'trials', '/tmp/prompts.jsonl', '-s', '/tmp/schema.json', '-j', '0'],
+      {
+        stdout: 'pipe',
+        stderr: 'pipe',
+      },
+    )
+    const stderr = await new Response(proc.stderr).text()
+    const exitCode = await proc.exited
+    expect(exitCode).not.toBe(0)
+    expect(stderr).toContain('--concurrency must be a positive integer')
+  })
 })
 // ============================================================================