npm - @plaited/agent-eval-harness - Versions diffs - 0.5.0 - Mend

@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

package/LICENSE +15 -0
package/README.md +273 -0
package/bin/cli.ts +162 -0
package/bin/tests/cli.spec.ts +529 -0
package/package.json +67 -0
package/src/commands/balance.ts +257 -0
package/src/commands/calibrate.ts +313 -0
package/src/commands/capture.ts +393 -0
package/src/commands/summarize.ts +228 -0
package/src/commands/tests/balance-helpers.spec.ts +279 -0
package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
package/src/commands/tests/capture-cli.spec.ts +190 -0
package/src/commands/tests/capture-helpers.spec.ts +524 -0
package/src/commands/tests/summarize-helpers.spec.ts +339 -0
package/src/commands/tests/trials-calculations.spec.ts +209 -0
package/src/commands/tests/trials-cli.spec.ts +147 -0
package/src/commands/trials.ts +388 -0
package/src/commands/validate-refs.ts +188 -0
package/src/commands.ts +33 -0
package/src/core/core.ts +25 -0
package/src/core/loading.ts +96 -0
package/src/core/output.ts +121 -0
package/src/core/tests/core.spec.ts +309 -0
package/src/core/trajectory.ts +166 -0
package/src/core.ts +28 -0
package/src/harness.ts +46 -0
package/src/headless/headless-cli.ts +430 -0
package/src/headless/headless-history-builder.ts +141 -0
package/src/headless/headless-output-parser.ts +366 -0
package/src/headless/headless-session-manager.ts +587 -0
package/src/headless/headless.schemas.ts +310 -0
package/src/headless/headless.types.ts +19 -0
package/src/headless/tests/headless.spec.ts +678 -0
package/src/headless.ts +72 -0
package/src/integration_tests/claude.spec.ts +157 -0
package/src/integration_tests/gemini.spec.ts +139 -0
package/src/pipeline/compare.ts +325 -0
package/src/pipeline/extract.ts +241 -0
package/src/pipeline/format.ts +292 -0
package/src/pipeline/grade.ts +169 -0
package/src/pipeline/pipeline.ts +41 -0
package/src/pipeline/pipeline.types.ts +241 -0
package/src/pipeline/run.ts +412 -0
package/src/pipeline/tests/pipeline.spec.ts +356 -0
package/src/pipeline.ts +34 -0
package/src/schemas/constants.ts +94 -0
package/src/schemas/grader-loader.ts +174 -0
package/src/schemas/schemas-cli.ts +239 -0
package/src/schemas/schemas.ts +558 -0
package/src/schemas/tests/constants.spec.ts +121 -0
package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
package/src/schemas/tests/fixtures/grader-exec.py +29 -0
package/src/schemas/tests/fixtures/grader-module.ts +14 -0
package/src/schemas/tests/grader-loader.spec.ts +153 -0
package/src/schemas/tests/schemas-cli.spec.ts +142 -0
package/src/schemas/tests/schemas.spec.ts +606 -0
package/src/schemas.ts +90 -0

package/src/pipeline/pipeline.ts ADDED Viewed

@@ -0,0 +1,41 @@
+/**
+ * Pipeline commands for Unix-style composable evaluation.
+ *
+ * @remarks
+ * Re-exports pipeline commands and types.
+ *
+ * Commands:
+ * - run: Execute prompts and output raw results
+ * - extract: Parse raw output into trajectories
+ * - grade: Apply grader to extracted results
+ * - format: Convert results to different output formats
+ * - compare: Compare multiple runs of the same prompts
+ *
+ * @packageDocumentation
+ */
+// Commands
+export { compare } from './compare.ts'
+export { extract } from './extract.ts'
+export { format } from './format.ts'
+export { grade } from './grade.ts'
+// Types
+export type {
+  CompareConfig,
+  ComparisonGrader,
+  ComparisonGraderInput,
+  ComparisonGraderResult,
+  ComparisonRanking,
+  ComparisonResult,
+  ExtractConfig,
+  ExtractedResult,
+  FormatConfig,
+  FormatStyle,
+  GradeConfig,
+  GradedResult,
+  LabeledRun,
+  RawOutput,
+  RunConfig,
+  RunMode,
+} from './pipeline.types.ts'
+export { run } from './run.ts'

package/src/pipeline/pipeline.types.ts ADDED Viewed

@@ -0,0 +1,241 @@
+/**
+ * Type definitions for pipeline commands.
+ *
+ * @remarks
+ * These types define the data flow between pipeline stages:
+ * run → extract → grade → format
+ *
+ * Each stage transforms the data, enabling Unix-style piping.
+ *
+ * @packageDocumentation
+ */
+import type { GraderResult, TrajectoryStep } from '../schemas.ts'
+/**
+ * One record emitted by the `run` command.
+ *
+ * @remarks
+ * Holds the agent's output exactly as captured, prior to trajectory
+ * extraction. This is the shape consumed by `extract` when `run` is piped
+ * into it.
+ */
+export type RawOutput = {
+  /** Identifier of the test case */
+  id: string
+  /** Prompt as supplied: a string for a single turn, an array for multi-turn */
+  input: string[] | string
+  /** Context hint for the grader */
+  hint?: string
+  /** Unparsed output lines from the agent (JSON strings) */
+  rawLines: Array<string>
+  /** Timing metadata */
+  timing: { start: number; end: number; total: number }
+  /** Error message, present when execution failed */
+  error?: string
+}
+/**
+ * One record emitted by the `extract` command.
+ *
+ * @remarks
+ * The structured counterpart of the raw output lines: a parsed trajectory
+ * plus the final output text. Suitable as input to grading or formatting.
+ */
+export type ExtractedResult = {
+  /** Identifier of the test case */
+  id: string
+  /** Prompt as originally supplied */
+  input: string[] | string
+  /** Context hint for the grader */
+  hint?: string
+  /** Final agent output, taken from the trajectory */
+  output: string
+  /** Trajectory steps after parsing */
+  trajectory: Array<TrajectoryStep>
+  /** True when tool errors were detected */
+  toolErrors: boolean
+  /** Timing metadata */
+  timing: { start: number; end: number; total: number }
+  /** Error message, present when extraction failed */
+  error?: string
+}
+/**
+ * One record emitted by the `grade` command.
+ *
+ * @remarks
+ * An {@link ExtractedResult} with the grader's score attached.
+ */
+export type GradedResult = ExtractedResult & {
+  /** Score returned by the grader */
+  score: GraderResult
+}
+/**
+ * Execution strategy selected for the pipeline `run` command.
+ *
+ * @remarks
+ * - `schema`: headless adapter driven by a schema file
+ * - `simple`: Bun shell, prompt substituted into a placeholder
+ * - `shell`: Bun shell, prompt exposed through the PROMPT env variable
+ */
+export type RunMode =
+  | 'schema'
+  | 'simple'
+  | 'shell'
+/**
+ * Options accepted by the pipeline `run` command.
+ */
+export type RunConfig = {
+  /** Which run mode to use */
+  mode: RunMode
+  /** Schema file path (used by 'schema' mode) */
+  schemaPath?: string
+  /** Command template (used by 'simple' mode); {} is replaced with the prompt */
+  simpleCommand?: string
+  /** Shell template (used by 'shell' mode); the $PROMPT env var is available */
+  shellTemplate?: string
+  /** Working directory */
+  cwd?: string
+  /** Per-prompt timeout, in milliseconds */
+  timeout?: number
+  /** Emit progress messages on stderr */
+  progress?: boolean
+}
+/**
+ * Options accepted by the pipeline `extract` command.
+ */
+export type ExtractConfig = {
+  /** Schema file path used to parse the output */
+  schemaPath: string
+  /** Emit progress messages on stderr */
+  progress?: boolean
+}
+/**
+ * Options accepted by the pipeline `grade` command.
+ */
+export type GradeConfig = {
+  /** Location of the grader: a module or an executable */
+  graderPath: string
+  /** Emit progress messages on stderr */
+  progress?: boolean
+}
+/**
+ * Output formats supported by the pipeline `format` command.
+ */
+export type FormatStyle =
+  | 'jsonl'
+  | 'markdown'
+  | 'csv'
+/**
+ * Options accepted by the pipeline `format` command.
+ */
+export type FormatConfig = {
+  /** Which output format to produce */
+  style: FormatStyle
+  /** Emit progress messages on stderr */
+  progress?: boolean
+}
+/**
+ * A results file paired with the label used for it in comparisons.
+ *
+ * @remarks
+ * The label is the human-readable name under which the run appears
+ * in the output of the compare command.
+ */
+export type LabeledRun = {
+  /** Human-readable label (explicit, or derived from the filename) */
+  label: string
+  /** Location of the results JSONL file */
+  path: string
+}
+/**
+ * Argument passed to a comparison grader function.
+ *
+ * @remarks
+ * Bundles every run's result for one prompt ID, giving the grader
+ * what it needs to compare the runs and rank them.
+ */
+export type ComparisonGraderInput = {
+  /** Identifier of the test case */
+  id: string
+  /** Prompt as originally supplied */
+  input: string[] | string
+  /** Context hint for the grader */
+  hint?: string
+  /** Per-run results, keyed by run label */
+  runs: Record<string, { output: string; trajectory?: Array<TrajectoryStep> }>
+}
+/**
+ * One entry in the rankings of a comparison result.
+ */
+export type ComparisonRanking = {
+  /** Label of the run being ranked */
+  run: string
+  /** Position in the ranking (1 = best) */
+  rank: number
+  /** Score as a number */
+  score: number
+}
+/**
+ * Value returned by a comparison grader function.
+ *
+ * @remarks
+ * Graders should list the rankings best-first, worst-last.
+ */
+export type ComparisonGraderResult = {
+  /** Rankings, ordered best to worst */
+  rankings: Array<ComparisonRanking>
+  /** Reasoning behind the rankings, if the grader supplies any */
+  reasoning?: string
+}
+/**
+ * Signature of a comparison grader.
+ *
+ * @remarks
+ * Graders supplied by the user implement this signature in order to
+ * compare several runs of one prompt. The result is delivered as a promise.
+ */
+export type ComparisonGrader = (
+  params: ComparisonGraderInput,
+) => Promise<ComparisonGraderResult>
+/**
+ * Options accepted by the pipeline `compare` command.
+ */
+export type CompareConfig = {
+  /** The labeled runs being compared */
+  runs: Array<LabeledRun>
+  /** Location of the comparison grader */
+  graderPath: string
+  /** Path of the output file */
+  outputPath?: string
+  /** Emit progress messages on stderr */
+  progress?: boolean
+}
+/**
+ * Outcome of comparing the runs for one prompt.
+ */
+export type ComparisonResult = {
+  /** Identifier of the test case */
+  id: string
+  /** Prompt as originally supplied */
+  input: string[] | string
+  /** Context hint for the grader */
+  hint?: string
+  /** Rankings produced by the comparison grader */
+  rankings: Array<ComparisonRanking>
+  /** Reasoning, if any was supplied */
+  reasoning?: string
+}

package/src/pipeline/run.ts ADDED Viewed

@@ -0,0 +1,412 @@
+/**
+ * Pipeline run command - execute prompts and output raw results.
+ *
+ * @remarks
+ * Supports three modes:
+ * - `schema`: Use headless adapter with schema file (full trajectory capture)
+ * - `simple`: Use Bun shell with `{}` placeholder for prompt
+ * - `shell`: Use Bun shell with `$PROMPT` environment variable
+ *
+ * Output is RawOutput JSONL suitable for piping to `extract`.
+ *
+ * @packageDocumentation
+ */
+import { parseArgs } from 'node:util'
+import { loadPrompts, logProgress, writeOutput } from '../core.ts'
+import { parseHeadlessConfig } from '../headless/headless.schemas.ts'
+import { createSessionManager } from '../headless/headless-session-manager.ts'
+import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
+import type { RawOutput, RunConfig } from './pipeline.types.ts'
+/**
+ * Execute a single prompt in simple mode.
+ *
+ * @remarks
+ * Replaces the first `{}` placeholder in `command` with the prompt wrapped in
+ * single quotes (embedded single quotes are escaped as `'\''`) and runs the
+ * resulting string through `sh -c` using `Bun.spawn`.
+ *
+ * Any non-empty stderr text is returned as `error`. If the timeout fires, the
+ * process is killed and a "Timed out" message is added to `error`; whatever
+ * stdout was captured before the kill is still returned in `lines`.
+ *
+ * @param prompt - Prompt text to execute
+ * @param command - Command template with `{}` placeholder
+ * @param timeout - Execution timeout in milliseconds
+ * @returns Object with non-empty stdout lines and an optional error message
+ */
+const runSimple = async (
+  prompt: string,
+  command: string,
+  timeout: number,
+): Promise<{ lines: string[]; error?: string }> => {
+  const quotedPrompt = `'${prompt.replace(/'/g, "'\\''")}'`
+  // A replacer function is required here: with a plain string replacement,
+  // `$&`, `$'`, `` $` `` and `$$` inside the prompt would be expanded by
+  // String.prototype.replace, corrupting the command and potentially
+  // breaking out of the single-quoted argument.
+  const finalCmd = command.replace('{}', () => quotedPrompt)
+  const proc = Bun.spawn(['sh', '-c', finalCmd], {
+    stdout: 'pipe',
+    stderr: 'pipe',
+  })
+  let timedOut = false
+  const timeoutId = setTimeout(() => {
+    timedOut = true
+    proc.kill()
+  }, timeout)
+  try {
+    const [stdout, stderr] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text()])
+    const lines = stdout.trim().split('\n').filter(Boolean)
+    // Surface a timeout explicitly so truncated output is not mistaken for success.
+    const problems = [stderr.trim(), timedOut ? `Timed out after ${timeout}ms` : ''].filter(Boolean)
+    return problems.length > 0 ? { lines, error: problems.join('\n') } : { lines }
+  } catch (err) {
+    return { lines: [], error: err instanceof Error ? err.message : String(err) }
+  } finally {
+    clearTimeout(timeoutId)
+  }
+}
+/**
+ * Execute a single prompt in shell mode.
+ *
+ * @remarks
+ * Runs `template` through `sh -c` using `Bun.spawn`, with the current process
+ * environment plus a `PROMPT` variable holding the prompt text.
+ *
+ * Any non-empty stderr text is returned as `error`. If the timeout fires, the
+ * process is killed and a "Timed out" message is added to `error`; whatever
+ * stdout was captured before the kill is still returned in `lines`.
+ *
+ * @param prompt - Prompt text to execute
+ * @param template - Shell command template
+ * @param timeout - Execution timeout in milliseconds
+ * @returns Object with non-empty stdout lines and an optional error message
+ */
+const runShell = async (
+  prompt: string,
+  template: string,
+  timeout: number,
+): Promise<{ lines: string[]; error?: string }> => {
+  const proc = Bun.spawn(['sh', '-c', template], {
+    stdout: 'pipe',
+    stderr: 'pipe',
+    env: { ...process.env, PROMPT: prompt },
+  })
+  let timedOut = false
+  const timeoutId = setTimeout(() => {
+    timedOut = true
+    proc.kill()
+  }, timeout)
+  try {
+    const [stdout, stderr] = await Promise.all([new Response(proc.stdout).text(), new Response(proc.stderr).text()])
+    const lines = stdout.trim().split('\n').filter(Boolean)
+    // Surface a timeout explicitly so truncated output is not mistaken for success.
+    const problems = [stderr.trim(), timedOut ? `Timed out after ${timeout}ms` : ''].filter(Boolean)
+    return problems.length > 0 ? { lines, error: problems.join('\n') } : { lines }
+  } catch (err) {
+    return { lines: [], error: err instanceof Error ? err.message : String(err) }
+  } finally {
+    clearTimeout(timeoutId)
+  }
+}
+/**
+ * Execute pipeline run with configuration object.
+ *
+ * @remarks
+ * Runs every prompt in `prompts` sequentially using the configured mode and
+ * writes one RawOutput JSON record per prompt through `writeOutput`.
+ * Multi-turn inputs (arrays) are executed turn by turn and their output lines
+ * are collected into a single record.
+ *
+ * If `outputPath` is given, the file is emptied before anything else happens,
+ * including mode validation. Per-prompt failures are recorded in the
+ * record's `error` field rather than thrown.
+ *
+ * NOTE: `cwd` is only passed to the session manager in `schema` mode; the
+ * `simple` and `shell` modes do not use it.
+ *
+ * @param config - Run configuration
+ * @param prompts - Array of prompts to execute
+ * @param outputPath - Optional output file path
+ * @throws Error when the option required by the selected mode is missing, or
+ * when the schema file does not exist
+ */
+export const runPipeline = async (
+  config: RunConfig,
+  prompts: Array<{ id: string; input: string | string[]; hint?: string }>,
+  outputPath?: string,
+): Promise<void> => {
+  const {
+    mode,
+    schemaPath,
+    simpleCommand,
+    shellTemplate,
+    cwd,
+    timeout = DEFAULT_HARNESS_TIMEOUT,
+    progress = false,
+  } = config
+  const workingDir = cwd ?? process.cwd()
+  let isFirstOutput = true
+  // Clear output file if specified
+  if (outputPath) {
+    await Bun.write(outputPath, '')
+  }
+  // Build and write one RawOutput record. `!isFirstOutput` is forwarded as
+  // writeOutput's third argument (presumably its append flag — confirm in core).
+  const emit = async (
+    promptCase: (typeof prompts)[number],
+    rawLines: string[],
+    startTime: number,
+    error: string | undefined,
+  ): Promise<void> => {
+    const endTime = Date.now()
+    const output: RawOutput = {
+      id: promptCase.id,
+      input: promptCase.input,
+      hint: promptCase.hint,
+      rawLines,
+      timing: {
+        start: startTime,
+        end: endTime,
+        total: endTime - startTime,
+      },
+      ...(error && { error }),
+    }
+    await writeOutput(JSON.stringify(output), outputPath, !isFirstOutput)
+    isFirstOutput = false
+  }
+  // Shared loop for the two command-based modes: run each turn through
+  // `execute`, gather stdout lines and error messages, emit one record.
+  const runCommandMode = async (
+    execute: (input: string) => Promise<{ lines: string[]; error?: string }>,
+  ): Promise<void> => {
+    for (const [i, promptCase] of prompts.entries()) {
+      // Skip holes / nullish entries defensively
+      if (!promptCase) continue
+      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}`, progress)
+      const startTime = Date.now()
+      const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
+      const allLines: string[] = []
+      const errors: string[] = []
+      for (const input of inputs) {
+        const result = await execute(input)
+        allLines.push(...result.lines)
+        if (result.error) errors.push(result.error)
+      }
+      await emit(promptCase, allLines, startTime, errors.length > 0 ? errors.join('\n') : undefined)
+    }
+  }
+  if (mode === 'schema') {
+    // Schema mode: use headless adapter
+    if (!schemaPath) {
+      throw new Error('Schema path required for schema mode')
+    }
+    const schemaFile = Bun.file(schemaPath)
+    if (!(await schemaFile.exists())) {
+      throw new Error(`Schema file not found: ${schemaPath}`)
+    }
+    const rawSchema = await schemaFile.json()
+    const schema = parseHeadlessConfig(rawSchema)
+    const sessions = createSessionManager({
+      schema,
+      timeout,
+      verbose: progress,
+    })
+    logProgress(`Schema mode: ${schema.name}`, progress)
+    for (const [i, promptCase] of prompts.entries()) {
+      // Skip holes / nullish entries defensively
+      if (!promptCase) continue
+      logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}`, progress)
+      const startTime = Date.now()
+      const rawLines: string[] = []
+      let error: string | undefined
+      let session: Awaited<ReturnType<typeof sessions.create>> | undefined
+      try {
+        session = await sessions.create(workingDir)
+        const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
+        for (const turnInput of inputs) {
+          const result = await sessions.prompt(session.id, turnInput)
+          // Collect raw JSON lines from updates
+          for (const update of result.updates) {
+            rawLines.push(JSON.stringify(update.raw))
+          }
+        }
+      } catch (err) {
+        error = err instanceof Error ? err.message : String(err)
+      } finally {
+        // Always release the session, even when a prompt failed; otherwise
+        // every failing prompt would leave its session undestroyed.
+        if (session) {
+          try {
+            sessions.destroy(session.id)
+          } catch (err) {
+            // Keep the first error; a cleanup failure must not mask it.
+            error ??= err instanceof Error ? err.message : String(err)
+          }
+        }
+      }
+      await emit(promptCase, rawLines, startTime, error)
+    }
+  } else if (mode === 'simple') {
+    // Simple mode: placeholder substitution
+    if (!simpleCommand) {
+      throw new Error('Command required for simple mode')
+    }
+    logProgress(`Simple mode: ${simpleCommand}`, progress)
+    await runCommandMode((input) => runSimple(input, simpleCommand, timeout))
+  } else if (mode === 'shell') {
+    // Shell mode: PROMPT env variable
+    if (!shellTemplate) {
+      throw new Error('Shell template required for shell mode')
+    }
+    logProgress(`Shell mode: ${shellTemplate}`, progress)
+    await runCommandMode((input) => runShell(input, shellTemplate, timeout))
+  }
+  logProgress('Done!', progress)
+}
+/**
+ * Read prompts from stdin if available.
+ *
+ * @remarks
+ * Reads stdin to the end and treats it as JSONL: each non-blank line must be
+ * a JSON object with a string `id`, an `input` that is a string or an array
+ * of strings, and an optional string `hint`. Blank and whitespace-only lines
+ * are skipped.
+ *
+ * @returns Array of parsed prompts, or null if stdin is a TTY or contains
+ * only whitespace
+ * @throws Error naming the offending line number when a line is not valid
+ * JSON or does not have the expected prompt shape
+ */
+const readStdinPrompts = async (): Promise<Array<{ id: string; input: string | string[]; hint?: string }> | null> => {
+  // Check if stdin has data (not a TTY)
+  if (process.stdin.isTTY) {
+    return null
+  }
+  const chunks: Buffer[] = []
+  for await (const chunk of process.stdin) {
+    // Streams yield strings when an encoding has been set; normalize to Buffer.
+    chunks.push(typeof chunk === 'string' ? Buffer.from(chunk, 'utf-8') : chunk)
+  }
+  const content = Buffer.concat(chunks).toString('utf-8')
+  if (!content.trim()) return null
+  const isPrompt = (value: unknown): value is { id: string; input: string | string[]; hint?: string } => {
+    if (typeof value !== 'object' || value === null) return false
+    const { id, input, hint } = value as { id?: unknown; input?: unknown; hint?: unknown }
+    const inputOk = typeof input === 'string' || (Array.isArray(input) && input.every((turn) => typeof turn === 'string'))
+    return typeof id === 'string' && inputOk && (hint === undefined || typeof hint === 'string')
+  }
+  const prompts: Array<{ id: string; input: string | string[]; hint?: string }> = []
+  // Split the untrimmed text so reported line numbers match the piped input.
+  for (const [index, rawLine] of content.split('\n').entries()) {
+    const line = rawLine.trim()
+    if (!line) continue
+    let parsed: unknown
+    try {
+      parsed = JSON.parse(line)
+    } catch (err) {
+      const reason = err instanceof Error ? err.message : String(err)
+      throw new Error(`Invalid JSON on stdin line ${index + 1}: ${reason}`)
+    }
+    if (!isPrompt(parsed)) {
+      throw new Error(
+        `Invalid prompt on stdin line ${index + 1}: expected { id: string, input: string | string[], hint?: string }`,
+      )
+    }
+    prompts.push(parsed)
+  }
+  return prompts
+}
+/**
+ * Pipeline run command CLI handler.
+ *
+ * @remarks
+ * Parses the arguments, selects the run mode from `--schema`, `--simple` or
+ * `--shell` (checked in that order), loads prompts from the first positional
+ * argument or, when none is given, from stdin, and hands everything to
+ * `runPipeline`.
+ *
+ * Prints an error to stderr and exits with code 1 when no mode is given,
+ * when `--timeout` is not a positive integer, or when no prompts are provided.
+ *
+ * @param args - Command line arguments (after 'run')
+ */
+export const run = async (args: string[]): Promise<void> => {
+  const { values, positionals } = parseArgs({
+    args,
+    options: {
+      schema: { type: 'string', short: 's' },
+      simple: { type: 'string' },
+      shell: { type: 'string' },
+      output: { type: 'string', short: 'o' },
+      cwd: { type: 'string', short: 'c' },
+      timeout: { type: 'string', short: 't' },
+      progress: { type: 'boolean', default: false },
+      help: { type: 'boolean', short: 'h' },
+    },
+    allowPositionals: true,
+  })
+  if (values.help) {
+    // biome-ignore lint/suspicious/noConsole: CLI help output
+    console.log(`
+Usage: agent-eval-harness run [prompts.jsonl] [options]
+Execute prompts and output raw results for pipeline processing.
+Arguments:
+  prompts.jsonl     Input file (or pipe from stdin)
+Modes (choose one):
+  -s, --schema      Path to headless adapter schema (recommended)
+  --simple          Command template with {} placeholder
+  --shell           Shell template with $PROMPT env variable
+Options:
+  -o, --output      Output file (default: stdout)
+  -c, --cwd         Working directory for agent
+  -t, --timeout     Request timeout in ms (default: ${DEFAULT_HARNESS_TIMEOUT})
+  --progress        Show progress to stderr
+  -h, --help        Show this help message
+Examples:
+  # Schema mode (recommended)
+  agent-eval-harness run prompts.jsonl --schema claude.json | agent-eval-harness extract
+  # Simple mode with placeholder
+  agent-eval-harness run prompts.jsonl --simple "claude -p {} --output-format stream-json"
+  # Shell mode with env variable
+  agent-eval-harness run prompts.jsonl --shell 'claude -p "$PROMPT" --output-format stream-json'
+  # Pipe from stdin
+  cat prompts.jsonl | agent-eval-harness run --schema claude.json
+`)
+    return
+  }
+  // Determine mode
+  let mode: RunConfig['mode']
+  if (values.schema) {
+    mode = 'schema'
+  } else if (values.simple) {
+    mode = 'simple'
+  } else if (values.shell) {
+    mode = 'shell'
+  } else {
+    console.error('Error: Must specify --schema, --simple, or --shell mode')
+    process.exit(1)
+  }
+  // Validate the timeout before doing any work. A NaN, zero or negative value
+  // would make the kill timer fire almost immediately and abort every prompt.
+  let timeout: number | undefined
+  if (values.timeout) {
+    timeout = Number.parseInt(values.timeout, 10)
+    if (!Number.isFinite(timeout) || timeout <= 0) {
+      console.error(`Error: Invalid --timeout "${values.timeout}" (expected a positive number of milliseconds)`)
+      process.exit(1)
+    }
+  }
+  // Load prompts from file or stdin
+  const promptsPath = positionals[0]
+  let prompts: Array<{ id: string; input: string | string[]; hint?: string }>
+  if (promptsPath) {
+    prompts = await loadPrompts(promptsPath)
+  } else {
+    const stdinPrompts = await readStdinPrompts()
+    if (!stdinPrompts || stdinPrompts.length === 0) {
+      console.error('Error: No prompts provided (use file argument or pipe to stdin)')
+      process.exit(1)
+    }
+    prompts = stdinPrompts
+  }
+  await runPipeline(
+    {
+      mode,
+      schemaPath: values.schema,
+      simpleCommand: values.simple,
+      shellTemplate: values.shell,
+      cwd: values.cwd,
+      timeout,
+      progress: values.progress,
+    },
+    prompts,
+    values.output,
+  )
+}