npm - @plaited/agent-eval-harness - Versions diffs - 0.5.0 - Mend

@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

package/LICENSE +15 -0
package/README.md +273 -0
package/bin/cli.ts +162 -0
package/bin/tests/cli.spec.ts +529 -0
package/package.json +67 -0
package/src/commands/balance.ts +257 -0
package/src/commands/calibrate.ts +313 -0
package/src/commands/capture.ts +393 -0
package/src/commands/summarize.ts +228 -0
package/src/commands/tests/balance-helpers.spec.ts +279 -0
package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
package/src/commands/tests/capture-cli.spec.ts +190 -0
package/src/commands/tests/capture-helpers.spec.ts +524 -0
package/src/commands/tests/summarize-helpers.spec.ts +339 -0
package/src/commands/tests/trials-calculations.spec.ts +209 -0
package/src/commands/tests/trials-cli.spec.ts +147 -0
package/src/commands/trials.ts +388 -0
package/src/commands/validate-refs.ts +188 -0
package/src/commands.ts +33 -0
package/src/core/core.ts +25 -0
package/src/core/loading.ts +96 -0
package/src/core/output.ts +121 -0
package/src/core/tests/core.spec.ts +309 -0
package/src/core/trajectory.ts +166 -0
package/src/core.ts +28 -0
package/src/harness.ts +46 -0
package/src/headless/headless-cli.ts +430 -0
package/src/headless/headless-history-builder.ts +141 -0
package/src/headless/headless-output-parser.ts +366 -0
package/src/headless/headless-session-manager.ts +587 -0
package/src/headless/headless.schemas.ts +310 -0
package/src/headless/headless.types.ts +19 -0
package/src/headless/tests/headless.spec.ts +678 -0
package/src/headless.ts +72 -0
package/src/integration_tests/claude.spec.ts +157 -0
package/src/integration_tests/gemini.spec.ts +139 -0
package/src/pipeline/compare.ts +325 -0
package/src/pipeline/extract.ts +241 -0
package/src/pipeline/format.ts +292 -0
package/src/pipeline/grade.ts +169 -0
package/src/pipeline/pipeline.ts +41 -0
package/src/pipeline/pipeline.types.ts +241 -0
package/src/pipeline/run.ts +412 -0
package/src/pipeline/tests/pipeline.spec.ts +356 -0
package/src/pipeline.ts +34 -0
package/src/schemas/constants.ts +94 -0
package/src/schemas/grader-loader.ts +174 -0
package/src/schemas/schemas-cli.ts +239 -0
package/src/schemas/schemas.ts +558 -0
package/src/schemas/tests/constants.spec.ts +121 -0
package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
package/src/schemas/tests/fixtures/grader-exec.py +29 -0
package/src/schemas/tests/fixtures/grader-module.ts +14 -0
package/src/schemas/tests/grader-loader.spec.ts +153 -0
package/src/schemas/tests/schemas-cli.spec.ts +142 -0
package/src/schemas/tests/schemas.spec.ts +606 -0
package/src/schemas.ts +90 -0

package/src/commands/capture.ts ADDED Viewed

@@ -0,0 +1,393 @@
+/**
+ * Core trajectory capture command.
+ *
+ * @remarks
+ * Executes prompts against a CLI agent and captures full trajectories.
+ * This is the foundational command - all other views derive from its output.
+ *
+ * Output format is always full trajectory JSONL (`CaptureResultSchema`).
+ * Use `summarize` command to derive compact views.
+ *
+ * @packageDocumentation
+ */
+import { parseArgs } from 'node:util'
+import {
+  detectTrajectoryRichness,
+  extractOutput,
+  extractTrajectory,
+  getInputPreview,
+  hasToolErrors,
+  loadPrompts,
+  logProgress,
+  resolvePath,
+  writeOutput,
+} from '../core.ts'
+import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
+import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
+import { createSessionManager, type ProcessExitInfo, type PromptResult } from '../headless/headless-session-manager.ts'
+import { DEFAULT_HARNESS_TIMEOUT } from '../schemas/constants.ts'
+import { loadGrader } from '../schemas/grader-loader.ts'
+import type { CaptureResult, Grader, TrajectoryRichness } from '../schemas.ts'
+// ============================================================================
+// Re-exports for backward compatibility
+// ============================================================================
+// These functions are now in core/ but re-exported here for existing consumers
+export {
+  detectTrajectoryRichness,
+  extractContent,
+  extractFilePath,
+  extractOutput,
+  extractTrajectory,
+  hasToolErrors,
+  headTailPreview,
+  loadPrompts,
+} from '../core.ts'
+// ============================================================================
+// Types
+// ============================================================================
+/**
+ * Configuration for capture command.
+ *
+ * @remarks
+ * Consumed by `runCapture`. Only `promptsPath` and `schemaPath` are required;
+ * `progress`, `append`, and `debug` are treated as `false` when omitted.
+ */
+export type CaptureConfig = {
+  /** Path to prompts.jsonl file */
+  promptsPath: string
+  /** Path to agent schema JSON file */
+  schemaPath: string
+  /** Output file path (undefined for stdout) */
+  outputPath?: string
+  /** Working directory for agent (falls back to `process.cwd()` when omitted) */
+  cwd?: string
+  /** Timeout per prompt in milliseconds (overrides schema default, which overrides the harness default) */
+  timeout?: number
+  /** Show progress to stderr */
+  progress?: boolean
+  /** Append to output file instead of overwriting */
+  append?: boolean
+  /** Optional grader function; when set, its return value is stored as the result's `score` */
+  grader?: Grader
+  /** Enable debug mode for detailed output */
+  debug?: boolean
+}
+// ============================================================================
+// Capture Implementation
+// ============================================================================
+/**
+ * Execute capture with configuration object.
+ *
+ * @remarks
+ * Creates a fresh session for each JSONL entry to ensure isolation.
+ * Supports multi-turn conversations via `input: string[]`.
+ *
+ * Prompts run sequentially and each result is written as soon as it is
+ * produced. A prompt that throws does not abort the run: it is recorded as a
+ * result with `toolErrors: true`, an empty trajectory, and the error message
+ * in `errors`. The session is destroyed whether or not the prompt succeeded.
+ *
+ * @param config - Capture configuration
+ * @returns Array of capture results
+ * @throws Error when the schema file does not exist or fails validation
+ */
+export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]> => {
+  const {
+    promptsPath,
+    schemaPath,
+    outputPath,
+    cwd,
+    timeout,
+    progress = false,
+    append = false,
+    grader,
+    debug = false,
+  } = config
+  // Load and validate schema
+  const schemaFile = Bun.file(schemaPath)
+  if (!(await schemaFile.exists())) {
+    throw new Error(`Schema file not found: ${schemaPath}`)
+  }
+  let schema: HeadlessAdapterConfig
+  try {
+    const rawSchema = await schemaFile.json()
+    schema = parseHeadlessConfig(rawSchema)
+  } catch (error) {
+    throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
+  }
+  // Load prompts
+  const prompts = await loadPrompts(promptsPath)
+  // Resolve output path
+  const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
+  // Determine effective timeout (CLI flag > schema default > harness default)
+  const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
+  const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
+  // Log progress info
+  logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
+  logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
+  logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
+  if (resolvedOutputPath) {
+    logProgress(`Output: ${resolvedOutputPath}`, progress)
+  }
+  if (debug) {
+    logProgress(`Debug mode: enabled`, progress)
+  }
+  // Create session manager with schema
+  const sessions = createSessionManager({
+    schema,
+    timeout: effectiveTimeout,
+    verbose: progress,
+    debug,
+  })
+  // Clear output file if not appending
+  if (resolvedOutputPath && !append) {
+    await Bun.write(resolvedOutputPath, '')
+  }
+  const workingDir = cwd ?? process.cwd()
+  const results: CaptureResult[] = []
+  let isFirstOutput = true
+  // Run evaluations sequentially - fresh session per entry
+  for (let i = 0; i < prompts.length; i++) {
+    const promptCase = prompts[i]
+    if (!promptCase) continue
+    logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: ${getInputPreview(promptCase.input)}...`, progress)
+    const startTime = Date.now()
+    let result: CaptureResult
+    try {
+      // Create fresh session for each entry (ensures isolation)
+      const sessionStart = Date.now()
+      const session = await sessions.create(workingDir)
+      const sessionCreation = Date.now() - sessionStart
+      try {
+        logProgress(`  Session: ${session.id}`, progress)
+        // Handle string or array input
+        const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
+        const turnCount = inputs.length
+        // Collect all updates from all turns
+        const allUpdates: ParsedUpdate[] = []
+        let lastExitInfo: ProcessExitInfo | undefined
+        let lastOutput = ''
+        // TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
+        // The session manager would need to accept timeout per-call to support this
+        // Execute each turn sequentially in the same session
+        for (const turnInput of inputs) {
+          const turnResult: PromptResult = await sessions.prompt(session.id, turnInput)
+          allUpdates.push(...turnResult.updates)
+          lastExitInfo = turnResult.exitInfo
+          lastOutput = turnResult.output
+        }
+        const endTime = Date.now()
+        const trajectory = extractTrajectory(allUpdates, startTime)
+        // Use last turn's output or extract from trajectory
+        const output = lastOutput || extractOutput(trajectory)
+        // A timeout on the final turn is reported as a tool error even if no step failed
+        const toolErrors = hasToolErrors(trajectory) || (lastExitInfo?.timedOut ?? false)
+        const trajectoryRichness = detectTrajectoryRichness(trajectory)
+        result = {
+          id: promptCase.id,
+          input: promptCase.input, // Preserve original (string or array)
+          output,
+          ...(promptCase.hint && { hint: promptCase.hint }),
+          trajectory,
+          metadata: {
+            ...promptCase.metadata,
+            agent: schema.name,
+            trajectoryRichness,
+            turnCount,
+            ...(lastExitInfo && {
+              exitCode: lastExitInfo.exitCode,
+              signal: lastExitInfo.signal,
+              timedOut: lastExitInfo.timedOut,
+            }),
+          },
+          timing: {
+            start: startTime,
+            end: endTime,
+            firstResponse: trajectory.length > 0 ? trajectory[0]?.timestamp : undefined,
+            sessionCreation,
+            total: endTime - startTime,
+          },
+          toolErrors,
+        }
+        // Apply grader if provided
+        if (grader) {
+          result.score = await grader({
+            input: promptCase.input,
+            output,
+            hint: promptCase.hint,
+            trajectory,
+          })
+        }
+      } finally {
+        // Always clean up the session, so a turn or grader that throws does not
+        // leave it (and whatever the session manager holds for it) behind
+        sessions.destroy(session.id)
+      }
+    } catch (error) {
+      // Record the failure as a result instead of aborting the remaining prompts
+      const endTime = Date.now()
+      const message = error instanceof Error ? error.message : String(error)
+      const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
+      result = {
+        id: promptCase.id,
+        input: promptCase.input,
+        output: '',
+        trajectory: [],
+        metadata: {
+          ...promptCase.metadata,
+          agent: schema.name,
+          trajectoryRichness: 'minimal' as TrajectoryRichness,
+          turnCount: inputs.length,
+        },
+        timing: {
+          start: startTime,
+          end: endTime,
+          sessionCreation: 0,
+          total: endTime - startTime,
+        },
+        toolErrors: true,
+        errors: [message],
+      }
+    }
+    results.push(result)
+    // Write result immediately. When the caller asked to append, the very first
+    // write must append as well; only later writes were append-mode before.
+    // NOTE(review): assumes writeOutput's third argument selects append vs.
+    // overwrite for file output - confirm against core/output.ts
+    const formatted = JSON.stringify(result)
+    await writeOutput(formatted, resolvedOutputPath, append || !isFirstOutput)
+    isFirstOutput = false
+    const statusIcon = result.toolErrors ? '!' : '✓'
+    const exitInfo = result.metadata?.timedOut
+      ? ' - TIMEOUT'
+      : result.metadata?.exitCode && result.metadata.exitCode !== 0
+        ? ` - exit ${result.metadata.exitCode}`
+        : ''
+    logProgress(`  ${statusIcon} (${result.timing.total}ms)${exitInfo}`, progress)
+  }
+  logProgress('Done!', progress)
+  return results
+}
+// ============================================================================
+// CLI Entry Point
+// ============================================================================
+/**
+ * Capture command CLI handler.
+ *
+ * @remarks
+ * Parses flags, validates the required arguments, optionally loads a grader,
+ * then delegates to `runCapture`. Prints usage and returns when `--help` is
+ * set. Calls `process.exit(1)` when the prompts path or `--schema` is missing,
+ * when `--timeout` is not a non-negative integer, or when the grader fails to
+ * load.
+ *
+ * @param args - Command line arguments (after 'capture')
+ */
+export const capture = async (args: string[]): Promise<void> => {
+  const { values, positionals } = parseArgs({
+    args,
+    options: {
+      schema: { type: 'string', short: 's' },
+      output: { type: 'string', short: 'o' },
+      cwd: { type: 'string', short: 'c' },
+      timeout: { type: 'string', short: 't' },
+      progress: { type: 'boolean', default: false },
+      append: { type: 'boolean', default: false },
+      grader: { type: 'string', short: 'g' },
+      debug: { type: 'boolean', default: false },
+      help: { type: 'boolean', short: 'h' },
+    },
+    allowPositionals: true,
+  })
+  if (values.help) {
+    // biome-ignore lint/suspicious/noConsole: CLI help output
+    console.log(`
+Usage: agent-eval-harness capture <prompts.jsonl> --schema <schema.json> [options]
+Arguments:
+  prompts.jsonl     Input file with evaluation prompts
+Options:
+  -s, --schema      Path to agent schema JSON file (required)
+  -o, --output      Output file (default: stdout)
+  -c, --cwd         Working directory for agent
+  -t, --timeout     Request timeout in ms (overrides schema default)
+  --progress        Show progress to stderr
+  --append          Append to output file instead of overwriting
+  -g, --grader      Path to grader (.ts/.js module or executable script)
+  --debug           Enable debug mode (shows raw output, JSONPath matching)
+  -h, --help        Show this help message
+Output Format:
+  Full trajectory JSONL with toolErrors indicator.
+  Use 'agent-eval-harness summarize' to derive compact views.
+Exit Info (in metadata):
+  exitCode      Process exit code (null if killed/timed out)
+  signal        Signal that killed process (if any)
+  timedOut      true if process was killed due to timeout
+Graders:
+  TS/JS modules must export a 'grade' function.
+  Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
+Examples:
+  # Basic capture with schema
+  agent-eval-harness capture prompts.jsonl --schema claude.json -o results.jsonl
+  # With TypeScript grader
+  agent-eval-harness capture prompts.jsonl -s claude.json --grader ./grader.ts -o results.jsonl
+  # With debug mode
+  agent-eval-harness capture prompts.jsonl -s claude.json --debug -o results.jsonl
+  # With per-prompt timeout override (in prompts.jsonl):
+  {"id": "slow-task", "input": "...", "timeout": 180000}
+`)
+    return
+  }
+  const promptsPath = positionals[0]
+  if (!promptsPath) {
+    console.error('Error: prompts.jsonl path is required')
+    process.exit(1)
+  }
+  if (!values.schema) {
+    console.error('Error: --schema is required')
+    console.error('Example: agent-eval-harness capture prompts.jsonl --schema ./claude.json')
+    process.exit(1)
+  }
+  // Validate --timeout up front: parseInt yields NaN for non-numeric input, and
+  // NaN is not nullish, so it would otherwise win over the schema and harness
+  // defaults inside runCapture
+  let timeout: number | undefined
+  if (values.timeout) {
+    timeout = Number.parseInt(values.timeout, 10)
+    if (Number.isNaN(timeout) || timeout < 0) {
+      console.error(`Error: --timeout must be a non-negative integer in milliseconds (got "${values.timeout}")`)
+      process.exit(1)
+    }
+  }
+  // Load grader if specified
+  let grader: Grader | undefined
+  if (values.grader) {
+    try {
+      grader = await loadGrader(values.grader)
+    } catch (error) {
+      console.error(`Error: ${error instanceof Error ? error.message : error}`)
+      process.exit(1)
+    }
+  }
+  await runCapture({
+    promptsPath,
+    schemaPath: values.schema,
+    outputPath: values.output,
+    cwd: values.cwd,
+    timeout,
+    progress: values.progress ?? false,
+    append: values.append ?? false,
+    grader,
+    debug: values.debug ?? false,
+  })
+}

package/src/commands/summarize.ts ADDED Viewed

@@ -0,0 +1,228 @@
+/**
+ * Summarize command - derive compact views from full trajectory results.
+ *
+ * @remarks
+ * Transforms full trajectory JSONL into:
+ * - Summary JSONL: Compact format for jq analysis
+ * - Markdown: Human-readable format for LLM-as-judge workflows
+ *
+ * @packageDocumentation
+ */
+import { parseArgs } from 'node:util'
+import { extractContent, extractFilePath, headTailPreview, loadResults, resolvePath } from '../core.ts'
+import { HEAD_LINES, MAX_CONTENT_LENGTH, TAIL_LINES } from '../schemas/constants.ts'
+import type { CaptureResult, SummaryResult } from '../schemas.ts'
+// ============================================================================
+// Types
+// ============================================================================
+/**
+ * Configuration for summarize command.
+ *
+ * @remarks
+ * Consumed by `runSummarize`. Only `resultsPath` is required; `markdown` is
+ * treated as `false` (summary JSONL) when omitted.
+ */
+export type SummarizeConfig = {
+  /** Path to results.jsonl file */
+  resultsPath: string
+  /** Output file path (output is printed to stdout when omitted) */
+  outputPath?: string
+  /** Output as markdown instead of JSONL */
+  markdown?: boolean
+}
+/**
+ * Reduce a full capture result to its compact summary form.
+ *
+ * @remarks
+ * Multi-turn input is flattened into one newline-separated string, the
+ * trajectory is reduced to the names of its `tool_call` steps (in order), and
+ * the duration is computed from the recorded start and end timestamps.
+ *
+ * @param result - Full capture result
+ * @returns Compact summary result
+ *
+ * @public
+ */
+export const formatSummary = (result: CaptureResult): SummaryResult => {
+  const { id, input, output, trajectory, timing } = result
+  // Collect tool names in trajectory order; every other step type is ignored
+  const toolCalls: string[] = []
+  for (const step of trajectory) {
+    if (step.type === 'tool_call') {
+      toolCalls.push((step as { name: string }).name)
+    }
+  }
+  return {
+    id,
+    input: Array.isArray(input) ? input.join('\n') : input,
+    output,
+    toolCalls,
+    duration: timing.end - timing.start,
+  }
+}
+/**
+ * Format capture result as markdown with step IDs.
+ *
+ * @remarks
+ * Emits a heading, the input, a numbered trajectory, then output, metadata,
+ * tool-error flag, duration, and (when present) the grader score. Each
+ * `thought`, `tool_call`, `plan`, and `message` step gets a `[→<id>-step-<n>]`
+ * marker; steps of any other type are skipped and do not consume a number.
+ * Thought/message text is cut at 100 characters, plan summaries at 80, and the
+ * final output at 200, with `...` marking a cut.
+ *
+ * Tool-call content is shown in a fenced block. The fence language is the
+ * extension of the file's base name; a file path without an extension gets a
+ * plain fence, and content with no file path falls back to `typescript`.
+ *
+ * @param result - Full capture result
+ * @returns Markdown formatted string
+ *
+ * @public
+ */
+export const formatMarkdown = (result: CaptureResult): string => {
+  const inputText = Array.isArray(result.input) ? result.input.join('\n') : result.input
+  const lines: string[] = [`## Evaluation Record: ${result.id}`, '', `**Input:** ${inputText}`, '', '**Trajectory:**']
+  let stepNum = 1
+  for (const step of result.trajectory) {
+    const stepId = `${result.id}-step-${stepNum}`
+    if (step.type === 'thought') {
+      const preview = step.content.slice(0, 100)
+      const truncated = step.content.length > 100 ? '...' : ''
+      lines.push(`${stepNum}. [THOUGHT] ${preview}${truncated} [→${stepId}]`)
+      stepNum++
+    } else if (step.type === 'tool_call') {
+      const duration = step.duration ? ` (${step.duration}ms)` : ''
+      const filePath = extractFilePath(step.input)
+      const content = extractContent(step.input)
+      lines.push(`${stepNum}. [TOOL:${step.name}] -> ${step.status}${duration} [→${stepId}]`)
+      // Add file path if present
+      if (filePath) {
+        const charCount = content?.length ?? 0
+        lines.push(`   File: ${filePath}${charCount > 0 ? ` (${charCount} chars)` : ''}`)
+      }
+      // Add head/tail preview for content-producing tools
+      if (content && content.length > 0) {
+        const preview = content.length > MAX_CONTENT_LENGTH ? headTailPreview(content, HEAD_LINES, TAIL_LINES) : content
+        // Detect file extension for syntax highlighting. Only the base name is
+        // inspected, so an extensionless file ("Makefile", "/usr/bin/env") or a
+        // dotted directory ("pkg.v2/main") no longer leaks a path into the fence
+        let ext = 'typescript'
+        if (filePath != null) {
+          const baseName = filePath.split(/[\\/]/).pop() ?? ''
+          const dotIndex = baseName.lastIndexOf('.')
+          ext = dotIndex >= 0 ? baseName.slice(dotIndex + 1) : ''
+        }
+        lines.push(`   \`\`\`${ext}`)
+        lines.push(`   ${preview.split('\n').join('\n   ')}`)
+        lines.push('   ```')
+      }
+      stepNum++
+    } else if (step.type === 'plan') {
+      const entries = step.entries as Array<{ content: string; status: string }>
+      const planSummary = entries.map((e) => `${e.content}: ${e.status}`).join(', ')
+      const truncated = planSummary.length > 80 ? '...' : ''
+      lines.push(`${stepNum}. [PLAN] ${planSummary.slice(0, 80)}${truncated} [→${stepId}]`)
+      stepNum++
+    } else if (step.type === 'message') {
+      const preview = step.content.slice(0, 100)
+      const truncated = step.content.length > 100 ? '...' : ''
+      lines.push(`${stepNum}. [MESSAGE] ${preview}${truncated} [→${stepId}]`)
+      stepNum++
+    }
+  }
+  lines.push('')
+  const outputPreview = result.output.slice(0, 200)
+  const outputTruncated = result.output.length > 200 ? '...' : ''
+  lines.push(`**Output:** ${outputPreview}${outputTruncated}`)
+  lines.push('')
+  // Metadata merges caller-supplied prompt metadata, so values may be nested
+  // objects or arrays; JSON-encode those instead of printing "[object Object]"
+  const metadataStr = Object.entries(result.metadata)
+    .map(([k, v]) => `${k}=${typeof v === 'object' && v !== null ? JSON.stringify(v) : v}`)
+    .join(', ')
+  lines.push(`**Metadata:** ${metadataStr}`)
+  lines.push(`**Tool Errors:** ${result.toolErrors}`)
+  lines.push(`**Duration:** ${result.timing.end - result.timing.start}ms`)
+  if (result.score) {
+    lines.push(`**Score:** ${result.score.pass ? 'PASS' : 'FAIL'} (${result.score.score})`)
+    if (result.score.reasoning) {
+      lines.push(`**Reasoning:** ${result.score.reasoning}`)
+    }
+  }
+  lines.push('')
+  lines.push('---')
+  lines.push('')
+  return lines.join('\n')
+}
+// ============================================================================
+// Summarize Implementation
+// ============================================================================
+/**
+ * Load capture results and render them in the requested compact format.
+ *
+ * @remarks
+ * Each record becomes either a markdown section or one JSON-encoded summary
+ * line; records are joined with newlines. The rendered text is written to
+ * `outputPath` when one is given and printed to stdout otherwise.
+ *
+ * @param config - Summarize configuration
+ * @returns Formatted output string
+ */
+export const runSummarize = async (config: SummarizeConfig): Promise<string> => {
+  const { resultsPath, outputPath, markdown = false } = config
+  const records = await loadResults(resultsPath)
+  // One renderer per run: markdown section or a single summary JSON line
+  const renderRecord = (record: CaptureResult): string =>
+    markdown ? formatMarkdown(record) : JSON.stringify(formatSummary(record))
+  const output = records.map((record) => renderRecord(record)).join('\n')
+  // No output path means the caller wants the text on stdout
+  if (!outputPath) {
+    // biome-ignore lint/suspicious/noConsole: CLI stdout output
+    console.log(output)
+    return output
+  }
+  await Bun.write(resolvePath(outputPath), output)
+  return output
+}
+// ============================================================================
+// CLI Entry Point
+// ============================================================================
+/**
+ * CLI entry point for the `summarize` command.
+ *
+ * @remarks
+ * Prints usage and returns when `--help` is set. Otherwise requires the
+ * results path as the first positional argument (exiting with code 1 when it
+ * is missing) and hands the parsed flags to `runSummarize`.
+ *
+ * @param args - Command line arguments (after 'summarize')
+ */
+export const summarize = async (args: string[]): Promise<void> => {
+  const parsed = parseArgs({
+    args,
+    allowPositionals: true,
+    options: {
+      output: { type: 'string', short: 'o' },
+      markdown: { type: 'boolean', short: 'm', default: false },
+      help: { type: 'boolean', short: 'h' },
+    },
+  })
+  const flags = parsed.values
+  if (flags.help) {
+    // biome-ignore lint/suspicious/noConsole: CLI help output
+    console.log(`
+Usage: agent-eval-harness summarize <results.jsonl> [options]
+Arguments:
+  results.jsonl     Input file with capture results
+Options:
+  -o, --output      Output file (default: stdout)
+  -m, --markdown    Output as markdown instead of JSONL
+  -h, --help        Show this help message
+Output Formats:
+  JSONL (default): Compact summary with id, input, output, toolCalls, duration
+  Markdown (-m):   Human-readable format with step IDs for LLM-as-judge
+Examples:
+  # Summary JSONL for jq analysis
+  agent-eval-harness summarize results.jsonl -o summary.jsonl
+  # Markdown for LLM evaluation
+  agent-eval-harness summarize results.jsonl --markdown -o results.md
+`)
+    return
+  }
+  // The results file is the first (and only expected) positional argument
+  const [resultsPath] = parsed.positionals
+  if (!resultsPath) {
+    console.error('Error: results.jsonl path is required')
+    process.exit(1)
+  }
+  await runSummarize({
+    resultsPath,
+    outputPath: flags.output,
+    markdown: flags.markdown ?? false,
+  })
+}