npm - @plaited/agent-eval-harness - Versions diffs - 0.5.0 - Mend

@plaited/agent-eval-harness 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

package/LICENSE +15 -0
package/README.md +273 -0
package/bin/cli.ts +162 -0
package/bin/tests/cli.spec.ts +529 -0
package/package.json +67 -0
package/src/commands/balance.ts +257 -0
package/src/commands/calibrate.ts +313 -0
package/src/commands/capture.ts +393 -0
package/src/commands/summarize.ts +228 -0
package/src/commands/tests/balance-helpers.spec.ts +279 -0
package/src/commands/tests/calibrate-helpers.spec.ts +226 -0
package/src/commands/tests/capture-cli.spec.ts +190 -0
package/src/commands/tests/capture-helpers.spec.ts +524 -0
package/src/commands/tests/summarize-helpers.spec.ts +339 -0
package/src/commands/tests/trials-calculations.spec.ts +209 -0
package/src/commands/tests/trials-cli.spec.ts +147 -0
package/src/commands/trials.ts +388 -0
package/src/commands/validate-refs.ts +188 -0
package/src/commands.ts +33 -0
package/src/core/core.ts +25 -0
package/src/core/loading.ts +96 -0
package/src/core/output.ts +121 -0
package/src/core/tests/core.spec.ts +309 -0
package/src/core/trajectory.ts +166 -0
package/src/core.ts +28 -0
package/src/harness.ts +46 -0
package/src/headless/headless-cli.ts +430 -0
package/src/headless/headless-history-builder.ts +141 -0
package/src/headless/headless-output-parser.ts +366 -0
package/src/headless/headless-session-manager.ts +587 -0
package/src/headless/headless.schemas.ts +310 -0
package/src/headless/headless.types.ts +19 -0
package/src/headless/tests/headless.spec.ts +678 -0
package/src/headless.ts +72 -0
package/src/integration_tests/claude.spec.ts +157 -0
package/src/integration_tests/gemini.spec.ts +139 -0
package/src/pipeline/compare.ts +325 -0
package/src/pipeline/extract.ts +241 -0
package/src/pipeline/format.ts +292 -0
package/src/pipeline/grade.ts +169 -0
package/src/pipeline/pipeline.ts +41 -0
package/src/pipeline/pipeline.types.ts +241 -0
package/src/pipeline/run.ts +412 -0
package/src/pipeline/tests/pipeline.spec.ts +356 -0
package/src/pipeline.ts +34 -0
package/src/schemas/constants.ts +94 -0
package/src/schemas/grader-loader.ts +174 -0
package/src/schemas/schemas-cli.ts +239 -0
package/src/schemas/schemas.ts +558 -0
package/src/schemas/tests/constants.spec.ts +121 -0
package/src/schemas/tests/fixtures/grader-bad-module.ts +5 -0
package/src/schemas/tests/fixtures/grader-exec-fail.py +9 -0
package/src/schemas/tests/fixtures/grader-exec-invalid.py +6 -0
package/src/schemas/tests/fixtures/grader-exec.py +29 -0
package/src/schemas/tests/fixtures/grader-module.ts +14 -0
package/src/schemas/tests/grader-loader.spec.ts +153 -0
package/src/schemas/tests/schemas-cli.spec.ts +142 -0
package/src/schemas/tests/schemas.spec.ts +606 -0
package/src/schemas.ts +90 -0

package/src/pipeline/tests/pipeline.spec.ts ADDED Viewed

@@ -0,0 +1,356 @@
+/**
+ * Unit tests for pipeline commands.
+ *
+ * @remarks
+ * Tests for the Unix-style pipeline commands:
+ * - format: formatMarkdown, formatCsv helpers
+ * - compare: parseLabeledRun helper
+ * - type validation
+ *
+ * @packageDocumentation
+ */
+import { describe, expect, test } from 'bun:test'
+import type {
+  ComparisonGraderInput,
+  ComparisonGraderResult,
+  ExtractedResult,
+  FormatStyle,
+  GradedResult,
+  LabeledRun,
+  RawOutput,
+} from '../pipeline.types.ts'
+// ============================================================================
+// Type Validation Tests
+// ============================================================================
+describe('RawOutput type', () => {
+  // Each case assigns a literal to a RawOutput-typed binding, so the shape is
+  // checked by the compiler and the assertions confirm the values round-trip.
+  test('accepts valid raw output', () => {
+    const sample: RawOutput = {
+      id: 'test-001',
+      input: 'What is 2+2?',
+      rawLines: ['{"type":"message","content":"4"}'],
+      timing: { start: 1000, end: 2000, total: 1000 },
+    }
+    const { id, timing } = sample
+    expect(id).toBe('test-001')
+    expect(timing.total).toBe(1000)
+  })
+  test('accepts array input for multi-turn', () => {
+    const turns = ['Hello', 'How are you?']
+    const sample: RawOutput = {
+      id: 'multi-001',
+      input: turns,
+      rawLines: [],
+      timing: { start: 0, end: 100, total: 100 },
+    }
+    expect(Array.isArray(sample.input)).toBe(true)
+    expect(sample.input).toHaveLength(2)
+  })
+  test('accepts optional hint', () => {
+    const expectedHint = 'Expected: numeric answer'
+    const sample: RawOutput = {
+      id: 'hint-001',
+      input: 'Calculate something',
+      hint: expectedHint,
+      rawLines: [],
+      timing: { start: 0, end: 0, total: 0 },
+    }
+    expect(sample.hint).toBe('Expected: numeric answer')
+  })
+  test('accepts optional error', () => {
+    const failure = 'Timeout exceeded'
+    const sample: RawOutput = {
+      id: 'error-001',
+      input: 'fail test',
+      rawLines: [],
+      timing: { start: 0, end: 100, total: 100 },
+      error: failure,
+    }
+    expect(sample.error).toBe('Timeout exceeded')
+  })
+})
+describe('ExtractedResult type', () => {
+  // Trajectory steps stay inline so their `type` fields keep literal types.
+  test('accepts valid extracted result', () => {
+    const result: ExtractedResult = {
+      id: 'test-001',
+      input: 'What is 2+2?',
+      output: '4',
+      trajectory: [{ type: 'message', content: '4', timestamp: 100 }],
+      toolErrors: false,
+      timing: { start: 0, end: 100, total: 100 },
+    }
+    const { output, trajectory, toolErrors } = result
+    expect(output).toBe('4')
+    expect(trajectory).toHaveLength(1)
+    expect(toolErrors).toBe(false)
+  })
+  test('accepts thought and tool_call steps', () => {
+    const result: ExtractedResult = {
+      id: 'complex-001',
+      input: 'Create a file',
+      output: 'Done',
+      trajectory: [
+        { type: 'thought', content: 'I need to create a file', timestamp: 50 },
+        {
+          type: 'tool_call',
+          name: 'Write',
+          input: { path: '/tmp/test.txt', content: 'hello' },
+          status: 'completed',
+          timestamp: 200,
+        },
+        { type: 'message', content: 'Done', timestamp: 250 },
+      ],
+      toolErrors: false,
+      timing: { start: 0, end: 300, total: 300 },
+    }
+    // The middle step is the tool invocation
+    const [, middleStep] = result.trajectory
+    expect(result.trajectory).toHaveLength(3)
+    expect(middleStep?.type).toBe('tool_call')
+  })
+})
+describe('GradedResult type', () => {
+  // A graded result carries the extracted-result fields plus a `score` object.
+  test('extends ExtractedResult with score', () => {
+    const passing: GradedResult = {
+      id: 'graded-001',
+      input: 'What is 2+2?',
+      output: '4',
+      trajectory: [],
+      toolErrors: false,
+      timing: { start: 0, end: 100, total: 100 },
+      score: { pass: true, score: 1.0, reasoning: 'Correct answer' },
+    }
+    const verdict = passing.score
+    expect(verdict.pass).toBe(true)
+    expect(verdict.score).toBe(1.0)
+    expect(verdict.reasoning).toBe('Correct answer')
+  })
+  test('accepts failing score', () => {
+    const failing: GradedResult = {
+      id: 'fail-001',
+      input: 'What is 2+2?',
+      output: '5',
+      trajectory: [],
+      toolErrors: false,
+      timing: { start: 0, end: 100, total: 100 },
+      score: { pass: false, score: 0.0, reasoning: 'Incorrect answer' },
+    }
+    const verdict = failing.score
+    expect(verdict.pass).toBe(false)
+    expect(verdict.score).toBe(0.0)
+  })
+})
+describe('FormatStyle type', () => {
+  test('accepts valid format styles', () => {
+    // Assigning the three literals to FormatStyle[] is the compile-time check;
+    // the loop confirms each one is present at runtime.
+    const styles: FormatStyle[] = ['jsonl', 'markdown', 'csv']
+    for (const expected of ['jsonl', 'markdown', 'csv'] as const) {
+      expect(styles).toContain(expected)
+    }
+  })
+})
+describe('LabeledRun type', () => {
+  test('accepts label and path', () => {
+    const labeled: LabeledRun = { label: 'baseline', path: './results/baseline.jsonl' }
+    const { label, path } = labeled
+    expect(label).toBe('baseline')
+    expect(path).toBe('./results/baseline.jsonl')
+  })
+})
+describe('ComparisonGraderInput type', () => {
+  test('accepts multiple runs', () => {
+    // `runs` is keyed by run label; `trajectory` is supplied for only one run
+    const graderInput: ComparisonGraderInput = {
+      id: 'compare-001',
+      input: 'What is 2+2?',
+      runs: {
+        baseline: { output: '4' },
+        experiment: { output: 'Four', trajectory: [] },
+      },
+    }
+    const { runs } = graderInput
+    expect(Object.keys(runs)).toHaveLength(2)
+    expect(runs.baseline?.output).toBe('4')
+    expect(runs.experiment?.trajectory).toEqual([])
+  })
+})
+describe('ComparisonGraderResult type', () => {
+  test('accepts rankings with reasoning', () => {
+    const verdict: ComparisonGraderResult = {
+      rankings: [
+        { run: 'baseline', rank: 1, score: 0.95 },
+        { run: 'experiment', rank: 2, score: 0.8 },
+      ],
+      reasoning: 'Baseline was more concise',
+    }
+    const [topRanked] = verdict.rankings
+    expect(verdict.rankings).toHaveLength(2)
+    expect(topRanked?.rank).toBe(1)
+    expect(verdict.reasoning).toBeDefined()
+  })
+})
+// ============================================================================
+// Helper Contract Tests (helper behavior simulated inline)
+// ============================================================================
+// Note: Some helper functions are not exported from the modules.
+// These tests verify the type contracts that the helpers must satisfy.
+describe('pipeline data flow', () => {
+  // These cases hand-build the next stage's record from the previous one to
+  // show the types line up; no pipeline command is invoked.
+  test('RawOutput can flow to ExtractedResult', () => {
+    const source: RawOutput = {
+      id: 'flow-001',
+      input: 'test',
+      hint: 'expected: something',
+      rawLines: ['{"type":"message","content":"result"}'],
+      timing: { start: 0, end: 100, total: 100 },
+    }
+    // Simulate extraction: identity fields and timing are carried over as-is
+    const { id, input, hint, timing } = source
+    const next: ExtractedResult = {
+      id,
+      input,
+      hint,
+      output: 'result',
+      trajectory: [{ type: 'message', content: 'result', timestamp: 100 }],
+      toolErrors: false,
+      timing,
+    }
+    expect(next.id).toBe(source.id)
+    expect(next.input).toBe(source.input)
+    expect(next.hint).toBe(source.hint)
+  })
+  test('ExtractedResult can flow to GradedResult', () => {
+    const source: ExtractedResult = {
+      id: 'grade-flow-001',
+      input: 'test',
+      output: 'result',
+      trajectory: [],
+      toolErrors: false,
+      timing: { start: 0, end: 100, total: 100 },
+    }
+    // Simulate grading: spread the extracted record and attach a score
+    const next: GradedResult = { ...source, score: { pass: true, score: 1.0 } }
+    expect(next.id).toBe(source.id)
+    expect(next.score.pass).toBe(true)
+  })
+})
+describe('comparison data structures', () => {
+  test('LabeledRun derived from filename', () => {
+    // Simulate parseLabeledRun behavior: label is the file name minus '.jsonl'
+    const path = '/path/to/results-baseline.jsonl'
+    const fileName = path.slice(path.lastIndexOf('/') + 1)
+    const run: LabeledRun = { label: fileName.replace('.jsonl', ''), path }
+    expect(run.label).toBe('results-baseline')
+  })
+  test('LabeledRun with explicit label', () => {
+    // Simulate explicit label:path format, splitting on the first colon only
+    const arg = 'my-baseline:/path/to/results.jsonl'
+    const separatorAt = arg.indexOf(':')
+    const run: LabeledRun = {
+      label: arg.slice(0, separatorAt),
+      path: arg.slice(separatorAt + 1),
+    }
+    expect(run.label).toBe('my-baseline')
+    expect(run.path).toBe('/path/to/results.jsonl')
+  })
+  test('comparison aggregates results by prompt ID', () => {
+    const firstRun = [
+      { id: 'p1', output: 'a' },
+      { id: 'p2', output: 'b' },
+    ]
+    const secondRun = [
+      { id: 'p1', output: 'x' },
+      { id: 'p2', output: 'y' },
+    ]
+    // Simulate comparison aggregation: union of prompt IDs across both runs
+    const promptIds = new Set<string>()
+    for (const row of [...firstRun, ...secondRun]) {
+      promptIds.add(row.id)
+    }
+    expect(promptIds.size).toBe(2)
+    // Look up one prompt's output in a run, defaulting to the empty string
+    const outputFor = (rows: { id: string; output: string }[], id: string) =>
+      rows.find((row) => row.id === id)?.output ?? ''
+    const comparisonInput: ComparisonGraderInput = {
+      id: 'p1',
+      input: 'test prompt',
+      runs: {
+        run1: { output: outputFor(firstRun, 'p1') },
+        run2: { output: outputFor(secondRun, 'p1') },
+      },
+    }
+    expect(comparisonInput.runs.run1?.output).toBe('a')
+    expect(comparisonInput.runs.run2?.output).toBe('x')
+  })
+})
+describe('format style contracts', () => {
+  test('markdown format includes summary when graded', () => {
+    // Verify the type contract for markdown formatting: a pass rate can be
+    // derived from GradedResult[] alone
+    const gradedResults: GradedResult[] = [
+      {
+        id: 't1',
+        input: 'a',
+        output: 'x',
+        trajectory: [],
+        toolErrors: false,
+        timing: { start: 0, end: 100, total: 100 },
+        score: { pass: true, score: 1.0 },
+      },
+      {
+        id: 't2',
+        input: 'b',
+        output: 'y',
+        trajectory: [],
+        toolErrors: false,
+        timing: { start: 0, end: 100, total: 100 },
+        score: { pass: false, score: 0.5 },
+      },
+    ]
+    let passed = 0
+    for (const { score } of gradedResults) {
+      if (score.pass) passed += 1
+    }
+    expect(passed / gradedResults.length).toBe(0.5)
+  })
+  test('csv format escapes special characters', () => {
+    // Test CSV escaping contract: double embedded quotes, turn newlines into a
+    // literal backslash-n, then wrap the field in quotes
+    const escapeCsv = (str: string) => {
+      const quotesDoubled = str.replace(/"/g, '""')
+      const newlinesEscaped = quotesDoubled.replace(/\n/g, '\\n')
+      return `"${newlinesEscaped}"`
+    }
+    expect(escapeCsv('hello')).toBe('"hello"')
+    expect(escapeCsv('say "hello"')).toBe('"say ""hello"""')
+    expect(escapeCsv('line1\nline2')).toBe('"line1\\nline2"')
+  })
+})

package/src/pipeline.ts ADDED Viewed

@@ -0,0 +1,34 @@
+/**
+ * Pipeline commands re-export.
+ *
+ * @remarks
+ * Public API for pipeline commands. Import from here for external use.
+ *
+ * @packageDocumentation
+ */
+// Types
+export type {
+  CompareConfig,
+  ComparisonGrader,
+  ComparisonGraderInput,
+  ComparisonGraderResult,
+  ComparisonRanking,
+  ComparisonResult,
+  ExtractConfig,
+  ExtractedResult,
+  FormatConfig,
+  FormatStyle,
+  GradeConfig,
+  GradedResult,
+  LabeledRun,
+  RawOutput,
+  RunConfig,
+  RunMode,
+} from './pipeline/pipeline.ts'
+// Commands
+export { compare, extract, format, grade, run } from './pipeline/pipeline.ts'

package/src/schemas/constants.ts ADDED Viewed

@@ -0,0 +1,94 @@
+/**
+ * Constants for harness and JSON-RPC protocol operations.
+ *
+ * @remarks
+ * Contains all constant values used across the implementation:
+ * - JSON-RPC method names and protocol version
+ * - JSON-RPC error codes
+ * - Harness defaults (timeouts, preview limits)
+ *
+ * @packageDocumentation
+ */
+// ============================================================================
+// JSON-RPC Protocol Methods
+// ============================================================================
+/**
+ * JSON-RPC method names for headless adapter protocol.
+ *
+ * @remarks
+ * Keys are the symbolic names used in code; values are the method-name
+ * strings. Session-scoped methods share the `session/` prefix, and the
+ * protocol-level cancel uses the `$/` prefix. `as const` keeps every value as
+ * a string literal type, so a union of method names can be derived from this
+ * object.
+ */
+export const PROTOCOL_METHODS = {
+  // Lifecycle
+  INITIALIZE: 'initialize',
+  SHUTDOWN: 'shutdown',
+  // Sessions
+  CREATE_SESSION: 'session/new',
+  LOAD_SESSION: 'session/load',
+  PROMPT: 'session/prompt',
+  CANCEL: 'session/cancel',
+  UPDATE: 'session/update',
+  REQUEST_PERMISSION: 'session/request_permission',
+  SET_MODEL: 'session/set_model',
+  // Protocol-level
+  CANCEL_REQUEST: '$/cancel_request',
+} as const
+// ============================================================================
+// Protocol Version
+// ============================================================================
+/** Protocol version currently in use, typed as the literal `1` */
+export const PROTOCOL_VERSION: 1 = 1
+// ============================================================================
+// JSON-RPC Error Codes
+// ============================================================================
+/**
+ * JSON-RPC error codes.
+ *
+ * @remarks
+ * The first five entries are the error codes predefined by the JSON-RPC 2.0
+ * specification. `REQUEST_CANCELLED` (-32800) is not one of those predefined
+ * codes. NOTE(review): the value matches the Language Server Protocol's
+ * `RequestCancelled` code — confirm that is the intended origin.
+ */
+export const JSON_RPC_ERRORS = {
+  PARSE_ERROR: -32700,
+  INVALID_REQUEST: -32600,
+  METHOD_NOT_FOUND: -32601,
+  INVALID_PARAMS: -32602,
+  INTERNAL_ERROR: -32603,
+  REQUEST_CANCELLED: -32800,
+} as const
+// ============================================================================
+// Client Defaults
+// ============================================================================
+/** Client name used by default in the protocol handshake */
+export const DEFAULT_CLIENT_NAME = 'plaited-eval-harness'
+/** Timeout applied to protocol operations by default, in milliseconds */
+export const DEFAULT_PROTOCOL_TIMEOUT = 30_000
+/** Interval between polls for streaming updates by default, in milliseconds */
+export const DEFAULT_POLLING_INTERVAL = 50
+// ============================================================================
+// Harness Preview Configuration
+// ============================================================================
+/** Lines kept from the start of content when building a preview */
+export const HEAD_LINES = 8
+/** Lines kept from the end of content when building a preview */
+export const TAIL_LINES = 4
+/** Content longer than this is shown as a head/tail preview */
+export const MAX_CONTENT_LENGTH = 500
+// ============================================================================
+// Harness Defaults
+// ============================================================================
+/** Timeout applied to a prompt evaluation by default, in milliseconds */
+export const DEFAULT_HARNESS_TIMEOUT = 60_000
+/** Trials run by default for pass@k analysis */
+export const DEFAULT_TRIAL_COUNT = 5
+/** Sample size used by default for calibration */
+export const DEFAULT_CALIBRATION_SAMPLE_SIZE = 10

package/src/schemas/grader-loader.ts ADDED Viewed

@@ -0,0 +1,174 @@
+/**
+ * Polyglot grader loader module.
+ *
+ * @remarks
+ * Supports loading graders from:
+ * - TypeScript/JavaScript modules (import as ES module)
+ * - Executable scripts (Python, Ruby, shell, etc. via subprocess)
+ *
+ * Executable graders use stdin/stdout JSON protocol:
+ * - Input: `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
+ * - Output: `{"pass": true, "score": 1.0, "reasoning": "..."}`
+ *
+ * @packageDocumentation
+ */
+import { isAbsolute } from 'node:path'
+import type { Grader, TrajectoryStep } from './schemas.ts'
+import { GraderResultSchema } from './schemas.ts'
+// ============================================================================
+// Constants
+// ============================================================================
+/** Extensions treated as JavaScript/TypeScript modules (imported as ES modules) */
+const JS_EXTENSIONS = ['.ts', '.js', '.mjs', '.cjs']
+// ============================================================================
+// Helpers
+// ============================================================================
+/** True when `path` ends with one of the `JS_EXTENSIONS` suffixes (case-sensitive match) */
+const isJsModule = (path: string): boolean => {
+  for (const suffix of JS_EXTENSIONS) {
+    if (path.endsWith(suffix)) return true
+  }
+  return false
+}
+/**
+ * Resolve a grader path to an absolute path.
+ *
+ * @remarks
+ * Absolute paths are returned unchanged. Absoluteness is decided by
+ * `isAbsolute` from `node:path` rather than by a leading `/`, so
+ * platform-specific absolute forms (for example a Windows drive-letter path)
+ * are no longer mistaken for relative paths and prefixed with the working
+ * directory. Relative paths are appended to `process.cwd()` exactly as before.
+ *
+ * @param path - Absolute path, or path relative to the current working directory
+ * @returns Absolute path
+ */
+const resolvePath = (path: string): string => {
+  if (isAbsolute(path)) return path
+  return `${process.cwd()}/${path}`
+}
+// ============================================================================
+// Executable Grader
+// ============================================================================
+/** JSON payload written to an executable grader's stdin */
+type ExecGraderInput = {
+  input: string | string[]
+  output: string
+  hint?: string
+  trajectory?: TrajectoryStep[]
+}
+/**
+ * Build a grader that delegates scoring to an external executable.
+ *
+ * @remarks
+ * Each call spawns `execPath`, supplies the serialized grading parameters as
+ * its stdin, and reads the verdict as JSON from its stdout. The call rejects
+ * when the process exits non-zero, prints nothing, prints something that is
+ * not JSON, or prints JSON that `GraderResultSchema` does not accept.
+ *
+ * @param execPath - Absolute path to the executable script
+ * @returns Grader function
+ */
+const createExecGrader =
+  (execPath: string): Grader =>
+  async (params) => {
+    const payload: ExecGraderInput = {
+      input: params.input,
+      output: params.output,
+      hint: params.hint,
+      trajectory: params.trajectory,
+    }
+    const child = Bun.spawn([execPath], {
+      stdin: new TextEncoder().encode(JSON.stringify(payload)),
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    // Drain both streams while waiting for exit
+    const [out, err, code] = await Promise.all([
+      new Response(child.stdout).text(),
+      new Response(child.stderr).text(),
+      child.exited,
+    ])
+    if (code !== 0) {
+      throw new Error(`Grader exited with code ${code}: ${err.trim() || 'No error output'}`)
+    }
+    const body = out.trim()
+    if (body.length === 0) {
+      throw new Error('Grader produced no output')
+    }
+    let candidate: unknown
+    try {
+      candidate = JSON.parse(body)
+    } catch {
+      // Include only the first 100 characters of the offending output
+      throw new Error(`Grader output is not valid JSON: ${body.slice(0, 100)}`)
+    }
+    const validation = GraderResultSchema.safeParse(candidate)
+    if (validation.success) {
+      return validation.data
+    }
+    throw new Error(`Invalid grader result: ${validation.error.message}`)
+  }
+// ============================================================================
+// Module Grader
+// ============================================================================
+/**
+ * Import a JavaScript/TypeScript module and return its grader.
+ *
+ * @remarks
+ * The module is loaded with a dynamic `import()`. Its `grade` export must be
+ * a function; it is returned typed as `Grader` without further validation.
+ *
+ * @param modulePath - Absolute path to the module
+ * @returns Grader function
+ * @throws Error if the module has no `grade` function export
+ */
+const loadModuleGrader = async (modulePath: string): Promise<Grader> => {
+  const { grade } = await import(modulePath)
+  if (typeof grade === 'function') {
+    return grade as Grader
+  }
+  throw new Error(`Grader module must export a 'grade' function`)
+}
+// ============================================================================
+// Public API
+// ============================================================================
+/**
+ * Load a grader from a file path.
+ *
+ * @remarks
+ * The path is first made absolute and checked for existence. How the grader
+ * is loaded then depends on the file extension:
+ * - `.ts`, `.js`, `.mjs`, `.cjs` → Import as ES module
+ * - Everything else → Execute as subprocess
+ *
+ * @param graderPath - Path to the grader (relative or absolute)
+ * @returns Grader function
+ * @throws Error if grader not found or invalid
+ *
+ * @example
+ * ```typescript
+ * // TypeScript grader
+ * const grader = await loadGrader('./grader.ts')
+ *
+ * // Python grader
+ * const grader = await loadGrader('./grader.py')
+ *
+ * // Any executable
+ * const grader = await loadGrader('./my-grader')
+ * ```
+ */
+export const loadGrader = async (graderPath: string): Promise<Grader> => {
+  const absolutePath = resolvePath(graderPath)
+  // Check existence up front so a missing file gets a dedicated error message
+  const found = await Bun.file(absolutePath).exists()
+  if (!found) {
+    throw new Error(`Grader not found: ${absolutePath}`)
+  }
+  return isJsModule(absolutePath) ? loadModuleGrader(absolutePath) : createExecGrader(absolutePath)
+}