@plaited/agent-eval-harness 0.6.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -42,6 +42,8 @@ import type {
   TrajectoryInfo,
   TrajectoryRichness,
 } from '../schemas.ts'
+import { type CompareInputFormat, detectAndValidateFormat } from './compare-format-detection.ts'
+import { runTrialsCompare } from './compare-trials.ts'
 import type {
   CompareConfig,
   ComparisonGrader,
@@ -647,6 +649,7 @@ export const compare = async (args: string[]): Promise<void> => {
       strategy: { type: 'string', short: 's' },
       output: { type: 'string', short: 'o' },
       format: { type: 'string', short: 'f' },
+      'input-format': { type: 'string' },
       progress: { type: 'boolean', default: false },
       help: { type: 'boolean', short: 'h' },
     },
@@ -658,6 +661,7 @@ export const compare = async (args: string[]): Promise<void> => {
 Usage: agent-eval-harness compare [files...] [options]
 
 Compare multiple runs of the same prompts and generate aggregate report.
+Supports both CaptureResult (single-run) and TrialResult (multi-run reliability) formats.
 
 Arguments:
   files...  Result files to compare (positional, unlimited)
@@ -668,30 +672,47 @@ Options:
   -g, --grader      Path to custom grader (required if strategy=custom)
   -o, --output      Output file (default: stdout)
   -f, --format      Output format: json (default) or markdown
+  --input-format    Input format: auto (default), capture, or trials
   --progress        Show progress to stderr
   -h, --help        Show this help message
 
+Input Formats:
+  auto     Auto-detect from file content (default)
+  capture  CaptureResult format (trajectory/timing fields)
+  trials   TrialResult format (trials/k fields) for pass@k analysis
+
 Built-in Strategies:
-  weighted      Configurable weights for quality, latency, reliability
-                Customize via: COMPARE_QUALITY, COMPARE_LATENCY, COMPARE_RELIABILITY
-  statistical   Bootstrap sampling for confidence intervals
-                Customize via: COMPARE_BOOTSTRAP_ITERATIONS
+  For CaptureResult (capture format):
+    weighted      Configurable weights for quality, latency, reliability
+                  Env vars: COMPARE_QUALITY, COMPARE_LATENCY, COMPARE_RELIABILITY
+    statistical   Bootstrap sampling for confidence intervals
+                  Env var: COMPARE_BOOTSTRAP_ITERATIONS
+
+  For TrialResult (trials format):
+    weighted      Configurable weights for capability, reliability, consistency
+                  Env vars: COMPARE_CAPABILITY, COMPARE_RELIABILITY, COMPARE_CONSISTENCY
+    statistical   Bootstrap sampling for passAtK confidence intervals
+                  Env var: COMPARE_BOOTSTRAP_ITERATIONS
 
 Custom Grader:
   Must export 'grade' or 'compare' function with signature:
-  (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>
+  CaptureResult: (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>
+  TrialResult:   (params: TrialsComparisonGraderInput) => Promise<ComparisonGraderResult>
 
 Examples:
-  # Default: weighted strategy with JSON output
+  # Default: auto-detect format, weighted strategy, JSON output
   agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
 
+  # Explicit trials format for pass@k comparison
+  agent-eval-harness compare trials1.jsonl trials2.jsonl --input-format trials -o comparison.json
+
+  # Trials comparison with custom weights
+  COMPARE_CAPABILITY=0.5 COMPARE_RELIABILITY=0.3 COMPARE_CONSISTENCY=0.2 \\
+  agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
+
   # Statistical significance strategy
   agent-eval-harness compare run1.jsonl run2.jsonl --strategy statistical -o comparison.json
 
-  # Custom weights
-  COMPARE_QUALITY=0.7 COMPARE_LATENCY=0.2 COMPARE_RELIABILITY=0.1 \\
-  agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
-
   # Markdown report
   agent-eval-harness compare run1.jsonl run2.jsonl --format markdown -o report.md
@@ -749,19 +770,54 @@ Examples:
     process.exit(1)
   }
 
-  // Validate format (explicit format takes precedence, otherwise infer from extension)
+  // Validate output format (explicit format takes precedence, otherwise infer from extension)
   const format = inferFormat(values.output, values.format)
   if (values.format && !['json', 'markdown'].includes(values.format)) {
     console.error(`Error: Invalid format '${values.format}'. Use: json or markdown`)
     process.exit(1)
   }
 
-  await runCompare({
-    runs,
-    strategy,
-    graderPath: values.grader,
-    outputPath: values.output,
-    progress: values.progress,
-    format,
-  })
+  // Validate input format
+  const inputFormatArg = values['input-format']
+  if (inputFormatArg && !['auto', 'capture', 'trials'].includes(inputFormatArg)) {
+    console.error(`Error: Invalid input-format '${inputFormatArg}'. Use: auto, capture, or trials`)
+    process.exit(1)
+  }
+
+  // Detect or use specified input format
+  let inputFormat: CompareInputFormat
+  try {
+    if (inputFormatArg === 'capture') {
+      inputFormat = 'capture'
+    } else if (inputFormatArg === 'trials') {
+      inputFormat = 'trials'
+    } else {
+      // Auto-detect from file content
+      inputFormat = await detectAndValidateFormat(runs.map((r) => r.path))
+    }
+  } catch (error) {
+    console.error(`Error: ${error instanceof Error ? error.message : error}`)
+    process.exit(1)
+  }
+
+  // Route to appropriate comparison function based on input format
+  if (inputFormat === 'trials') {
+    await runTrialsCompare({
+      runs,
+      strategy,
+      graderPath: values.grader,
+      outputPath: values.output,
+      progress: values.progress,
+      format,
+    })
+  } else {
+    await runCompare({
+      runs,
+      strategy,
+      graderPath: values.grader,
+      outputPath: values.output,
+      progress: values.progress,
+      format,
+    })
+  }
 }
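
The `compare-format-detection.ts` module the CLI now imports is not included in this diff. Judging from the error messages exercised in the new tests further down ('Empty file', 'Invalid JSON', 'Unable to detect format'), the heuristic likely inspects the first non-empty JSONL line. A minimal sketch, assuming detection keys off the `trials`/`k` versus `trajectory`/`timing` fields seen in the test fixtures (the actual implementation may differ):

```ts
// Hypothetical sketch; the real compare-format-detection.ts is not in this diff.
export type CompareInputFormat = 'capture' | 'trials'

export const detectInputFormat = async (path: string): Promise<CompareInputFormat> => {
  const text = await Bun.file(path).text()
  // Use the first non-empty line of the JSONL file
  const line = text.split('\n').find((l) => l.trim().length > 0)
  if (!line) throw new Error(`Empty file: ${path}`)
  let parsed: Record<string, unknown>
  try {
    parsed = JSON.parse(line)
  } catch {
    throw new Error(`Invalid JSON in ${path}`)
  }
  // TrialResult rows carry trials/k; CaptureResult rows carry trajectory/timing
  if (Array.isArray(parsed.trials) && typeof parsed.k === 'number') return 'trials'
  if ('trajectory' in parsed && 'timing' in parsed) return 'capture'
  throw new Error(`Unable to detect format for ${path}`)
}
```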
@@ -52,6 +52,7 @@ export const runGrade = async (
     hint: extracted.hint,
     trajectory: extracted.trajectory,
     metadata: extracted.metadata,
+    cwd: extracted.cwd,
   })
 
   const graded: GradedResult = {
@@ -59,6 +60,11 @@ export const runGrade = async (
     score,
   }
 
+  // Merge outcome from grader if present
+  if (score.outcome) {
+    graded.outcome = score.outcome
+  }
+
   const icon = score.pass ? '✓' : '✗'
   logProgress(` ${icon} score=${score.score.toFixed(2)}`, progress)
 
@@ -10,7 +10,7 @@
  * @packageDocumentation
  */
 
-import type { GraderResult, TrajectoryStep } from '../schemas.ts'
+import type { GraderResult, TrajectoryStep, TrialEntry } from '../schemas.ts'
 
 /**
  * Raw output from the `run` command.
@@ -62,6 +62,8 @@ export type ExtractedResult = {
   toolErrors: boolean
   /** Optional metadata from original prompt */
   metadata?: Record<string, unknown>
+  /** Working directory path (optional, for git-based grading) */
+  cwd?: string
   /** Timing metadata */
   timing: {
     start: number
@@ -77,10 +79,13 @@ export type ExtractedResult = {
  *
  * @remarks
  * Adds grader score to extracted result.
+ * Outcome field is merged from grader result if present.
  */
 export type GradedResult = ExtractedResult & {
   /** Grader score */
   score: GraderResult
+  /** Outcome data from grader (if grader returned outcome) */
+  outcome?: Record<string, unknown>
 }
 
 /**
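
Together with the `runGrade` change above, this means a grader can attach an opaque `outcome` object that gets copied onto the graded record. An illustrative fragment (the keys inside `outcome` are hypothetical; the harness treats it as an opaque `Record<string, unknown>`, and the import path is a placeholder):

```ts
import type { ExtractedResult, GradedResult, GraderResult } from './types.ts' // path illustrative

declare const extracted: ExtractedResult // produced by the extract step

const score: GraderResult = {
  pass: true,
  score: 0.92,
  outcome: { filesChanged: 3, testsAdded: true }, // hypothetical git-derived signals
}

const graded: GradedResult = { ...extracted, score }
// Mirrors the merge in runGrade:
if (score.outcome) {
  graded.outcome = score.outcome
}
```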
@@ -267,3 +272,54 @@ export type ComparisonResult = {
   /** Optional reasoning */
   reasoning?: string
 }
+
+// ============================================================================
+// Trials Comparison Types
+// ============================================================================
+
+/**
+ * Run data for trials comparison.
+ *
+ * @remarks
+ * Contains the trials-specific metrics (passAtK, passExpK) plus
+ * the individual trial entries for deeper analysis.
+ */
+export type TrialsComparisonRunData = {
+  /** Simple pass rate: passes / k */
+  passRate?: number
+  /** pass@k: probability of at least one pass in k samples */
+  passAtK?: number
+  /** pass^k: probability of all k samples passing */
+  passExpK?: number
+  /** Number of trials (k) */
+  k: number
+  /** Individual trial results */
+  trials: TrialEntry[]
+}
+
+/**
+ * Input to trials comparison grader function.
+ *
+ * @remarks
+ * Provides all runs' trial results for a single prompt ID
+ * so the grader can compare capability and reliability.
+ */
+export type TrialsComparisonGraderInput = {
+  /** Test case identifier */
+  id: string
+  /** Original prompt input */
+  input: string | string[]
+  /** Grader context hint */
+  hint?: string
+  /** Results keyed by run label */
+  runs: Record<string, TrialsComparisonRunData>
+}
+
+/**
+ * Trials comparison grader function type.
+ *
+ * @remarks
+ * User-provided graders implement this interface to compare
+ * multiple runs of the same prompt using trials data.
+ */
+export type TrialsComparisonGrader = (params: TrialsComparisonGraderInput) => Promise<ComparisonGraderResult>
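
A custom grader targeting the trials format receives one `TrialsComparisonGraderInput` per prompt ID. A minimal sketch of one, assuming `ComparisonGraderResult` accepts per-run scores keyed by label plus optional reasoning (check `schemas.ts` for the exact shape, and adjust the import path to wherever the package exports these types):

```ts
import type { TrialsComparisonGrader } from '@plaited/agent-eval-harness'

// Rank runs by blending capability (pass@k) with consistency (pass^k).
// The 0.7/0.3 weights are illustrative, not the package's defaults.
export const compare: TrialsComparisonGrader = async ({ id, runs }) => {
  const scores: Record<string, number> = {}
  for (const [label, data] of Object.entries(runs)) {
    scores[label] = 0.7 * (data.passAtK ?? 0) + 0.3 * (data.passExpK ?? 0)
  }
  return {
    scores,
    reasoning: `Blended pass@k capability and pass^k consistency for ${id}`,
  }
}
```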
@@ -0,0 +1,142 @@
+/**
+ * Unit tests for compare format detection.
+ *
+ * @remarks
+ * Tests for auto-detecting CaptureResult vs TrialResult format.
+ *
+ * @packageDocumentation
+ */
+
+import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
+import { detectAndValidateFormat, detectInputFormat } from '../compare-format-detection.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const CAPTURE_RESULT = JSON.stringify({
+  id: 'test-001',
+  input: 'Hello',
+  output: 'Hi there',
+  trajectory: [{ type: 'message', content: 'Hi', timestamp: 1234567890 }],
+  timing: { start: 1234567890, end: 1234567891, total: 1, sessionCreation: 0 },
+  metadata: {},
+  toolErrors: false,
+})
+
+const TRIAL_RESULT = JSON.stringify({
+  id: 'test-001',
+  input: 'Hello',
+  k: 3,
+  passRate: 0.67,
+  passAtK: 0.9,
+  passExpK: 0.3,
+  trials: [
+    { trialNum: 1, output: 'Hi', trajectory: [], duration: 100, pass: true, score: 1.0 },
+    { trialNum: 2, output: 'Hello', trajectory: [], duration: 120, pass: true, score: 0.8 },
+    { trialNum: 3, output: 'Error', trajectory: [], duration: 150, pass: false, score: 0.2 },
+  ],
+})
+
+const tempDir = `${import.meta.dir}/.test-tmp/format-detection`
+
+beforeAll(async () => {
+  await Bun.$`mkdir -p ${tempDir}`
+})
+
+afterAll(async () => {
+  await Bun.$`rm -rf ${tempDir}`
+})
+
+// ============================================================================
+// detectInputFormat Tests
+// ============================================================================
+
+describe('detectInputFormat', () => {
+  test('detects CaptureResult format', async () => {
+    const path = `${tempDir}/capture.jsonl`
+    await Bun.write(path, `${CAPTURE_RESULT}\n`)
+
+    const format = await detectInputFormat(path)
+
+    expect(format).toBe('capture')
+  })
+
+  test('detects TrialResult format', async () => {
+    const path = `${tempDir}/trial.jsonl`
+    await Bun.write(path, `${TRIAL_RESULT}\n`)
+
+    const format = await detectInputFormat(path)
+
+    expect(format).toBe('trials')
+  })
+
+  test('throws on empty file', async () => {
+    const path = `${tempDir}/empty.jsonl`
+    await Bun.write(path, '')
+
+    await expect(detectInputFormat(path)).rejects.toThrow('Empty file')
+  })
+
+  test('throws on invalid JSON', async () => {
+    const path = `${tempDir}/invalid.jsonl`
+    await Bun.write(path, 'not json\n')
+
+    await expect(detectInputFormat(path)).rejects.toThrow('Invalid JSON')
+  })
+
+  test('throws on unrecognized format', async () => {
+    const path = `${tempDir}/unknown.jsonl`
+    await Bun.write(path, `${JSON.stringify({ id: 'test', foo: 'bar' })}\n`)
+
+    await expect(detectInputFormat(path)).rejects.toThrow('Unable to detect format')
+  })
+
+  test('ignores empty lines and uses first non-empty line', async () => {
+    const path = `${tempDir}/with-empty.jsonl`
+    await Bun.write(path, `\n\n${CAPTURE_RESULT}\n`)
+
+    const format = await detectInputFormat(path)
+
+    expect(format).toBe('capture')
+  })
+})
+
+// ============================================================================
+// detectAndValidateFormat Tests
+// ============================================================================
+
+describe('detectAndValidateFormat', () => {
+  test('validates all files have same format', async () => {
+    const path1 = `${tempDir}/capture1.jsonl`
+    const path2 = `${tempDir}/capture2.jsonl`
+    await Bun.write(path1, `${CAPTURE_RESULT}\n`)
+    await Bun.write(path2, `${CAPTURE_RESULT}\n`)
+
+    const format = await detectAndValidateFormat([path1, path2])
+
+    expect(format).toBe('capture')
+  })
+
+  test('throws on format mismatch', async () => {
+    const capturePath = `${tempDir}/capture-mixed.jsonl`
+    const trialPath = `${tempDir}/trial-mixed.jsonl`
+    await Bun.write(capturePath, `${CAPTURE_RESULT}\n`)
+    await Bun.write(trialPath, `${TRIAL_RESULT}\n`)
+
+    await expect(detectAndValidateFormat([capturePath, trialPath])).rejects.toThrow('Format mismatch')
+  })
+
+  test('throws on empty file list', async () => {
+    await expect(detectAndValidateFormat([])).rejects.toThrow('No files provided')
+  })
+
+  test('works with single file', async () => {
+    const path = `${tempDir}/single-trial.jsonl`
+    await Bun.write(path, `${TRIAL_RESULT}\n`)
+
+    const format = await detectAndValidateFormat([path])
+
+    expect(format).toBe('trials')
+  })
+})
@@ -0,0 +1,277 @@
+/**
+ * Unit tests for trials comparison module.
+ *
+ * @remarks
+ * Tests for runTrialsCompare and supporting functions.
+ *
+ * @packageDocumentation
+ */
+
+import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
+import { buildTrialsIndex, runTrialsCompare } from '../compare-trials.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const createTrialResult = (id: string, passAtK: number, passExpK: number, k: number = 3) => ({
+  id,
+  input: `Prompt for ${id}`,
+  k,
+  passRate: passAtK,
+  passAtK,
+  passExpK,
+  trials: Array.from({ length: k }, (_, i) => ({
+    trialNum: i + 1,
+    output: `Output ${i + 1}`,
+    trajectory: [],
+    duration: 100 + i * 10,
+    pass: Math.random() < passAtK,
+    score: passAtK,
+  })),
+})
+
+const tempDir = `${import.meta.dir}/.test-tmp/compare-trials`
+
+beforeAll(async () => {
+  await Bun.$`mkdir -p ${tempDir}`
+})
+
+afterAll(async () => {
+  await Bun.$`rm -rf ${tempDir}`
+})
+
+// ============================================================================
+// buildTrialsIndex Tests
+// ============================================================================
+
+describe('buildTrialsIndex', () => {
+  test('builds index from JSONL file', async () => {
+    const path = `${tempDir}/trials-index.jsonl`
+    const trial1 = createTrialResult('test-001', 0.9, 0.3)
+    const trial2 = createTrialResult('test-002', 0.8, 0.6)
+    await Bun.write(path, [JSON.stringify(trial1), JSON.stringify(trial2)].join('\n'))
+
+    const index = await buildTrialsIndex(path)
+
+    expect(index.size).toBe(2)
+    expect(index.get('test-001')?.passAtK).toBe(0.9)
+    expect(index.get('test-002')?.passExpK).toBe(0.6)
+  })
+
+  test('handles empty file', async () => {
+    const path = `${tempDir}/empty-trials.jsonl`
+    await Bun.write(path, '')
+
+    const index = await buildTrialsIndex(path)
+
+    expect(index.size).toBe(0)
+  })
+
+  test('throws on invalid JSON', async () => {
+    const path = `${tempDir}/invalid-trials.jsonl`
+    await Bun.write(path, 'not json\n')
+
+    await expect(buildTrialsIndex(path)).rejects.toThrow()
+  })
+})
+
+// ============================================================================
+// runTrialsCompare Tests
+// ============================================================================
+
+describe('runTrialsCompare', () => {
+  test('compares two trial runs and produces report', async () => {
+    const run1Path = `${tempDir}/run1.jsonl`
+    const run2Path = `${tempDir}/run2.jsonl`
+
+    const trial1a = createTrialResult('test-001', 0.9, 0.7)
+    const trial1b = createTrialResult('test-002', 0.8, 0.5)
+    const trial2a = createTrialResult('test-001', 0.95, 0.9)
+    const trial2b = createTrialResult('test-002', 0.6, 0.4)
+
+    await Bun.write(run1Path, [JSON.stringify(trial1a), JSON.stringify(trial1b)].join('\n'))
+    await Bun.write(run2Path, [JSON.stringify(trial2a), JSON.stringify(trial2b)].join('\n'))
+
+    const outputPath = `${tempDir}/comparison.json`
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'baseline', path: run1Path },
+        { label: 'variant', path: run2Path },
+      ],
+      outputPath,
+      progress: false,
+    })
+
+    expect(report.meta.inputFormat).toBe('trials')
+    expect(report.meta.runs).toEqual(['baseline', 'variant'])
+    expect(report.meta.promptCount).toBe(2)
+    expect(report.capability).toBeDefined()
+    expect(report.reliability).toBeDefined()
+    expect(report.flakiness).toBeDefined()
+    expect(report.headToHead.capability.length).toBeGreaterThan(0)
+
+    // Verify output file was written
+    const outputExists = await Bun.file(outputPath).exists()
+    expect(outputExists).toBe(true)
+  })
+
+  test('throws with fewer than 2 runs', async () => {
+    const run1Path = `${tempDir}/single-run.jsonl`
+    await Bun.write(run1Path, JSON.stringify(createTrialResult('test-001', 0.9, 0.7)))
+
+    await expect(
+      runTrialsCompare({
+        runs: [{ label: 'only', path: run1Path }],
+        progress: false,
+      }),
+    ).rejects.toThrow('At least 2 runs required')
+  })
+
+  test('skips prompts only in one run', async () => {
+    const run1Path = `${tempDir}/partial1.jsonl`
+    const run2Path = `${tempDir}/partial2.jsonl`
+
+    // Only run1 has test-001
+    const trial1a = createTrialResult('test-001', 0.9, 0.7)
+    // Both have test-002
+    const trial1b = createTrialResult('test-002', 0.8, 0.5)
+    const trial2b = createTrialResult('test-002', 0.6, 0.4)
+
+    await Bun.write(run1Path, [JSON.stringify(trial1a), JSON.stringify(trial1b)].join('\n'))
+    await Bun.write(run2Path, JSON.stringify(trial2b))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'run1', path: run1Path },
+        { label: 'run2', path: run2Path },
+      ],
+      progress: false,
+    })
+
+    // Only test-002 should be compared (both runs have it)
+    expect(report.headToHead.overall.length).toBeGreaterThan(0)
+    // Per-prompt should only have test-002
+    const perPromptIds = report.perPrompt?.map((p) => p.id) ?? []
+    expect(perPromptIds).toContain('test-002')
+    expect(perPromptIds).not.toContain('test-001')
+  })
+
+  test('generates markdown output when format is markdown', async () => {
+    const run1Path = `${tempDir}/md-run1.jsonl`
+    const run2Path = `${tempDir}/md-run2.jsonl`
+    const outputPath = `${tempDir}/report.md`
+
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.8, 0.6)
+
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+
+    await runTrialsCompare({
+      runs: [
+        { label: 'agent1', path: run1Path },
+        { label: 'agent2', path: run2Path },
+      ],
+      outputPath,
+      format: 'markdown',
+      progress: false,
+    })
+
+    const content = await Bun.file(outputPath).text()
+    expect(content).toContain('# Trials Comparison Report')
+    expect(content).toContain('## Capability')
+    expect(content).toContain('## Reliability')
+    expect(content).toContain('## Flakiness')
+    expect(content).toContain('agent1')
+    expect(content).toContain('agent2')
+  })
+
+  test('uses statistical strategy when specified', async () => {
+    const run1Path = `${tempDir}/stat-run1.jsonl`
+    const run2Path = `${tempDir}/stat-run2.jsonl`
+
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.5, 0.3)
+
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'better', path: run1Path },
+        { label: 'worse', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Report should be generated without error
+    expect(report.meta.runs).toEqual(['better', 'worse'])
+  })
+
+  test('computes correct capability metrics', async () => {
+    const run1Path = `${tempDir}/cap-run1.jsonl`
+
+    // Create 3 prompts with known passAtK values
+    const trials = [
+      createTrialResult('p1', 1.0, 0.8), // passAtK = 1.0
+      createTrialResult('p2', 0.5, 0.3), // passAtK = 0.5
+      createTrialResult('p3', 0.8, 0.6), // passAtK = 0.8
+    ]
+    // Average passAtK = (1.0 + 0.5 + 0.8) / 3 = 0.767
+    // Sorted: 0.5, 0.8, 1.0 -> median = 0.8
+
+    await Bun.write(run1Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const run2Path = `${tempDir}/cap-run2.jsonl`
+    await Bun.write(run2Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'test', path: run1Path },
+        { label: 'test2', path: run2Path },
+      ],
+      progress: false,
+    })
+
+    const cap = report.capability.test
+    expect(cap).toBeDefined()
+    // Average should be approximately 0.767
+    expect(cap?.avgPassAtK).toBeCloseTo(0.767, 2)
+    // Median of [0.5, 0.8, 1.0] = 0.8
+    expect(cap?.medianPassAtK).toBeCloseTo(0.8, 2)
+  })
+
+  test('identifies flaky prompts correctly', async () => {
+    const run1Path = `${tempDir}/flaky-run1.jsonl`
+
+    // Create prompts with varying flakiness
+    const trials = [
+      createTrialResult('consistent', 0.9, 0.9), // flakiness = 0
+      createTrialResult('flaky', 0.9, 0.1), // flakiness = 0.8
+      createTrialResult('moderate', 0.7, 0.5), // flakiness = 0.2
+    ]
+
+    await Bun.write(run1Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const run2Path = `${tempDir}/flaky-run2.jsonl`
+    await Bun.write(run2Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'test', path: run1Path },
+        { label: 'test2', path: run2Path },
+      ],
+      progress: false,
+    })
+
+    const flak = report.flakiness.test
+    expect(flak).toBeDefined()
+    // 2 prompts have non-zero flakiness
+    expect(flak?.flakyPromptCount).toBe(2)
+    // Top flaky should include 'flaky' prompt
+    const topFlakyIds = flak?.topFlakyPrompts.map((p) => p.id) ?? []
+    expect(topFlakyIds).toContain('flaky')
+  })
+})
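
The fixture comments above imply the flakiness metric is `passAtK - passExpK` (0.9/0.9 → 0, 0.9/0.1 → 0.8, 0.7/0.5 → 0.2). Under an independence assumption with per-trial pass probability p, the three metrics relate as in this worked sketch (the package's own estimator may differ):

```ts
// Worked illustration of the trials metrics for one prompt.
// For k independent trials with per-trial pass probability p:
//   passRate ≈ p, passAtK = 1 - (1 - p)^k, passExpK = p^k
const trialOutcomes = [true, true, false] // k = 3 observed trials
const k = trialOutcomes.length
const passes = trialOutcomes.filter(Boolean).length

const passRate = passes / k // 0.667
const p = passRate
const passAtK = 1 - (1 - p) ** k // ≈ 0.963: at least one of k passes
const passExpK = p ** k // ≈ 0.296: all k pass
const flakiness = passAtK - passExpK // ≈ 0.667: gap between best and worst case

console.log({ passRate, passAtK, passExpK, flakiness })
```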
@@ -47,6 +47,7 @@ const resolvePath = (path: string): string => {
  * The metadata field contains arbitrary key-value pairs from the original
  * prompt JSONL (e.g., category, difficulty, tags). Use this to implement
  * category-specific grading logic or filter calibration samples.
+ * The cwd field provides the working directory path for git-based outcome detection.
  */
 type ExecGraderInput = {
   input: string | string[]
@@ -54,6 +55,7 @@ type ExecGraderInput = {
   hint?: string
   trajectory?: TrajectoryStep[]
   metadata?: Record<string, unknown>
+  cwd?: string
 }
 
 /**
@@ -73,6 +75,8 @@ const createExecGrader = (execPath: string): Grader => {
       output: params.output,
       hint: params.hint,
       trajectory: params.trajectory,
+      metadata: params.metadata,
+      cwd: params.cwd,
     }
 
     const inputJson = JSON.stringify(input)
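
The exec grader now forwards `metadata` and `cwd` in the serialized payload (`JSON.stringify(input)`); how that payload reaches the external process (stdin versus an argument) is not visible in this diff. Assuming a stdin-in/stdout-out contract, a git-based outcome grader could look like this sketch:

```ts
#!/usr/bin/env bun
// Hypothetical exec grader: reads an ExecGraderInput JSON payload on stdin
// and prints a GraderResult JSON to stdout. The stdin/stdout protocol is an
// assumption; consult the harness docs for the actual contract.
const input = JSON.parse(await Bun.stdin.text()) as {
  input: string | string[]
  output: string
  hint?: string
  metadata?: Record<string, unknown>
  cwd?: string
}

let outcome: Record<string, unknown> | undefined
if (input.cwd) {
  // Use the working directory for git-based outcome detection
  const diff = await Bun.$`git -C ${input.cwd} diff --stat`.text()
  outcome = { changedFiles: diff.trim().split('\n').length }
}

const pass = input.output.length > 0 // placeholder pass criterion
console.log(JSON.stringify({ pass, score: pass ? 1 : 0, outcome }))
```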