@plaited/agent-eval-harness 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@plaited/agent-eval-harness",
- "version": "0.8.0",
+ "version": "0.9.0",
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
  "license": "ISC",
  "engines": {
@@ -42,8 +42,10 @@
  "check:types": "tsc --noEmit",
  "check:write": "biome check --write && format-package --write",
  "prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
- "test": "bun test ./**/tests/*.spec.ts",
- "test:integration": "bun test ./**/integration_tests/*.spec.ts"
+ "test": "bun run test:bin && bun test:src",
+ "test:bin": "bun test bin/tests/*.spec.ts",
+ "test:integration": "bun test ./**/integration_tests/*.spec.ts",
+ "test:src": "bun test src/**/tests/*.spec.ts"
  },
  "lint-staged": {
  "*.{js,cjs,jsx,tsx,ts}": [
@@ -0,0 +1,135 @@
+ /**
+ * Shared bootstrap sampling utilities for confidence interval computation.
+ *
+ * @remarks
+ * Bootstrap resampling provides robust confidence intervals without
+ * assuming a specific distribution. For small samples, it's more
+ * reliable than parametric methods.
+ *
+ * Environment variable configuration:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
+ *
+ * @packageDocumentation
+ */
+
+ /** Default number of bootstrap iterations */
+ export const DEFAULT_ITERATIONS = 1000
+
+ /** Default confidence level (95%) */
+ export const DEFAULT_CONFIDENCE_LEVEL = 0.95
+
+ /**
+ * Confidence interval as [lower, upper] bounds.
+ */
+ export type ConfidenceInterval = [number, number]
+
+ /**
+ * Bootstrap confidence interval result.
+ */
+ export type BootstrapResult = {
+ /** Median of bootstrap sample means (50th percentile) */
+ median: number
+ /** Confidence interval [lower, upper] */
+ ci: ConfidenceInterval
+ }
+
+ /**
+ * Configuration for bootstrap sampling.
+ */
+ export type BootstrapConfig = {
+ /** Number of bootstrap iterations (default: 1000) */
+ iterations?: number
+ /** Confidence level between 0 and 1 (default: 0.95) */
+ confidenceLevel?: number
+ }
+
+ /**
+ * Compute bootstrap confidence interval for sample mean.
+ *
+ * @remarks
+ * Bootstrap resampling provides robust confidence intervals without
+ * assuming a specific distribution. For small samples, it's more
+ * reliable than parametric methods.
+ *
+ * @param samples - Array of numeric samples
+ * @param config - Optional bootstrap configuration
+ * @returns Bootstrap median and confidence interval
+ *
+ * @public
+ */
+ export const bootstrap = (samples: number[], config?: BootstrapConfig): BootstrapResult => {
+ const iterations = config?.iterations ?? DEFAULT_ITERATIONS
+ const confidenceLevel = config?.confidenceLevel ?? DEFAULT_CONFIDENCE_LEVEL
+
+ if (samples.length === 0) {
+ return { median: 0, ci: [0, 0] }
+ }
+
+ if (samples.length === 1) {
+ const value = samples[0] ?? 0
+ return { median: value, ci: [value, value] }
+ }
+
+ const means: number[] = []
+
+ for (let i = 0; i < iterations; i++) {
+ // Resample with replacement - we know samples.length > 1 at this point
+ const resampled = Array.from(
+ { length: samples.length },
+ () => samples[Math.floor(Math.random() * samples.length)] as number,
+ )
+
+ // Compute mean of resampled data
+ const sum = resampled.reduce((acc, val) => acc + val, 0)
+ means.push(sum / resampled.length)
+ }
+
+ // Sort means for percentile calculation
+ means.sort((a, b) => a - b)
+
+ // Compute percentile indices based on confidence level
+ // For 95% CI: lower = 2.5th percentile, upper = 97.5th percentile
+ const alpha = (1 - confidenceLevel) / 2
+ const lowerIdx = Math.floor(iterations * alpha)
+ const upperIdx = Math.floor(iterations * (1 - alpha))
+
+ return {
+ median: means[Math.floor(iterations / 2)] ?? 0,
+ ci: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
+ }
+ }
+
+ /**
+ * Format confidence interval as string.
+ *
+ * @param ci - Confidence interval [lower, upper]
+ * @param decimals - Number of decimal places (default: 3)
+ * @returns Formatted CI string or empty string if undefined
+ *
+ * @public
+ */
+ export const formatCI = (ci: ConfidenceInterval | undefined, decimals: number = 3): string => {
+ if (!ci) return ''
+ return `[${ci[0].toFixed(decimals)}, ${ci[1].toFixed(decimals)}]`
+ }
+
+ /**
+ * Get bootstrap configuration from environment variables.
+ *
+ * @remarks
+ * Reads configuration from:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS`: Number of iterations (min: 100)
+ *
+ * @returns Bootstrap configuration
+ *
+ * @public
+ */
+ export const getBootstrapConfigFromEnv = (): BootstrapConfig => {
+ const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+ if (!envValue) return { iterations: DEFAULT_ITERATIONS }
+
+ const parsed = Number.parseInt(envValue, 10)
+ const iterations = Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+
+ return { iterations }
+ }
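For orientation, a minimal usage sketch of the new shared module (the import path mirrors the `./bootstrap.ts` imports in the grader hunks below; return values follow the code above, and the resampled CI bounds vary run to run):

```ts
import { bootstrap, formatCI, getBootstrapConfigFromEnv } from './bootstrap.ts'

// Degenerate inputs short-circuit before any resampling
bootstrap([])     // => { median: 0, ci: [0, 0] }
bootstrap([0.75]) // => { median: 0.75, ci: [0.75, 0.75] }

// Larger samples are resampled with replacement; the CI bounds are
// percentiles of the sorted bootstrap means, so ci[0] <= median <= ci[1]
const { median, ci } = bootstrap([0.5, 0.6, 0.7, 0.8], { iterations: 2000 })

// formatCI renders the fixed-decimal "[lower, upper]" cell used in reports
formatCI(ci)        // e.g. "[0.525, 0.775]"
formatCI(undefined) // => "" (runs without CIs get an empty cell)

// Env override: non-numeric values or anything below 100 falls back to 1000
process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'
getBootstrapConfigFromEnv() // => { iterations: 5000 }
```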
@@ -12,81 +12,7 @@
  */

  import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
-
- /** Default number of bootstrap iterations */
- const DEFAULT_ITERATIONS = 1000
-
- /**
- * Bootstrap confidence interval result.
- */
- type BootstrapResult = {
- /** Estimated mean from bootstrap */
- mean: number
- /** 95% confidence interval [lower, upper] */
- ci95: [number, number]
- }
-
- /**
- * Compute bootstrap confidence interval for sample mean.
- *
- * @remarks
- * Bootstrap resampling provides robust confidence intervals without
- * assuming a specific distribution. For small samples, it's more
- * reliable than parametric methods.
- *
- * @param samples - Array of numeric samples
- * @param iterations - Number of bootstrap iterations
- * @returns Bootstrap mean and 95% confidence interval
- */
- const bootstrap = (samples: number[], iterations: number = DEFAULT_ITERATIONS): BootstrapResult => {
- if (samples.length === 0) {
- return { mean: 0, ci95: [0, 0] }
- }
-
- if (samples.length === 1) {
- const value = samples[0] ?? 0
- return { mean: value, ci95: [value, value] }
- }
-
- const means: number[] = []
-
- for (let i = 0; i < iterations; i++) {
- // Resample with replacement - we know samples.length > 1 at this point
- const resampled = Array.from(
- { length: samples.length },
- () => samples[Math.floor(Math.random() * samples.length)] as number,
- )
-
- // Compute mean of resampled data
- const sum = resampled.reduce((acc, val) => acc + val, 0)
- means.push(sum / resampled.length)
- }
-
- // Sort means for percentile calculation
- means.sort((a, b) => a - b)
-
- // 95% CI: 2.5th and 97.5th percentiles
- const lowerIdx = Math.floor(iterations * 0.025)
- const upperIdx = Math.floor(iterations * 0.975)
-
- return {
- mean: means[Math.floor(iterations / 2)] ?? 0,
- ci95: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
- }
- }
-
- /**
- * Get bootstrap iterations from environment variable.
- *
- * @returns Number of bootstrap iterations
- */
- const getIterationsFromEnv = (): number => {
- const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
- if (!envValue) return DEFAULT_ITERATIONS
-
- const parsed = Number.parseInt(envValue, 10)
- return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
- }
+ import { bootstrap, getBootstrapConfigFromEnv } from './bootstrap.ts'

  /**
  * Statistical significance comparison grader.
@@ -107,7 +33,7 @@ const getIterationsFromEnv = (): number => {
  * @public
  */
  export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
- const iterations = getIterationsFromEnv()
+ const config = getBootstrapConfigFromEnv()

  // Collect scores for each run
  const runStats = Object.entries(runs).map(([label, run]) => {
@@ -116,13 +42,13 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):

  // For single-prompt comparison, we only have one sample
  // In practice, this grader is most useful when aggregating across prompts
- const stats = bootstrap([score], iterations)
+ const stats = bootstrap([score], config)

  return { label, score, stats }
  })

- // Sort by bootstrap mean descending
- const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
+ // Sort by bootstrap median descending
+ const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)

  // Check if winner is statistically significant
  // CIs don't overlap = significant difference (approximately p<0.05)
@@ -131,7 +57,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
  const second = sorted[1]
  if (first && second) {
  // Non-overlapping: first's lower bound > second's upper bound
- isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+ isSignificant = first.stats.ci[0] > second.stats.ci[1]
  }

  const reasoning = isSignificant
@@ -142,7 +68,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
  rankings: sorted.map((s, i) => ({
  run: s.label,
  rank: i + 1,
- score: s.stats.mean,
+ score: s.stats.median,
  })),
  reasoning,
  }
@@ -156,28 +82,30 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
  *
  * @public
  */
- export const createStatisticalGrader = (iterations: number = DEFAULT_ITERATIONS): ComparisonGrader => {
+ export const createStatisticalGrader = (iterations?: number): ComparisonGrader => {
+ const config = iterations ? { iterations } : getBootstrapConfigFromEnv()
+
  return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
  const runStats = Object.entries(runs).map(([label, run]) => {
  const score = run.score?.score ?? 0
- const stats = bootstrap([score], iterations)
+ const stats = bootstrap([score], config)
  return { label, score, stats }
  })

- const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
+ const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)

  let isSignificant = false
  const first = sorted[0]
  const second = sorted[1]
  if (first && second) {
- isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+ isSignificant = first.stats.ci[0] > second.stats.ci[1]
  }

  return {
  rankings: sorted.map((s, i) => ({
  run: s.label,
  rank: i + 1,
- score: s.stats.mean,
+ score: s.stats.median,
  })),
  reasoning: isSignificant
  ? `Winner "${first?.label}" is statistically significant (p<0.05)`
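Both `grade` and `createStatisticalGrader` decide significance with the same CI non-overlap test. A minimal sketch of that rule in isolation, using the shared module; as the hunk's own comment notes, single-sample runs yield degenerate intervals, so the check only becomes informative once a run aggregates many samples:

```ts
import { bootstrap } from './bootstrap.ts'

// One score per run: the CIs collapse to [score, score], so any two
// distinct scores register as "significant" under the non-overlap rule
const first = bootstrap([0.9])  // { median: 0.9, ci: [0.9, 0.9] }
const second = bootstrap([0.5]) // { median: 0.5, ci: [0.5, 0.5] }

// Non-overlapping: the leader's lower bound clears the runner-up's upper bound
const isSignificant = first.ci[0] > second.ci[1] // true (approximately p<0.05)
```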
@@ -0,0 +1,169 @@
+ /**
+ * Unit tests for bootstrap sampling utilities.
+ */
+
+ import { afterEach, describe, expect, test } from 'bun:test'
+ import { bootstrap, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from '../bootstrap.ts'
+
+ describe('bootstrap', () => {
+ describe('edge cases', () => {
+ test('returns {median: 0, ci: [0, 0]} for empty array', () => {
+ const result = bootstrap([])
+ expect(result.median).toBe(0)
+ expect(result.ci).toEqual([0, 0])
+ })
+
+ test('returns {median: value, ci: [value, value]} for single sample', () => {
+ const result = bootstrap([0.75])
+ expect(result.median).toBe(0.75)
+ expect(result.ci).toEqual([0.75, 0.75])
+ })
+
+ test('handles single sample of 0', () => {
+ const result = bootstrap([0])
+ expect(result.median).toBe(0)
+ expect(result.ci).toEqual([0, 0])
+ })
+
+ test('handles single sample of 1', () => {
+ const result = bootstrap([1])
+ expect(result.median).toBe(1)
+ expect(result.ci).toEqual([1, 1])
+ })
+ })
+
+ describe('confidence interval bounds', () => {
+ test('CI lower bound <= median <= CI upper bound', () => {
+ const samples = [0.5, 0.6, 0.7, 0.8, 0.9]
+ const result = bootstrap(samples, { iterations: 1000 })
+
+ expect(result.ci[0]).toBeLessThanOrEqual(result.median)
+ expect(result.median).toBeLessThanOrEqual(result.ci[1])
+ })
+
+ test('CI contains the true median for uniform samples', () => {
+ // For identical samples, CI should collapse to the value
+ const samples = [0.5, 0.5, 0.5, 0.5, 0.5]
+ const result = bootstrap(samples, { iterations: 1000 })
+
+ expect(result.median).toBeCloseTo(0.5, 2)
+ expect(result.ci[0]).toBeCloseTo(0.5, 2)
+ expect(result.ci[1]).toBeCloseTo(0.5, 2)
+ })
+
+ test('CI widens with more variance in samples', () => {
+ const lowVariance = [0.49, 0.5, 0.51]
+ const highVariance = [0.1, 0.5, 0.9]
+
+ const lowResult = bootstrap(lowVariance, { iterations: 1000 })
+ const highResult = bootstrap(highVariance, { iterations: 1000 })
+
+ const lowWidth = lowResult.ci[1] - lowResult.ci[0]
+ const highWidth = highResult.ci[1] - highResult.ci[0]
+
+ expect(highWidth).toBeGreaterThan(lowWidth)
+ })
+ })
+
+ describe('configuration', () => {
+ test('uses default iterations when not specified', () => {
+ // Just verify it runs without error with defaults
+ const result = bootstrap([0.5, 0.6, 0.7])
+ expect(result.median).toBeGreaterThan(0)
+ })
+
+ test('accepts custom iteration count', () => {
+ const result = bootstrap([0.5, 0.6, 0.7], { iterations: 100 })
+ expect(result.median).toBeGreaterThan(0)
+ })
+
+ test('accepts custom confidence level', () => {
+ const samples = [0.3, 0.4, 0.5, 0.6, 0.7]
+
+ // 90% CI should be narrower than 95% CI
+ const ci90 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.9 })
+ const ci95 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.95 })
+
+ const width90 = ci90.ci[1] - ci90.ci[0]
+ const width95 = ci95.ci[1] - ci95.ci[0]
+
+ // 95% CI should generally be wider than 90% CI
+ // Allow some tolerance due to randomness
+ expect(width95).toBeGreaterThanOrEqual(width90 * 0.8)
+ })
+ })
+
+ describe('statistical properties', () => {
+ test('median is close to sample mean', () => {
+ const samples = [0.2, 0.4, 0.6, 0.8, 1.0]
+ const sampleMean = samples.reduce((a, b) => a + b, 0) / samples.length
+
+ const result = bootstrap(samples, { iterations: 10000 })
+
+ // Bootstrap median should be close to sample mean for symmetric distributions
+ expect(result.median).toBeCloseTo(sampleMean, 1)
+ })
+
+ test('is deterministic-ish for large iteration counts', () => {
+ const samples = [0.3, 0.5, 0.7]
+
+ // With many iterations, results should be similar across runs
+ const result1 = bootstrap(samples, { iterations: 10000 })
+ const result2 = bootstrap(samples, { iterations: 10000 })
+
+ expect(result1.median).toBeCloseTo(result2.median, 1)
+ })
+ })
+ })
+
+ describe('getBootstrapConfigFromEnv', () => {
+ const originalEnv = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+
+ afterEach(() => {
+ if (originalEnv === undefined) {
+ delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
+ } else {
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = originalEnv
+ }
+ })
+
+ test('returns default iterations when env var not set', () => {
+ delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
+ const config = getBootstrapConfigFromEnv()
+ expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+ })
+
+ test('parses valid iteration count from env', () => {
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'
+ const config = getBootstrapConfigFromEnv()
+ expect(config.iterations).toBe(5000)
+ })
+
+ test('returns default for invalid (non-numeric) env value', () => {
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = 'invalid'
+ const config = getBootstrapConfigFromEnv()
+ expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+ })
+
+ test('returns default for iteration count below minimum (100)', () => {
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = '50'
+ const config = getBootstrapConfigFromEnv()
+ expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+ })
+
+ test('accepts iteration count at minimum (100)', () => {
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = '100'
+ const config = getBootstrapConfigFromEnv()
+ expect(config.iterations).toBe(100)
+ })
+ })
+
+ describe('constants', () => {
+ test('DEFAULT_ITERATIONS is 1000', () => {
+ expect(DEFAULT_ITERATIONS).toBe(1000)
+ })
+
+ test('DEFAULT_CONFIDENCE_LEVEL is 0.95', () => {
+ expect(DEFAULT_CONFIDENCE_LEVEL).toBe(0.95)
+ })
+ })
@@ -20,9 +20,7 @@ import type {
  TrialsComparisonGrader,
  TrialsComparisonGraderInput,
  } from '../pipeline/pipeline.types.ts'
-
- /** Default number of bootstrap iterations */
- const DEFAULT_ITERATIONS = 1000
+ import { DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from './bootstrap.ts'

  /**
  * Bootstrap confidence interval result.
@@ -82,16 +80,13 @@ const bootstrapPassAtK = (trials: number[], k: number, iterations: number): Boot
  }

  /**
- * Get bootstrap iterations from environment variable.
+ * Get bootstrap iterations from environment or use default.
  *
  * @returns Number of bootstrap iterations
  */
- const getIterationsFromEnv = (): number => {
- const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
- if (!envValue) return DEFAULT_ITERATIONS
-
- const parsed = Number.parseInt(envValue, 10)
- return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+ const getIterations = (): number => {
+ const config = getBootstrapConfigFromEnv()
+ return config.iterations ?? DEFAULT_ITERATIONS
  }

  /**
@@ -109,7 +104,7 @@ const getIterationsFromEnv = (): number => {
  export const grade: TrialsComparisonGrader = async ({
  runs,
  }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
- const iterations = getIterationsFromEnv()
+ const iterations = getIterations()

  // Collect pass/fail outcomes for each run
  const runStats = Object.entries(runs).map(([label, run]) => {
@@ -16,6 +16,7 @@
  */

  import { logProgress, writeOutput } from '../core.ts'
+ import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
  import { grade as statisticalGrade } from '../graders/trials-compare-statistical.ts'
  import { grade as weightedGrade } from '../graders/trials-compare-weighted.ts'
  import type {
@@ -194,13 +195,14 @@ const computeReliabilityMetrics = (results: TrialResult[]): TrialsReliabilityMet
  const passExpKValues = results.map((r) => r.passExpK ?? 0)

  if (passExpKValues.length === 0) {
- return { avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
+ return { type: 'trial', avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
  }

  const sorted = [...passExpKValues].sort((a, b) => a - b)
  const sum = passExpKValues.reduce((a, b) => a + b, 0)

  return {
+ type: 'trial',
  avgPassExpK: sum / passExpKValues.length,
  medianPassExpK: percentile(sorted, 0.5),
  p25PassExpK: percentile(sorted, 0.25),
@@ -407,6 +409,34 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
  flakiness[label] = computeFlakinessMetrics(results)
  }

+ // Compute confidence intervals when using statistical strategy
+ if (strategy === 'statistical') {
+ const bootstrapConfig = getBootstrapConfigFromEnv()
+
+ for (const label of runLabels) {
+ const resultsMap = runResults[label] ?? new Map()
+ const results = [...resultsMap.values()]
+ const passAtKValues = results.map((r) => r.passAtK ?? 0)
+ const passExpKValues = results.map((r) => r.passExpK ?? 0)
+
+ // Capability CIs
+ const capabilityMetrics = capability[label]
+ if (capabilityMetrics) {
+ capabilityMetrics.confidenceIntervals = {
+ avgPassAtK: bootstrap(passAtKValues, bootstrapConfig).ci,
+ }
+ }
+
+ // Reliability CIs
+ const reliabilityMetrics = reliability[label]
+ if (reliabilityMetrics) {
+ reliabilityMetrics.confidenceIntervals = {
+ avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
+ }
+ }
+ }
+ }
+
  // Compute pairwise comparisons
  const capabilityPairwise: PairwiseComparison[] = []
  const reliabilityPairwise: PairwiseComparison[] = []
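The statistical pass mutates the per-run metric records in place. A sketch of the capability entry it produces, with field names taken from the assignments above and the numbers purely hypothetical:

```ts
// Hypothetical capability entry after runTrialsCompare with
// strategy: 'statistical' — only confidenceIntervals is new here
const capabilityEntry = {
  avgPassAtK: 0.9,
  medianPassAtK: 0.9,
  p25PassAtK: 0.85,
  p75PassAtK: 0.95,
  confidenceIntervals: {
    // bootstrap CI over the per-prompt passAtK values (hypothetical bounds)
    avgPassAtK: [0.85, 0.95] as [number, number],
  },
}
```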
@@ -531,27 +561,52 @@ const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string =>
  lines.push(`Prompts: ${report.meta.promptCount} | Trials per prompt: ${report.meta.trialsPerPrompt}`)
  lines.push('')

+ // Check if any run has confidence intervals (statistical strategy was used)
+ const hasCIs = Object.values(report.capability).some((c) => c.confidenceIntervals)
+
  // Capability table
  lines.push('## Capability (passAtK)')
  lines.push('')
- lines.push('| Run | Avg | Median | P25 | P75 |')
- lines.push('|-----|-----|--------|-----|-----|')
- for (const [label, c] of Object.entries(report.capability)) {
- lines.push(
- `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
- )
+ if (hasCIs) {
+ lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
+ lines.push('|-----|-----|--------|--------|-----|-----|')
+ for (const [label, c] of Object.entries(report.capability)) {
+ const avgCI = formatCI(c.confidenceIntervals?.avgPassAtK)
+ lines.push(
+ `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${avgCI} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
+ )
+ }
+ } else {
+ lines.push('| Run | Avg | Median | P25 | P75 |')
+ lines.push('|-----|-----|--------|-----|-----|')
+ for (const [label, c] of Object.entries(report.capability)) {
+ lines.push(
+ `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
+ )
+ }
  }
  lines.push('')

  // Reliability table
  lines.push('## Reliability (passExpK)')
  lines.push('')
- lines.push('| Run | Avg | Median | P25 | P75 |')
- lines.push('|-----|-----|--------|-----|-----|')
- for (const [label, r] of Object.entries(report.reliability)) {
- lines.push(
- `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
- )
+ if (hasCIs) {
+ lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
+ lines.push('|-----|-----|--------|--------|-----|-----|')
+ for (const [label, r] of Object.entries(report.reliability)) {
+ const avgCI = formatCI(r.confidenceIntervals?.avgPassExpK)
+ lines.push(
+ `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${avgCI} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
+ )
+ }
+ } else {
+ lines.push('| Run | Avg | Median | P25 | P75 |')
+ lines.push('|-----|-----|--------|-----|-----|')
+ for (const [label, r] of Object.entries(report.reliability)) {
+ lines.push(
+ `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
+ )
+ }
  }
  lines.push('')
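`formatCI` output is deterministic, so the new table cells are easy to predict; a few concrete calls (the zero-decimal variant appears in the compare command's Performance table below):

```ts
import { formatCI } from '../graders/bootstrap.ts'

formatCI([0.82, 0.931])       // => "[0.820, 0.931]" (three decimals by default)
formatCI([1043.2, 1187.6], 0) // => "[1043, 1188]"   (latency columns)
formatCI(undefined)           // => "" (weighted-strategy runs have no CIs)
```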
@@ -25,6 +25,7 @@
  import { basename, extname } from 'node:path'
  import { parseArgs } from 'node:util'
  import { buildResultsIndex, logProgress, writeOutput } from '../core.ts'
+ import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
  import { grade as statisticalGrade } from '../graders/compare-statistical.ts'
  import { grade as weightedGrade } from '../graders/compare-weighted.ts'
  import type {
@@ -463,6 +464,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
  const completedCount = results.filter((r) => r.output && !r.errors?.length).length

  reliability[label] = {
+ type: 'run',
  toolErrors: toolErrorCount,
  toolErrorRate: results.length > 0 ? toolErrorCount / results.length : 0,
  timeouts: timeoutCount,
@@ -471,6 +473,36 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
  }
  }

+ // Compute confidence intervals when using statistical strategy
+ if (strategy === 'statistical') {
+ const bootstrapConfig = getBootstrapConfigFromEnv()
+
+ for (const label of runLabels) {
+ const resultsMap = runResults[label] ?? new Map()
+ const results = [...resultsMap.values()]
+ const scores = results.map((r) => r.score?.score ?? 0)
+ const passes = results.map((r) => (r.score?.pass === true ? 1 : 0))
+ const latencies = results.map((r) => r.timing?.total ?? 0)
+
+ // Quality CIs
+ const qualityMetrics = quality[label]
+ if (qualityMetrics) {
+ qualityMetrics.confidenceIntervals = {
+ avgScore: bootstrap(scores, bootstrapConfig).ci,
+ passRate: bootstrap(passes, bootstrapConfig).ci,
+ }
+ }
+
+ // Performance CIs
+ const performanceMetrics = performance[label]
+ if (performanceMetrics) {
+ performanceMetrics.confidenceIntervals = {
+ latencyMean: bootstrap(latencies, bootstrapConfig).ci,
+ }
+ }
+ }
+ }
+
  // Trajectory info
  const trajectoryInfo: Record<string, TrajectoryInfo> = {}
  for (const label of runLabels) {
@@ -586,27 +618,53 @@ const formatReportAsMarkdown = (report: ComparisonReport): string => {
  lines.push(`Prompts: ${report.meta.promptCount} total, ${report.meta.promptsWithAllRuns} with all runs`)
  lines.push('')

+ // Check if any run has confidence intervals (statistical strategy was used)
+ const hasCIs = Object.values(report.quality).some((q) => q.confidenceIntervals)
+
  // Quality table
  lines.push('## Quality')
  lines.push('')
- lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |')
- lines.push('|-----|-----------|-----------|------|------|')
- for (const [label, q] of Object.entries(report.quality)) {
- lines.push(
- `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`,
- )
+ if (hasCIs) {
+ lines.push('| Run | Avg Score | 95% CI | Pass Rate | 95% CI | Pass | Fail |')
+ lines.push('|-----|-----------|--------|-----------|--------|------|------|')
+ for (const [label, q] of Object.entries(report.quality)) {
+ const avgScoreCI = formatCI(q.confidenceIntervals?.avgScore)
+ const passRateCI = formatCI(q.confidenceIntervals?.passRate)
+ lines.push(
+ `| ${label} | ${q.avgScore.toFixed(3)} | ${avgScoreCI} | ${(q.passRate * 100).toFixed(1)}% | ${passRateCI} | ${q.passCount} | ${q.failCount} |`,
+ )
+ }
+ } else {
+ lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |')
+ lines.push('|-----|-----------|-----------|------|------|')
+ for (const [label, q] of Object.entries(report.quality)) {
+ lines.push(
+ `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`,
+ )
+ }
  }
  lines.push('')

  // Performance table
  lines.push('## Performance')
  lines.push('')
- lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |')
- lines.push('|-----|----------|----------|----------|-----------|')
- for (const [label, p] of Object.entries(report.performance)) {
- lines.push(
- `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`,
- )
+ if (hasCIs) {
+ lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI |')
+ lines.push('|-----|----------|----------|----------|-----------|--------|')
+ for (const [label, p] of Object.entries(report.performance)) {
+ const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0)
+ lines.push(
+ `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} |`,
+ )
+ }
+ } else {
+ lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |')
+ lines.push('|-----|----------|----------|----------|-----------|')
+ for (const [label, p] of Object.entries(report.performance)) {
+ lines.push(
+ `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`,
+ )
+ }
  }
  lines.push('')
@@ -0,0 +1,285 @@
+ /**
+ * Integration tests for compare command statistical strategy.
+ *
+ * @remarks
+ * Tests verify confidence interval computation for the statistical strategy
+ * in the compare command with CaptureResult format.
+ *
+ * @packageDocumentation
+ */
+
+ import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
+ import type { CaptureResult } from '../../schemas.ts'
+ import { runCompare } from '../compare.ts'
+
+ // ============================================================================
+ // Test Fixtures
+ // ============================================================================
+
+ const createCaptureResult = (id: string, score: number, pass: boolean, duration: number = 1000): CaptureResult => ({
+ id,
+ input: `Prompt for ${id}`,
+ output: `Output for ${id}`,
+ trajectory: [{ type: 'message', content: `Output for ${id}`, timestamp: Date.now() }],
+ metadata: {},
+ timing: {
+ start: Date.now(),
+ end: Date.now() + duration,
+ sessionCreation: 100,
+ total: duration,
+ },
+ toolErrors: false,
+ score: {
+ pass,
+ score,
+ reasoning: pass ? 'Passed' : 'Failed',
+ },
+ })
+
+ const tempDir = `${import.meta.dir}/.test-tmp/compare-statistical`
+
+ beforeAll(async () => {
+ await Bun.$`mkdir -p ${tempDir}`
+ })
+
+ afterAll(async () => {
+ await Bun.$`rm -rf ${tempDir}`
+ })
+
+ // ============================================================================
+ // Statistical Strategy CI Tests
+ // ============================================================================
+
+ describe('runCompare statistical strategy', () => {
+ test('computes confidence intervals for quality metrics', async () => {
+ const run1Path = `${tempDir}/ci-qual-run1.jsonl`
+ const run2Path = `${tempDir}/ci-qual-run2.jsonl`
+
+ // Create multiple prompts with varying scores for meaningful CI computation
+ const results1 = [
+ createCaptureResult('p1', 0.9, true, 1000),
+ createCaptureResult('p2', 0.85, true, 1100),
+ createCaptureResult('p3', 0.95, true, 900),
+ createCaptureResult('p4', 0.8, true, 1200),
+ ]
+ const results2 = [
+ createCaptureResult('p1', 0.6, false, 2000),
+ createCaptureResult('p2', 0.5, false, 2100),
+ createCaptureResult('p3', 0.7, true, 1900),
+ createCaptureResult('p4', 0.55, false, 2200),
+ ]
+
+ await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+ await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+ const report = await runCompare({
+ runs: [
+ { label: 'high', path: run1Path },
+ { label: 'low', path: run2Path },
+ ],
+ strategy: 'statistical',
+ progress: false,
+ })
+
+ // Verify confidence intervals are computed for quality
+ const highQuality = report.quality.high
+ expect(highQuality).toBeDefined()
+ expect(highQuality?.confidenceIntervals).toBeDefined()
+ expect(highQuality?.confidenceIntervals?.avgScore).toBeDefined()
+ expect(highQuality?.confidenceIntervals?.passRate).toBeDefined()
+
+ // avgScore CI should be a tuple [lower, upper]
+ const avgScoreCI = highQuality?.confidenceIntervals?.avgScore
+ expect(avgScoreCI).toHaveLength(2)
+ expect(avgScoreCI?.[0]).toBeLessThanOrEqual(avgScoreCI?.[1] ?? 0)
+
+ // CI should contain the average (within reasonable bounds)
+ expect(avgScoreCI?.[0]).toBeLessThanOrEqual(highQuality?.avgScore ?? 0)
+ expect(avgScoreCI?.[1]).toBeGreaterThanOrEqual(highQuality?.avgScore ?? 1)
+
+ // passRate CI should also be valid
+ const passRateCI = highQuality?.confidenceIntervals?.passRate
+ expect(passRateCI).toHaveLength(2)
+ expect(passRateCI?.[0]).toBeLessThanOrEqual(passRateCI?.[1] ?? 0)
+
+ // Verify reliability metrics include type discriminator
+ expect(report.reliability.high?.type).toBe('run')
+ expect(report.reliability.low?.type).toBe('run')
+ })
+
+ test('computes confidence intervals for performance metrics', async () => {
+ const run1Path = `${tempDir}/ci-perf-run1.jsonl`
+ const run2Path = `${tempDir}/ci-perf-run2.jsonl`
+
+ // Create results with varying latencies
+ const results1 = [
+ createCaptureResult('p1', 0.9, true, 1000),
+ createCaptureResult('p2', 0.85, true, 1100),
+ createCaptureResult('p3', 0.95, true, 900),
+ createCaptureResult('p4', 0.8, true, 1050),
+ ]
+ const results2 = [
+ createCaptureResult('p1', 0.7, true, 2000),
+ createCaptureResult('p2', 0.65, true, 2200),
+ createCaptureResult('p3', 0.75, true, 1800),
+ createCaptureResult('p4', 0.6, true, 2100),
+ ]
+
+ await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+ await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+ const report = await runCompare({
+ runs: [
+ { label: 'fast', path: run1Path },
+ { label: 'slow', path: run2Path },
+ ],
+ strategy: 'statistical',
+ progress: false,
+ })
+
+ // Verify confidence intervals are computed for performance
+ const fastPerf = report.performance.fast
+ expect(fastPerf).toBeDefined()
+ expect(fastPerf?.confidenceIntervals).toBeDefined()
+ expect(fastPerf?.confidenceIntervals?.latencyMean).toBeDefined()
+
+ // latencyMean CI should be a tuple [lower, upper]
+ const latencyCI = fastPerf?.confidenceIntervals?.latencyMean
+ expect(latencyCI).toHaveLength(2)
+ expect(latencyCI?.[0]).toBeLessThanOrEqual(latencyCI?.[1] ?? 0)
+
+ // Fast run should have lower latency CI than slow run
+ const slowPerf = report.performance.slow
+ const slowLatencyCI = slowPerf?.confidenceIntervals?.latencyMean
+ expect(latencyCI?.[1]).toBeLessThan(slowLatencyCI?.[0] ?? 0)
+ })
+
+ test('weighted strategy does not compute confidence intervals', async () => {
+ const run1Path = `${tempDir}/no-ci-run1.jsonl`
+ const run2Path = `${tempDir}/no-ci-run2.jsonl`
+
+ const results1 = [createCaptureResult('p1', 0.9, true), createCaptureResult('p2', 0.85, true)]
+ const results2 = [createCaptureResult('p1', 0.6, false), createCaptureResult('p2', 0.5, false)]
+
+ await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+ await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+ const report = await runCompare({
+ runs: [
+ { label: 'run1', path: run1Path },
+ { label: 'run2', path: run2Path },
+ ],
+ strategy: 'weighted', // Default strategy
+ progress: false,
+ })
+
+ // Confidence intervals should NOT be present for weighted strategy
+ const quality = report.quality.run1
+ expect(quality?.confidenceIntervals).toBeUndefined()
+
+ const perf = report.performance.run1
+ expect(perf?.confidenceIntervals).toBeUndefined()
+ })
+
+ test('statistical strategy includes CIs in markdown output', async () => {
+ const run1Path = `${tempDir}/ci-md-run1.jsonl`
+ const run2Path = `${tempDir}/ci-md-run2.jsonl`
+ const outputPath = `${tempDir}/ci-report.md`
+
+ const results1 = [createCaptureResult('p1', 0.9, true, 1000), createCaptureResult('p2', 0.85, true, 1100)]
+ const results2 = [createCaptureResult('p1', 0.6, false, 2000), createCaptureResult('p2', 0.5, false, 2100)]
+
+ await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+ await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+ await runCompare({
+ runs: [
+ { label: 'agent1', path: run1Path },
+ { label: 'agent2', path: run2Path },
+ ],
+ strategy: 'statistical',
+ outputPath,
+ format: 'markdown',
+ progress: false,
+ })
+
+ const content = await Bun.file(outputPath).text()
+
+ // Markdown should include 95% CI column headers
+ expect(content).toContain('95% CI')
+ // Should contain CI values in bracket format [lower, upper]
+ expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/)
+ })
+
+ test('handles single sample gracefully with degenerate CI', async () => {
+ const run1Path = `${tempDir}/single-run1.jsonl`
+ const run2Path = `${tempDir}/single-run2.jsonl`
+
+ // Single sample per run
+ const result1 = createCaptureResult('p1', 0.9, true)
+ const result2 = createCaptureResult('p1', 0.5, false)
+
+ await Bun.write(run1Path, JSON.stringify(result1))
+ await Bun.write(run2Path, JSON.stringify(result2))
+
+ const report = await runCompare({
+ runs: [
+ { label: 'single1', path: run1Path },
+ { label: 'single2', path: run2Path },
+ ],
+ strategy: 'statistical',
+ progress: false,
+ })
+
+ // Should still compute CIs (they will be degenerate for single sample)
+ const quality = report.quality.single1
+ expect(quality?.confidenceIntervals).toBeDefined()
+ expect(quality?.confidenceIntervals?.avgScore).toBeDefined()
+
+ // For single sample, CI should collapse to the value
+ const ci = quality?.confidenceIntervals?.avgScore
+ expect(ci?.[0]).toBeCloseTo(ci?.[1] ?? 0, 2)
+ expect(ci?.[0]).toBeCloseTo(quality?.avgScore ?? 0, 2)
+ })
+
+ test('JSON output includes confidence intervals structure', async () => {
+ const run1Path = `${tempDir}/json-ci-run1.jsonl`
+ const run2Path = `${tempDir}/json-ci-run2.jsonl`
+ const outputPath = `${tempDir}/ci-report.json`
+
+ const results1 = [
+ createCaptureResult('p1', 0.9, true),
+ createCaptureResult('p2', 0.85, true),
+ createCaptureResult('p3', 0.95, true),
+ ]
+ const results2 = [
+ createCaptureResult('p1', 0.6, false),
+ createCaptureResult('p2', 0.5, false),
+ createCaptureResult('p3', 0.7, true),
+ ]
+
+ await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+ await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+ await runCompare({
+ runs: [
+ { label: 'high', path: run1Path },
+ { label: 'low', path: run2Path },
+ ],
+ strategy: 'statistical',
+ outputPath,
+ format: 'json',
+ progress: false,
+ })
+
+ const content = await Bun.file(outputPath).text()
+ const parsed = JSON.parse(content)
+
+ // Verify JSON structure includes confidenceIntervals
+ expect(parsed.quality.high.confidenceIntervals).toBeDefined()
+ expect(parsed.quality.high.confidenceIntervals.avgScore).toBeInstanceOf(Array)
+ expect(parsed.quality.high.confidenceIntervals.avgScore.length).toBe(2)
+ expect(parsed.performance.high.confidenceIntervals).toBeDefined()
+ expect(parsed.performance.high.confidenceIntervals.latencyMean).toBeInstanceOf(Array)
+ })
+ })
@@ -108,6 +108,8 @@ describe('runTrialsCompare', () => {
  expect(report.meta.promptCount).toBe(2)
  expect(report.capability).toBeDefined()
  expect(report.reliability).toBeDefined()
+ expect(report.reliability.baseline?.type).toBe('trial')
+ expect(report.reliability.variant?.type).toBe('trial')
  expect(report.flakiness).toBeDefined()
  expect(report.headToHead.capability.length).toBeGreaterThan(0)

@@ -210,6 +212,147 @@ describe('runTrialsCompare', () => {
  expect(report.meta.runs).toEqual(['better', 'worse'])
  })

+ test('statistical strategy computes confidence intervals for capability metrics', async () => {
+ const run1Path = `${tempDir}/ci-cap-run1.jsonl`
+ const run2Path = `${tempDir}/ci-cap-run2.jsonl`
+
+ // Create multiple prompts for meaningful CI computation
+ const trials1 = [
+ createTrialResult('p1', 0.9, 0.8),
+ createTrialResult('p2', 0.85, 0.7),
+ createTrialResult('p3', 0.95, 0.9),
+ ]
+ const trials2 = [
+ createTrialResult('p1', 0.6, 0.4),
+ createTrialResult('p2', 0.5, 0.3),
+ createTrialResult('p3', 0.7, 0.5),
+ ]
+
+ await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+ await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+ const report = await runTrialsCompare({
+ runs: [
+ { label: 'high', path: run1Path },
+ { label: 'low', path: run2Path },
+ ],
+ strategy: 'statistical',
+ progress: false,
+ })
+
+ // Verify confidence intervals are computed for capability
+ const highCap = report.capability.high
+ expect(highCap).toBeDefined()
+ expect(highCap?.confidenceIntervals).toBeDefined()
+ expect(highCap?.confidenceIntervals?.avgPassAtK).toBeDefined()
+
+ // CI should be a tuple [lower, upper]
+ const ci = highCap?.confidenceIntervals?.avgPassAtK
+ expect(ci).toHaveLength(2)
+ expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0)
+
+ // CI should contain the average (within reasonable bounds)
+ expect(ci?.[0]).toBeLessThanOrEqual(highCap?.avgPassAtK ?? 0)
+ expect(ci?.[1]).toBeGreaterThanOrEqual(highCap?.avgPassAtK ?? 1)
+ })
+
+ test('statistical strategy computes confidence intervals for reliability metrics', async () => {
+ const run1Path = `${tempDir}/ci-rel-run1.jsonl`
+ const run2Path = `${tempDir}/ci-rel-run2.jsonl`
+
+ const trials1 = [
+ createTrialResult('p1', 0.9, 0.85),
+ createTrialResult('p2', 0.8, 0.75),
+ createTrialResult('p3', 0.85, 0.8),
+ ]
+ const trials2 = [
+ createTrialResult('p1', 0.7, 0.3),
+ createTrialResult('p2', 0.6, 0.2),
+ createTrialResult('p3', 0.65, 0.25),
+ ]
+
+ await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+ await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+ const report = await runTrialsCompare({
+ runs: [
+ { label: 'reliable', path: run1Path },
+ { label: 'flaky', path: run2Path },
+ ],
+ strategy: 'statistical',
+ progress: false,
+ })
+
+ // Verify confidence intervals are computed for reliability
+ const reliableRel = report.reliability.reliable
+ expect(reliableRel).toBeDefined()
+ expect(reliableRel?.type).toBe('trial')
+ expect(reliableRel?.confidenceIntervals).toBeDefined()
+ expect(reliableRel?.confidenceIntervals?.avgPassExpK).toBeDefined()
+
+ // CI should be a tuple [lower, upper]
+ const ci = reliableRel?.confidenceIntervals?.avgPassExpK
+ expect(ci).toHaveLength(2)
+ expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0)
+ })
+
+ test('weighted strategy does not compute confidence intervals', async () => {
+ const run1Path = `${tempDir}/no-ci-run1.jsonl`
+ const run2Path = `${tempDir}/no-ci-run2.jsonl`
+
+ const trial1 = createTrialResult('test-001', 0.9, 0.7)
+ const trial2 = createTrialResult('test-001', 0.5, 0.3)
+
+ await Bun.write(run1Path, JSON.stringify(trial1))
+ await Bun.write(run2Path, JSON.stringify(trial2))
+
+ const report = await runTrialsCompare({
+ runs: [
+ { label: 'run1', path: run1Path },
+ { label: 'run2', path: run2Path },
+ ],
+ strategy: 'weighted', // Default strategy
+ progress: false,
+ })
+
+ // Confidence intervals should NOT be present for weighted strategy
+ const cap = report.capability.run1
+ expect(cap?.confidenceIntervals).toBeUndefined()
+
+ const rel = report.reliability.run1
+ expect(rel?.confidenceIntervals).toBeUndefined()
+ })
+
+ test('statistical strategy includes CIs in markdown output', async () => {
+ const run1Path = `${tempDir}/ci-md-run1.jsonl`
+ const run2Path = `${tempDir}/ci-md-run2.jsonl`
+ const outputPath = `${tempDir}/ci-report.md`
+
+ const trials1 = [createTrialResult('p1', 0.9, 0.8), createTrialResult('p2', 0.85, 0.75)]
+ const trials2 = [createTrialResult('p1', 0.6, 0.4), createTrialResult('p2', 0.5, 0.3)]
+
+ await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+ await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+ await runTrialsCompare({
+ runs: [
+ { label: 'agent1', path: run1Path },
+ { label: 'agent2', path: run2Path },
+ ],
+ strategy: 'statistical',
+ outputPath,
+ format: 'markdown',
+ progress: false,
+ })
+
+ const content = await Bun.file(outputPath).text()
+
+ // Markdown should include 95% CI column headers
+ expect(content).toContain('95% CI')
+ // Should contain CI values in bracket format [lower, upper]
+ expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/)
+ })
+
  test('computes correct capability metrics', async () => {
  const run1Path = `${tempDir}/cap-run1.jsonl`
@@ -573,6 +573,17 @@ export type ValidationResult = z.infer<typeof ValidationResultSchema>
  // Comparison Report Schemas
  // ============================================================================

+ /**
+ * Confidence interval schema as [lower, upper] bounds.
+ *
+ * @remarks
+ * Used for bootstrap-computed confidence intervals when strategy=statistical.
+ */
+ export const ConfidenceIntervalSchema = z.tuple([z.number(), z.number()])
+
+ /** Confidence interval type */
+ export type ConfidenceInterval = z.infer<typeof ConfidenceIntervalSchema>
+
  /**
  * Score distribution histogram for quality analysis.
  *
@@ -590,6 +601,19 @@ export const ScoreDistributionSchema = z.object({
  /** Score distribution type */
  export type ScoreDistribution = z.infer<typeof ScoreDistributionSchema>

+ /**
+ * Confidence intervals for quality metrics.
+ */
+ export const QualityConfidenceIntervalsSchema = z.object({
+ /** CI for avgScore */
+ avgScore: ConfidenceIntervalSchema.optional(),
+ /** CI for passRate */
+ passRate: ConfidenceIntervalSchema.optional(),
+ })
+
+ /** Quality confidence intervals type */
+ export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceIntervalsSchema>
+
  /**
  * Quality metrics for a single run in comparison.
  */
@@ -604,6 +628,8 @@ export const QualityMetricsSchema = z.object({
  failCount: z.number(),
  /** Score distribution histogram */
  scoreDistribution: ScoreDistributionSchema,
+ /** Confidence intervals (only with strategy=statistical) */
+ confidenceIntervals: QualityConfidenceIntervalsSchema.optional(),
  })

  /** Quality metrics type */
@@ -630,6 +656,17 @@ export const LatencyStatsSchema = z.object({
  /** Latency stats type */
  export type LatencyStats = z.infer<typeof LatencyStatsSchema>

+ /**
+ * Confidence intervals for performance metrics.
+ */
+ export const PerformanceConfidenceIntervalsSchema = z.object({
+ /** CI for latency mean */
+ latencyMean: ConfidenceIntervalSchema.optional(),
+ })
+
+ /** Performance confidence intervals type */
+ export type PerformanceConfidenceIntervals = z.infer<typeof PerformanceConfidenceIntervalsSchema>
+
  /**
  * Performance metrics for a single run in comparison.
  */
@@ -640,6 +677,8 @@ export const PerformanceMetricsSchema = z.object({
  firstResponse: LatencyStatsSchema.optional(),
  /** Sum of all run durations in milliseconds */
  totalDuration: z.number(),
+ /** Confidence intervals (only with strategy=statistical) */
+ confidenceIntervals: PerformanceConfidenceIntervalsSchema.optional(),
  })

  /** Performance metrics type */
@@ -649,6 +688,8 @@ export type PerformanceMetrics = z.infer<typeof PerformanceMetricsSchema>
  * Reliability metrics for a single run in comparison.
  */
  export const ReliabilityMetricsSchema = z.object({
+ /** Discriminator for run-based reliability metrics */
+ type: z.literal('run'),
  /** Count of runs with toolErrors=true */
  toolErrors: z.number(),
  /** Percentage of runs with tool errors */
@@ -782,6 +823,17 @@ export type ComparisonReport = z.infer<typeof ComparisonReportSchema>
  // Trials Comparison Report Schemas
  // ============================================================================

+ /**
+ * Confidence intervals for trials capability metrics.
+ */
+ export const TrialsCapabilityConfidenceIntervalsSchema = z.object({
+ /** CI for avgPassAtK */
+ avgPassAtK: ConfidenceIntervalSchema.optional(),
+ })
+
+ /** Trials capability confidence intervals type */
+ export type TrialsCapabilityConfidenceIntervals = z.infer<typeof TrialsCapabilityConfidenceIntervalsSchema>
+
  /**
  * Capability metrics for trials comparison (passAtK-based).
  *
@@ -798,11 +850,24 @@ export const TrialsCapabilityMetricsSchema = z.object({
  p25PassAtK: z.number(),
  /** 75th percentile passAtK */
  p75PassAtK: z.number(),
+ /** Confidence intervals (only with strategy=statistical) */
+ confidenceIntervals: TrialsCapabilityConfidenceIntervalsSchema.optional(),
  })

  /** Trials capability metrics type */
  export type TrialsCapabilityMetrics = z.infer<typeof TrialsCapabilityMetricsSchema>

+ /**
+ * Confidence intervals for trials reliability metrics.
+ */
+ export const TrialsReliabilityConfidenceIntervalsSchema = z.object({
+ /** CI for avgPassExpK */
+ avgPassExpK: ConfidenceIntervalSchema.optional(),
+ })
+
+ /** Trials reliability confidence intervals type */
+ export type TrialsReliabilityConfidenceIntervals = z.infer<typeof TrialsReliabilityConfidenceIntervalsSchema>
+
  /**
  * Reliability metrics for trials comparison (passExpK-based).
  *
@@ -811,6 +876,8 @@ export type TrialsCapabilityMetrics = z.infer<typeof TrialsCapabilityMetricsSche
  * Higher passExpK means the agent reliably solves the task every time.
  */
  export const TrialsReliabilityMetricsSchema = z.object({
+ /** Discriminator for trial-based reliability metrics */
+ type: z.literal('trial'),
  /** Average passExpK across all prompts */
  avgPassExpK: z.number(),
  /** Median passExpK */
@@ -819,6 +886,8 @@ export const TrialsReliabilityMetricsSchema = z.object({
  p25PassExpK: z.number(),
  /** 75th percentile passExpK */
  p75PassExpK: z.number(),
+ /** Confidence intervals (only with strategy=statistical) */
+ confidenceIntervals: TrialsReliabilityConfidenceIntervalsSchema.optional(),
  })

  /** Trials reliability metrics type */
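A short validation sketch against the new schemas (the import path is assumed; `z.tuple` rejects anything that is not exactly a two-number array, and both CI fields stay optional so weighted-strategy reports still parse):

```ts
import { ConfidenceIntervalSchema, QualityConfidenceIntervalsSchema } from './schemas.ts'

ConfidenceIntervalSchema.parse([0.81, 0.94])       // ok: [lower, upper] tuple
ConfidenceIntervalSchema.safeParse([0.81]).success // => false (wrong arity)

QualityConfidenceIntervalsSchema.parse({})                         // ok: both fields optional
QualityConfidenceIntervalsSchema.parse({ avgScore: [0.81, 0.94] }) // ok
```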