@plaited/agent-eval-harness 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@plaited/agent-eval-harness",
- "version": "0.8.0",
+ "version": "0.9.0",
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
  "license": "ISC",
  "engines": {
@@ -42,8 +42,10 @@
  "check:types": "tsc --noEmit",
  "check:write": "biome check --write && format-package --write",
  "prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
- "test": "bun test ./**/tests/*.spec.ts",
- "test:integration": "bun test ./**/integration_tests/*.spec.ts"
+ "test": "bun run test:bin && bun test:src",
+ "test:bin": "bun test bin/tests/*.spec.ts",
+ "test:integration": "bun test ./**/integration_tests/*.spec.ts",
+ "test:src": "bun test src/**/tests/*.spec.ts"
  },
  "lint-staged": {
  "*.{js,cjs,jsx,tsx,ts}": [
@@ -0,0 +1,135 @@
+ /**
+ * Shared bootstrap sampling utilities for confidence interval computation.
+ *
+ * @remarks
+ * Bootstrap resampling provides robust confidence intervals without
+ * assuming a specific distribution. For small samples, it's more
+ * reliable than parametric methods.
+ *
+ * Environment variable configuration:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
+ *
+ * @packageDocumentation
+ */
+
+ /** Default number of bootstrap iterations */
+ export const DEFAULT_ITERATIONS = 1000
+
+ /** Default confidence level (95%) */
+ export const DEFAULT_CONFIDENCE_LEVEL = 0.95
+
+ /**
+ * Confidence interval as [lower, upper] bounds.
+ */
+ export type ConfidenceInterval = [number, number]
+
+ /**
+ * Bootstrap confidence interval result.
+ */
+ export type BootstrapResult = {
+ /** Median of bootstrap sample means (50th percentile) */
+ median: number
+ /** Confidence interval [lower, upper] */
+ ci: ConfidenceInterval
+ }
+
+ /**
+ * Configuration for bootstrap sampling.
+ */
+ export type BootstrapConfig = {
+ /** Number of bootstrap iterations (default: 1000) */
+ iterations?: number
+ /** Confidence level between 0 and 1 (default: 0.95) */
+ confidenceLevel?: number
+ }
+
+ /**
+ * Compute bootstrap confidence interval for sample mean.
+ *
+ * @remarks
+ * Bootstrap resampling provides robust confidence intervals without
+ * assuming a specific distribution. For small samples, it's more
+ * reliable than parametric methods.
+ *
+ * @param samples - Array of numeric samples
+ * @param config - Optional bootstrap configuration
+ * @returns Bootstrap median and confidence interval
+ *
+ * @public
+ */
+ export const bootstrap = (samples: number[], config?: BootstrapConfig): BootstrapResult => {
+ const iterations = config?.iterations ?? DEFAULT_ITERATIONS
+ const confidenceLevel = config?.confidenceLevel ?? DEFAULT_CONFIDENCE_LEVEL
+
+ if (samples.length === 0) {
+ return { median: 0, ci: [0, 0] }
+ }
+
+ if (samples.length === 1) {
+ const value = samples[0] ?? 0
+ return { median: value, ci: [value, value] }
+ }
+
+ const means: number[] = []
+
+ for (let i = 0; i < iterations; i++) {
+ // Resample with replacement - we know samples.length > 1 at this point
+ const resampled = Array.from(
+ { length: samples.length },
+ () => samples[Math.floor(Math.random() * samples.length)] as number,
+ )
+
+ // Compute mean of resampled data
+ const sum = resampled.reduce((acc, val) => acc + val, 0)
+ means.push(sum / resampled.length)
+ }
+
+ // Sort means for percentile calculation
+ means.sort((a, b) => a - b)
+
+ // Compute percentile indices based on confidence level
+ // For 95% CI: lower = 2.5th percentile, upper = 97.5th percentile
+ const alpha = (1 - confidenceLevel) / 2
+ const lowerIdx = Math.floor(iterations * alpha)
+ const upperIdx = Math.floor(iterations * (1 - alpha))
+
+ return {
+ median: means[Math.floor(iterations / 2)] ?? 0,
+ ci: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
+ }
+ }
+
+ /**
+ * Format confidence interval as string.
+ *
+ * @param ci - Confidence interval [lower, upper]
+ * @param decimals - Number of decimal places (default: 3)
+ * @returns Formatted CI string or empty string if undefined
+ *
+ * @public
+ */
+ export const formatCI = (ci: ConfidenceInterval | undefined, decimals: number = 3): string => {
+ if (!ci) return ''
+ return `[${ci[0].toFixed(decimals)}, ${ci[1].toFixed(decimals)}]`
+ }
+
+ /**
+ * Get bootstrap configuration from environment variables.
+ *
+ * @remarks
+ * Reads configuration from:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS`: Number of iterations (min: 100)
+ *
+ * @returns Bootstrap configuration
+ *
+ * @public
+ */
+ export const getBootstrapConfigFromEnv = (): BootstrapConfig => {
+ const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+ if (!envValue) return { iterations: DEFAULT_ITERATIONS }
+
+ const parsed = Number.parseInt(envValue, 10)
+ const iterations = Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+
+ return { iterations }
+ }
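For orientation, a minimal usage sketch of the new shared module (the import path mirrors the `./bootstrap.ts` imports in the grader hunks below; return values follow the code above, and the resampled CI bounds vary run to run):

```ts
import { bootstrap, formatCI, getBootstrapConfigFromEnv } from './bootstrap.ts'

// Degenerate inputs short-circuit before any resampling
bootstrap([])     // => { median: 0, ci: [0, 0] }
bootstrap([0.75]) // => { median: 0.75, ci: [0.75, 0.75] }

// Larger samples are resampled with replacement; the CI bounds are
// percentiles of the sorted bootstrap means, so ci[0] <= median <= ci[1]
const { median, ci } = bootstrap([0.5, 0.6, 0.7, 0.8], { iterations: 2000 })

// formatCI renders the fixed-decimal "[lower, upper]" cell used in reports
formatCI(ci)        // e.g. "[0.525, 0.775]"
formatCI(undefined) // => "" (runs without CIs get an empty cell)

// Env override: non-numeric values or anything below 100 falls back to 1000
process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'
getBootstrapConfigFromEnv() // => { iterations: 5000 }
```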
@@ -12,81 +12,7 @@
  */

  import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
-
- /** Default number of bootstrap iterations */
- const DEFAULT_ITERATIONS = 1000
-
- /**
- * Bootstrap confidence interval result.
- */
- type BootstrapResult = {
- /** Estimated mean from bootstrap */
- mean: number
- /** 95% confidence interval [lower, upper] */
- ci95: [number, number]
- }
-
- /**
- * Compute bootstrap confidence interval for sample mean.
- *
- * @remarks
- * Bootstrap resampling provides robust confidence intervals without
- * assuming a specific distribution. For small samples, it's more
- * reliable than parametric methods.
- *
- * @param samples - Array of numeric samples
- * @param iterations - Number of bootstrap iterations
- * @returns Bootstrap mean and 95% confidence interval
- */
- const bootstrap = (samples: number[], iterations: number = DEFAULT_ITERATIONS): BootstrapResult => {
- if (samples.length === 0) {
- return { mean: 0, ci95: [0, 0] }
- }
-
- if (samples.length === 1) {
- const value = samples[0] ?? 0
- return { mean: value, ci95: [value, value] }
- }
-
- const means: number[] = []
-
- for (let i = 0; i < iterations; i++) {
- // Resample with replacement - we know samples.length > 1 at this point
- const resampled = Array.from(
- { length: samples.length },
- () => samples[Math.floor(Math.random() * samples.length)] as number,
- )
-
- // Compute mean of resampled data
- const sum = resampled.reduce((acc, val) => acc + val, 0)
- means.push(sum / resampled.length)
- }
-
- // Sort means for percentile calculation
- means.sort((a, b) => a - b)
-
- // 95% CI: 2.5th and 97.5th percentiles
- const lowerIdx = Math.floor(iterations * 0.025)
- const upperIdx = Math.floor(iterations * 0.975)
-
- return {
- mean: means[Math.floor(iterations / 2)] ?? 0,
- ci95: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
- }
- }
-
- /**
- * Get bootstrap iterations from environment variable.
- *
- * @returns Number of bootstrap iterations
- */
- const getIterationsFromEnv = (): number => {
- const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
- if (!envValue) return DEFAULT_ITERATIONS
-
- const parsed = Number.parseInt(envValue, 10)
- return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
- }
+ import { bootstrap, getBootstrapConfigFromEnv } from './bootstrap.ts'

  /**
  * Statistical significance comparison grader.
@@ -107,7 +33,7 @@ const getIterationsFromEnv = (): number => {
  * @public
  */
  export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
- const iterations = getIterationsFromEnv()
+ const config = getBootstrapConfigFromEnv()

  // Collect scores for each run
  const runStats = Object.entries(runs).map(([label, run]) => {
@@ -116,13 +42,13 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):

  // For single-prompt comparison, we only have one sample
  // In practice, this grader is most useful when aggregating across prompts
- const stats = bootstrap([score], iterations)
+ const stats = bootstrap([score], config)

  return { label, score, stats }
  })

- // Sort by bootstrap mean descending
- const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
+ // Sort by bootstrap median descending
+ const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)

  // Check if winner is statistically significant
  // CIs don't overlap = significant difference (approximately p<0.05)
@@ -131,7 +57,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
  const second = sorted[1]
  if (first && second) {
  // Non-overlapping: first's lower bound > second's upper bound
- isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+ isSignificant = first.stats.ci[0] > second.stats.ci[1]
  }

  const reasoning = isSignificant
@@ -142,7 +68,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
  rankings: sorted.map((s, i) => ({
  run: s.label,
  rank: i + 1,
- score: s.stats.mean,
+ score: s.stats.median,
  })),
  reasoning,
  }
@@ -156,28 +82,30 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
  *
  * @public
  */
- export const createStatisticalGrader = (iterations: number = DEFAULT_ITERATIONS): ComparisonGrader => {
+ export const createStatisticalGrader = (iterations?: number): ComparisonGrader => {
+ const config = iterations ? { iterations } : getBootstrapConfigFromEnv()
+
  return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
  const runStats = Object.entries(runs).map(([label, run]) => {
  const score = run.score?.score ?? 0
- const stats = bootstrap([score], iterations)
+ const stats = bootstrap([score], config)
  return { label, score, stats }
  })

- const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
+ const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)

  let isSignificant = false
  const first = sorted[0]
  const second = sorted[1]
  if (first && second) {
- isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+ isSignificant = first.stats.ci[0] > second.stats.ci[1]
  }

  return {
  rankings: sorted.map((s, i) => ({
  run: s.label,
  rank: i + 1,
- score: s.stats.mean,
+ score: s.stats.median,
  })),
  reasoning: isSignificant
  ? `Winner "${first?.label}" is statistically significant (p<0.05)`
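Both `grade` and `createStatisticalGrader` decide significance with the same CI non-overlap test. A minimal sketch of that rule in isolation, using the shared module; as the hunk's own comment notes, single-sample runs yield degenerate intervals, so the check only becomes informative once a run aggregates many samples:

```ts
import { bootstrap } from './bootstrap.ts'

// One score per run: the CIs collapse to [score, score], so any two
// distinct scores register as "significant" under the non-overlap rule
const first = bootstrap([0.9])  // { median: 0.9, ci: [0.9, 0.9] }
const second = bootstrap([0.5]) // { median: 0.5, ci: [0.5, 0.5] }

// Non-overlapping: the leader's lower bound clears the runner-up's upper bound
const isSignificant = first.ci[0] > second.ci[1] // true (approximately p<0.05)
```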
@@ -0,0 +1,169 @@
+ /**
+ * Unit tests for bootstrap sampling utilities.
+ */
+
+ import { afterEach, describe, expect, test } from 'bun:test'
+ import { bootstrap, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from '../bootstrap.ts'
+
+ describe('bootstrap', () => {
+ describe('edge cases', () => {
+ test('returns {median: 0, ci: [0, 0]} for empty array', () => {
+ const result = bootstrap([])
+ expect(result.median).toBe(0)
+ expect(result.ci).toEqual([0, 0])
+ })
+
+ test('returns {median: value, ci: [value, value]} for single sample', () => {
+ const result = bootstrap([0.75])
+ expect(result.median).toBe(0.75)
+ expect(result.ci).toEqual([0.75, 0.75])
+ })
+
+ test('handles single sample of 0', () => {
+ const result = bootstrap([0])
+ expect(result.median).toBe(0)
+ expect(result.ci).toEqual([0, 0])
+ })
+
+ test('handles single sample of 1', () => {
+ const result = bootstrap([1])
+ expect(result.median).toBe(1)
+ expect(result.ci).toEqual([1, 1])
+ })
+ })
+
+ describe('confidence interval bounds', () => {
+ test('CI lower bound <= median <= CI upper bound', () => {
+ const samples = [0.5, 0.6, 0.7, 0.8, 0.9]
+ const result = bootstrap(samples, { iterations: 1000 })
+
+ expect(result.ci[0]).toBeLessThanOrEqual(result.median)
+ expect(result.median).toBeLessThanOrEqual(result.ci[1])
+ })
+
+ test('CI contains the true median for uniform samples', () => {
+ // For identical samples, CI should collapse to the value
+ const samples = [0.5, 0.5, 0.5, 0.5, 0.5]
+ const result = bootstrap(samples, { iterations: 1000 })
+
+ expect(result.median).toBeCloseTo(0.5, 2)
+ expect(result.ci[0]).toBeCloseTo(0.5, 2)
+ expect(result.ci[1]).toBeCloseTo(0.5, 2)
+ })
+
+ test('CI widens with more variance in samples', () => {
+ const lowVariance = [0.49, 0.5, 0.51]
+ const highVariance = [0.1, 0.5, 0.9]
+
+ const lowResult = bootstrap(lowVariance, { iterations: 1000 })
+ const highResult = bootstrap(highVariance, { iterations: 1000 })
+
+ const lowWidth = lowResult.ci[1] - lowResult.ci[0]
+ const highWidth = highResult.ci[1] - highResult.ci[0]
+
+ expect(highWidth).toBeGreaterThan(lowWidth)
+ })
+ })
+
+ describe('configuration', () => {
+ test('uses default iterations when not specified', () => {
+ // Just verify it runs without error with defaults
+ const result = bootstrap([0.5, 0.6, 0.7])
+ expect(result.median).toBeGreaterThan(0)
+ })
+
+ test('accepts custom iteration count', () => {
+ const result = bootstrap([0.5, 0.6, 0.7], { iterations: 100 })
+ expect(result.median).toBeGreaterThan(0)
+ })
+
+ test('accepts custom confidence level', () => {
+ const samples = [0.3, 0.4, 0.5, 0.6, 0.7]
+
+ // 90% CI should be narrower than 95% CI
+ const ci90 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.9 })
+ const ci95 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.95 })
+
+ const width90 = ci90.ci[1] - ci90.ci[0]
+ const width95 = ci95.ci[1] - ci95.ci[0]
+
+ // 95% CI should generally be wider than 90% CI
+ // Allow some tolerance due to randomness
+ expect(width95).toBeGreaterThanOrEqual(width90 * 0.8)
+ })
+ })
+
+ describe('statistical properties', () => {
+ test('median is close to sample mean', () => {
+ const samples = [0.2, 0.4, 0.6, 0.8, 1.0]
+ const sampleMean = samples.reduce((a, b) => a + b, 0) / samples.length
+
+ const result = bootstrap(samples, { iterations: 10000 })
+
+ // Bootstrap median should be close to sample mean for symmetric distributions
+ expect(result.median).toBeCloseTo(sampleMean, 1)
+ })
+
+ test('is deterministic-ish for large iteration counts', () => {
+ const samples = [0.3, 0.5, 0.7]
+
+ // With many iterations, results should be similar across runs
+ const result1 = bootstrap(samples, { iterations: 10000 })
+ const result2 = bootstrap(samples, { iterations: 10000 })
+
+ expect(result1.median).toBeCloseTo(result2.median, 1)
+ })
+ })
+ })
+
+ describe('getBootstrapConfigFromEnv', () => {
+ const originalEnv = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+
+ afterEach(() => {
+ if (originalEnv === undefined) {
+ delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
+ } else {
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = originalEnv
+ }
+ })
+
+ test('returns default iterations when env var not set', () => {
+ delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
+ const config = getBootstrapConfigFromEnv()
+ expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+ })
+
+ test('parses valid iteration count from env', () => {
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'
+ const config = getBootstrapConfigFromEnv()
+ expect(config.iterations).toBe(5000)
+ })
+
+ test('returns default for invalid (non-numeric) env value', () => {
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = 'invalid'
+ const config = getBootstrapConfigFromEnv()
+ expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+ })
+
+ test('returns default for iteration count below minimum (100)', () => {
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = '50'
+ const config = getBootstrapConfigFromEnv()
+ expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+ })
+
+ test('accepts iteration count at minimum (100)', () => {
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = '100'
+ const config = getBootstrapConfigFromEnv()
+ expect(config.iterations).toBe(100)
+ })
+ })
+
+ describe('constants', () => {
+ test('DEFAULT_ITERATIONS is 1000', () => {
+ expect(DEFAULT_ITERATIONS).toBe(1000)
+ })
+
+ test('DEFAULT_CONFIDENCE_LEVEL is 0.95', () => {
+ expect(DEFAULT_CONFIDENCE_LEVEL).toBe(0.95)
+ })
+ })
@@ -20,9 +20,7 @@ import type {
  TrialsComparisonGrader,
  TrialsComparisonGraderInput,
  } from '../pipeline/pipeline.types.ts'
-
- /** Default number of bootstrap iterations */
- const DEFAULT_ITERATIONS = 1000
+ import { DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from './bootstrap.ts'

  /**
  * Bootstrap confidence interval result.
@@ -82,16 +80,13 @@ const bootstrapPassAtK = (trials: number[], k: number, iterations: number): Boot
  }

  /**
- * Get bootstrap iterations from environment variable.
+ * Get bootstrap iterations from environment or use default.
  *
  * @returns Number of bootstrap iterations
  */
- const getIterationsFromEnv = (): number => {
- const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
- if (!envValue) return DEFAULT_ITERATIONS
-
- const parsed = Number.parseInt(envValue, 10)
- return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+ const getIterations = (): number => {
+ const config = getBootstrapConfigFromEnv()
+ return config.iterations ?? DEFAULT_ITERATIONS
  }

  /**
@@ -109,7 +104,7 @@ const getIterationsFromEnv = (): number => {
  export const grade: TrialsComparisonGrader = async ({
  runs,
  }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
- const iterations = getIterationsFromEnv()
+ const iterations = getIterations()

  // Collect pass/fail outcomes for each run
  const runStats = Object.entries(runs).map(([label, run]) => {
@@ -16,6 +16,7 @@
  */

  import { logProgress, writeOutput } from '../core.ts'
+ import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
  import { grade as statisticalGrade } from '../graders/trials-compare-statistical.ts'
  import { grade as weightedGrade } from '../graders/trials-compare-weighted.ts'
  import type {
@@ -194,13 +195,14 @@ const computeReliabilityMetrics = (results: TrialResult[]): TrialsReliabilityMet
  const passExpKValues = results.map((r) => r.passExpK ?? 0)

  if (passExpKValues.length === 0) {
- return { avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
+ return { type: 'trial', avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
  }

  const sorted = [...passExpKValues].sort((a, b) => a - b)
  const sum = passExpKValues.reduce((a, b) => a + b, 0)

  return {
+ type: 'trial',
  avgPassExpK: sum / passExpKValues.length,
  medianPassExpK: percentile(sorted, 0.5),
  p25PassExpK: percentile(sorted, 0.25),
@@ -407,6 +409,34 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
  flakiness[label] = computeFlakinessMetrics(results)
  }

+ // Compute confidence intervals when using statistical strategy
+ if (strategy === 'statistical') {
+ const bootstrapConfig = getBootstrapConfigFromEnv()
+
+ for (const label of runLabels) {
+ const resultsMap = runResults[label] ?? new Map()
+ const results = [...resultsMap.values()]
+ const passAtKValues = results.map((r) => r.passAtK ?? 0)
+ const passExpKValues = results.map((r) => r.passExpK ?? 0)
+
+ // Capability CIs
+ const capabilityMetrics = capability[label]
+ if (capabilityMetrics) {
+ capabilityMetrics.confidenceIntervals = {
+ avgPassAtK: bootstrap(passAtKValues, bootstrapConfig).ci,
+ }
+ }
+
+ // Reliability CIs
+ const reliabilityMetrics = reliability[label]
+ if (reliabilityMetrics) {
+ reliabilityMetrics.confidenceIntervals = {
+ avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
+ }
+ }
+ }
+ }
+
  // Compute pairwise comparisons
  const capabilityPairwise: PairwiseComparison[] = []
  const reliabilityPairwise: PairwiseComparison[] = []
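The statistical pass mutates the per-run metric records in place. A sketch of the capability entry it produces, with field names taken from the assignments above and the numbers purely hypothetical:

```ts
// Hypothetical capability entry after runTrialsCompare with
// strategy: 'statistical' — only confidenceIntervals is new here
const capabilityEntry = {
  avgPassAtK: 0.9,
  medianPassAtK: 0.9,
  p25PassAtK: 0.85,
  p75PassAtK: 0.95,
  confidenceIntervals: {
    // bootstrap CI over the per-prompt passAtK values (hypothetical bounds)
    avgPassAtK: [0.85, 0.95] as [number, number],
  },
}
```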
@@ -531,27 +561,52 @@ const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string =>
  lines.push(`Prompts: ${report.meta.promptCount} | Trials per prompt: ${report.meta.trialsPerPrompt}`)
  lines.push('')

+ // Check if any run has confidence intervals (statistical strategy was used)
+ const hasCIs = Object.values(report.capability).some((c) => c.confidenceIntervals)
+
  // Capability table
  lines.push('## Capability (passAtK)')
  lines.push('')
- lines.push('| Run | Avg | Median | P25 | P75 |')
- lines.push('|-----|-----|--------|-----|-----|')
- for (const [label, c] of Object.entries(report.capability)) {
- lines.push(
- `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
- )
+ if (hasCIs) {
+ lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
+ lines.push('|-----|-----|--------|--------|-----|-----|')
+ for (const [label, c] of Object.entries(report.capability)) {
+ const avgCI = formatCI(c.confidenceIntervals?.avgPassAtK)
+ lines.push(
+ `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${avgCI} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
+ )
+ }
+ } else {
+ lines.push('| Run | Avg | Median | P25 | P75 |')
+ lines.push('|-----|-----|--------|-----|-----|')
+ for (const [label, c] of Object.entries(report.capability)) {
+ lines.push(
+ `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
+ )
+ }
  }
  lines.push('')

  // Reliability table
  lines.push('## Reliability (passExpK)')
  lines.push('')
- lines.push('| Run | Avg | Median | P25 | P75 |')
- lines.push('|-----|-----|--------|-----|-----|')
- for (const [label, r] of Object.entries(report.reliability)) {
- lines.push(
- `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
- )
+ if (hasCIs) {
+ lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
+ lines.push('|-----|-----|--------|--------|-----|-----|')
+ for (const [label, r] of Object.entries(report.reliability)) {
+ const avgCI = formatCI(r.confidenceIntervals?.avgPassExpK)
+ lines.push(
+ `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${avgCI} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
+ )
+ }
+ } else {
+ lines.push('| Run | Avg | Median | P25 | P75 |')
+ lines.push('|-----|-----|--------|-----|-----|')
+ for (const [label, r] of Object.entries(report.reliability)) {
+ lines.push(
+ `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
+ )
+ }
  }
  lines.push('')
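`formatCI` output is deterministic, so the new table cells are easy to predict; a few concrete calls (the zero-decimal variant appears in the compare command's Performance table below):

```ts
import { formatCI } from '../graders/bootstrap.ts'

formatCI([0.82, 0.931])       // => "[0.820, 0.931]" (three decimals by default)
formatCI([1043.2, 1187.6], 0) // => "[1043, 1188]"   (latency columns)
formatCI(undefined)           // => "" (weighted-strategy runs have no CIs)
```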
@@ -25,6 +25,7 @@
  import { basename, extname } from 'node:path'
  import { parseArgs } from 'node:util'
  import { buildResultsIndex, logProgress, writeOutput } from '../core.ts'
+ import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
  import { grade as statisticalGrade } from '../graders/compare-statistical.ts'
  import { grade as weightedGrade } from '../graders/compare-weighted.ts'
  import type {
@@ -463,6 +464,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
  const completedCount = results.filter((r) => r.output && !r.errors?.length).length

  reliability[label] = {
+ type: 'run',
  toolErrors: toolErrorCount,
  toolErrorRate: results.length > 0 ? toolErrorCount / results.length : 0,
  timeouts: timeoutCount,
@@ -471,6 +473,36 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
  }
  }

+ // Compute confidence intervals when using statistical strategy
+ if (strategy === 'statistical') {
+ const bootstrapConfig = getBootstrapConfigFromEnv()
+
+ for (const label of runLabels) {
+ const resultsMap = runResults[label] ?? new Map()
+ const results = [...resultsMap.values()]
+ const scores = results.map((r) => r.score?.score ?? 0)
+ const passes = results.map((r) => (r.score?.pass === true ? 1 : 0))
+ const latencies = results.map((r) => r.timing?.total ?? 0)
+
+ // Quality CIs
+ const qualityMetrics = quality[label]
+ if (qualityMetrics) {
+ qualityMetrics.confidenceIntervals = {
+ avgScore: bootstrap(scores, bootstrapConfig).ci,
+ passRate: bootstrap(passes, bootstrapConfig).ci,
+ }
+ }
+
+ // Performance CIs
+ const performanceMetrics = performance[label]
+ if (performanceMetrics) {
+ performanceMetrics.confidenceIntervals = {
+ latencyMean: bootstrap(latencies, bootstrapConfig).ci,
+ }
+ }
+ }
+ }
+
  // Trajectory info
  const trajectoryInfo: Record<string, TrajectoryInfo> = {}
  for (const label of runLabels) {
@@ -586,27 +618,53 @@ const formatReportAsMarkdown = (report: ComparisonReport): string => {
  lines.push(`Prompts: ${report.meta.promptCount} total, ${report.meta.promptsWithAllRuns} with all runs`)
  lines.push('')

+ // Check if any run has confidence intervals (statistical strategy was used)
+ const hasCIs = Object.values(report.quality).some((q) => q.confidenceIntervals)
+
  // Quality table
  lines.push('## Quality')
  lines.push('')
- lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |')
- lines.push('|-----|-----------|-----------|------|------|')
- for (const [label, q] of Object.entries(report.quality)) {
- lines.push(
- `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`,
- )
+ if (hasCIs) {
+ lines.push('| Run | Avg Score | 95% CI | Pass Rate | 95% CI | Pass | Fail |')
+ lines.push('|-----|-----------|--------|-----------|--------|------|------|')
+ for (const [label, q] of Object.entries(report.quality)) {
+ const avgScoreCI = formatCI(q.confidenceIntervals?.avgScore)
+ const passRateCI = formatCI(q.confidenceIntervals?.passRate)
+ lines.push(
+ `| ${label} | ${q.avgScore.toFixed(3)} | ${avgScoreCI} | ${(q.passRate * 100).toFixed(1)}% | ${passRateCI} | ${q.passCount} | ${q.failCount} |`,
+ )
+ }
+ } else {
+ lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |')
+ lines.push('|-----|-----------|-----------|------|------|')
+ for (const [label, q] of Object.entries(report.quality)) {
+ lines.push(
+ `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`,
+ )
+ }
  }
  lines.push('')

  // Performance table
  lines.push('## Performance')
  lines.push('')
- lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |')
- lines.push('|-----|----------|----------|----------|-----------|')
- for (const [label, p] of Object.entries(report.performance)) {
- lines.push(
- `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`,
- )
+ if (hasCIs) {
+ lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI |')
+ lines.push('|-----|----------|----------|----------|-----------|--------|')
+ for (const [label, p] of Object.entries(report.performance)) {
+ const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0)
+ lines.push(
+ `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} |`,
+ )
+ }
+ } else {
+ lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |')
+ lines.push('|-----|----------|----------|----------|-----------|')
+ for (const [label, p] of Object.entries(report.performance)) {
+ lines.push(
+ `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`,
+ )
+ }
  }
  lines.push('')
@@ -0,0 +1,285 @@
+ /**
+ * Integration tests for compare command statistical strategy.
+ *
+ * @remarks
+ * Tests verify confidence interval computation for the statistical strategy
+ * in the compare command with CaptureResult format.
+ *
+ * @packageDocumentation
+ */
+
+ import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
+ import type { CaptureResult } from '../../schemas.ts'
+ import { runCompare } from '../compare.ts'
+
+ // ============================================================================
+ // Test Fixtures
+ // ============================================================================
+
+ const createCaptureResult = (id: string, score: number, pass: boolean, duration: number = 1000): CaptureResult => ({
+ id,
+ input: `Prompt for ${id}`,
+ output: `Output for ${id}`,
+ trajectory: [{ type: 'message', content: `Output for ${id}`, timestamp: Date.now() }],
+ metadata: {},
+ timing: {
+ start: Date.now(),
+ end: Date.now() + duration,
+ sessionCreation: 100,
+ total: duration,
+ },
+ toolErrors: false,
+ score: {
+ pass,
+ score,
+ reasoning: pass ? 'Passed' : 'Failed',
+ },
+ })
+
+ const tempDir = `${import.meta.dir}/.test-tmp/compare-statistical`
+
+ beforeAll(async () => {
+ await Bun.$`mkdir -p ${tempDir}`
+ })
+
+ afterAll(async () => {
+ await Bun.$`rm -rf ${tempDir}`
+ })
+
+ // ============================================================================
+ // Statistical Strategy CI Tests
+ // ============================================================================
+
+ describe('runCompare statistical strategy', () => {
+ test('computes confidence intervals for quality metrics', async () => {
+ const run1Path = `${tempDir}/ci-qual-run1.jsonl`
+ const run2Path = `${tempDir}/ci-qual-run2.jsonl`
+
+ // Create multiple prompts with varying scores for meaningful CI computation
+ const results1 = [
+ createCaptureResult('p1', 0.9, true, 1000),
+ createCaptureResult('p2', 0.85, true, 1100),
+ createCaptureResult('p3', 0.95, true, 900),
+ createCaptureResult('p4', 0.8, true, 1200),
+ ]
+ const results2 = [
+ createCaptureResult('p1', 0.6, false, 2000),
+ createCaptureResult('p2', 0.5, false, 2100),
+ createCaptureResult('p3', 0.7, true, 1900),
+ createCaptureResult('p4', 0.55, false, 2200),
+ ]
+
+ await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+ await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+ const report = await runCompare({
+ runs: [
+ { label: 'high', path: run1Path },
+ { label: 'low', path: run2Path },
+ ],
+ strategy: 'statistical',
+ progress: false,
+ })
+
+ // Verify confidence intervals are computed for quality
+ const highQuality = report.quality.high
+ expect(highQuality).toBeDefined()
+ expect(highQuality?.confidenceIntervals).toBeDefined()
+ expect(highQuality?.confidenceIntervals?.avgScore).toBeDefined()
+ expect(highQuality?.confidenceIntervals?.passRate).toBeDefined()
+
+ // avgScore CI should be a tuple [lower, upper]
+ const avgScoreCI = highQuality?.confidenceIntervals?.avgScore
+ expect(avgScoreCI).toHaveLength(2)
+ expect(avgScoreCI?.[0]).toBeLessThanOrEqual(avgScoreCI?.[1] ?? 0)
+
+ // CI should contain the average (within reasonable bounds)
+ expect(avgScoreCI?.[0]).toBeLessThanOrEqual(highQuality?.avgScore ?? 0)
+ expect(avgScoreCI?.[1]).toBeGreaterThanOrEqual(highQuality?.avgScore ?? 1)
+
+ // passRate CI should also be valid
+ const passRateCI = highQuality?.confidenceIntervals?.passRate
+ expect(passRateCI).toHaveLength(2)
+ expect(passRateCI?.[0]).toBeLessThanOrEqual(passRateCI?.[1] ?? 0)
+
+ // Verify reliability metrics include type discriminator
+ expect(report.reliability.high?.type).toBe('run')
+ expect(report.reliability.low?.type).toBe('run')
+ })
+
+ test('computes confidence intervals for performance metrics', async () => {
+ const run1Path = `${tempDir}/ci-perf-run1.jsonl`
+ const run2Path = `${tempDir}/ci-perf-run2.jsonl`
+
+ // Create results with varying latencies
+ const results1 = [
+ createCaptureResult('p1', 0.9, true, 1000),
+ createCaptureResult('p2', 0.85, true, 1100),
+ createCaptureResult('p3', 0.95, true, 900),
+ createCaptureResult('p4', 0.8, true, 1050),
+ ]
+ const results2 = [
+ createCaptureResult('p1', 0.7, true, 2000),
+ createCaptureResult('p2', 0.65, true, 2200),
+ createCaptureResult('p3', 0.75, true, 1800),
+ createCaptureResult('p4', 0.6, true, 2100),
+ ]
+
+ await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+ await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+ const report = await runCompare({
+ runs: [
+ { label: 'fast', path: run1Path },
+ { label: 'slow', path: run2Path },
+ ],
+ strategy: 'statistical',
+ progress: false,
+ })
+
+ // Verify confidence intervals are computed for performance
+ const fastPerf = report.performance.fast
+ expect(fastPerf).toBeDefined()
+ expect(fastPerf?.confidenceIntervals).toBeDefined()
+ expect(fastPerf?.confidenceIntervals?.latencyMean).toBeDefined()
+
+ // latencyMean CI should be a tuple [lower, upper]
+ const latencyCI = fastPerf?.confidenceIntervals?.latencyMean
+ expect(latencyCI).toHaveLength(2)
+ expect(latencyCI?.[0]).toBeLessThanOrEqual(latencyCI?.[1] ?? 0)
+
+ // Fast run should have lower latency CI than slow run
+ const slowPerf = report.performance.slow
+ const slowLatencyCI = slowPerf?.confidenceIntervals?.latencyMean
+ expect(latencyCI?.[1]).toBeLessThan(slowLatencyCI?.[0] ?? 0)
+ })
+
+ test('weighted strategy does not compute confidence intervals', async () => {
+ const run1Path = `${tempDir}/no-ci-run1.jsonl`
+ const run2Path = `${tempDir}/no-ci-run2.jsonl`
+
+ const results1 = [createCaptureResult('p1', 0.9, true), createCaptureResult('p2', 0.85, true)]
+ const results2 = [createCaptureResult('p1', 0.6, false), createCaptureResult('p2', 0.5, false)]
+
+ await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+ await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+ const report = await runCompare({
+ runs: [
+ { label: 'run1', path: run1Path },
+ { label: 'run2', path: run2Path },
+ ],
+ strategy: 'weighted', // Default strategy
+ progress: false,
+ })
+
+ // Confidence intervals should NOT be present for weighted strategy
+ const quality = report.quality.run1
+ expect(quality?.confidenceIntervals).toBeUndefined()
+
+ const perf = report.performance.run1
+ expect(perf?.confidenceIntervals).toBeUndefined()
+ })
+
+ test('statistical strategy includes CIs in markdown output', async () => {
+ const run1Path = `${tempDir}/ci-md-run1.jsonl`
+ const run2Path = `${tempDir}/ci-md-run2.jsonl`
+ const outputPath = `${tempDir}/ci-report.md`
+
+ const results1 = [createCaptureResult('p1', 0.9, true, 1000), createCaptureResult('p2', 0.85, true, 1100)]
+ const results2 = [createCaptureResult('p1', 0.6, false, 2000), createCaptureResult('p2', 0.5, false, 2100)]
+
+ await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+ await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+ await runCompare({
+ runs: [
+ { label: 'agent1', path: run1Path },
+ { label: 'agent2', path: run2Path },
+ ],
+ strategy: 'statistical',
+ outputPath,
+ format: 'markdown',
+ progress: false,
+ })
+
+ const content = await Bun.file(outputPath).text()
+
+ // Markdown should include 95% CI column headers
+ expect(content).toContain('95% CI')
+ // Should contain CI values in bracket format [lower, upper]
+ expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/)
+ })
+
+ test('handles single sample gracefully with degenerate CI', async () => {
+ const run1Path = `${tempDir}/single-run1.jsonl`
+ const run2Path = `${tempDir}/single-run2.jsonl`
+
+ // Single sample per run
+ const result1 = createCaptureResult('p1', 0.9, true)
+ const result2 = createCaptureResult('p1', 0.5, false)
+
+ await Bun.write(run1Path, JSON.stringify(result1))
+ await Bun.write(run2Path, JSON.stringify(result2))
+
+ const report = await runCompare({
+ runs: [
+ { label: 'single1', path: run1Path },
+ { label: 'single2', path: run2Path },
+ ],
+ strategy: 'statistical',
+ progress: false,
+ })
+
+ // Should still compute CIs (they will be degenerate for single sample)
+ const quality = report.quality.single1
+ expect(quality?.confidenceIntervals).toBeDefined()
+ expect(quality?.confidenceIntervals?.avgScore).toBeDefined()
+
+ // For single sample, CI should collapse to the value
+ const ci = quality?.confidenceIntervals?.avgScore
+ expect(ci?.[0]).toBeCloseTo(ci?.[1] ?? 0, 2)
+ expect(ci?.[0]).toBeCloseTo(quality?.avgScore ?? 0, 2)
+ })
+
+ test('JSON output includes confidence intervals structure', async () => {
+ const run1Path = `${tempDir}/json-ci-run1.jsonl`
+ const run2Path = `${tempDir}/json-ci-run2.jsonl`
+ const outputPath = `${tempDir}/ci-report.json`
+
+ const results1 = [
+ createCaptureResult('p1', 0.9, true),
+ createCaptureResult('p2', 0.85, true),
+ createCaptureResult('p3', 0.95, true),
+ ]
+ const results2 = [
+ createCaptureResult('p1', 0.6, false),
+ createCaptureResult('p2', 0.5, false),
+ createCaptureResult('p3', 0.7, true),
+ ]
+
+ await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+ await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+ await runCompare({
+ runs: [
+ { label: 'high', path: run1Path },
+ { label: 'low', path: run2Path },
+ ],
+ strategy: 'statistical',
+ outputPath,
+ format: 'json',
+ progress: false,
+ })
+
+ const content = await Bun.file(outputPath).text()
+ const parsed = JSON.parse(content)
+
+ // Verify JSON structure includes confidenceIntervals
+ expect(parsed.quality.high.confidenceIntervals).toBeDefined()
+ expect(parsed.quality.high.confidenceIntervals.avgScore).toBeInstanceOf(Array)
+ expect(parsed.quality.high.confidenceIntervals.avgScore.length).toBe(2)
+ expect(parsed.performance.high.confidenceIntervals).toBeDefined()
+ expect(parsed.performance.high.confidenceIntervals.latencyMean).toBeInstanceOf(Array)
+ })
+ })
@@ -108,6 +108,8 @@ describe('runTrialsCompare', () => {
  expect(report.meta.promptCount).toBe(2)
  expect(report.capability).toBeDefined()
  expect(report.reliability).toBeDefined()
+ expect(report.reliability.baseline?.type).toBe('trial')
+ expect(report.reliability.variant?.type).toBe('trial')
  expect(report.flakiness).toBeDefined()
  expect(report.headToHead.capability.length).toBeGreaterThan(0)

@@ -210,6 +212,147 @@ describe('runTrialsCompare', () => {
  expect(report.meta.runs).toEqual(['better', 'worse'])
  })

+ test('statistical strategy computes confidence intervals for capability metrics', async () => {
+ const run1Path = `${tempDir}/ci-cap-run1.jsonl`
+ const run2Path = `${tempDir}/ci-cap-run2.jsonl`
+
+ // Create multiple prompts for meaningful CI computation
+ const trials1 = [
+ createTrialResult('p1', 0.9, 0.8),
+ createTrialResult('p2', 0.85, 0.7),
+ createTrialResult('p3', 0.95, 0.9),
+ ]
+ const trials2 = [
+ createTrialResult('p1', 0.6, 0.4),
+ createTrialResult('p2', 0.5, 0.3),
+ createTrialResult('p3', 0.7, 0.5),
+ ]
+
+ await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+ await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+ const report = await runTrialsCompare({
+ runs: [
+ { label: 'high', path: run1Path },
+ { label: 'low', path: run2Path },
+ ],
+ strategy: 'statistical',
+ progress: false,
+ })
+
+ // Verify confidence intervals are computed for capability
+ const highCap = report.capability.high
+ expect(highCap).toBeDefined()
+ expect(highCap?.confidenceIntervals).toBeDefined()
+ expect(highCap?.confidenceIntervals?.avgPassAtK).toBeDefined()
+
+ // CI should be a tuple [lower, upper]
+ const ci = highCap?.confidenceIntervals?.avgPassAtK
+ expect(ci).toHaveLength(2)
+ expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0)
+
+ // CI should contain the average (within reasonable bounds)
+ expect(ci?.[0]).toBeLessThanOrEqual(highCap?.avgPassAtK ?? 0)
+ expect(ci?.[1]).toBeGreaterThanOrEqual(highCap?.avgPassAtK ?? 1)
+ })
+
+ test('statistical strategy computes confidence intervals for reliability metrics', async () => {
+ const run1Path = `${tempDir}/ci-rel-run1.jsonl`
+ const run2Path = `${tempDir}/ci-rel-run2.jsonl`
+
+ const trials1 = [
+ createTrialResult('p1', 0.9, 0.85),
+ createTrialResult('p2', 0.8, 0.75),
+ createTrialResult('p3', 0.85, 0.8),
+ ]
+ const trials2 = [
+ createTrialResult('p1', 0.7, 0.3),
+ createTrialResult('p2', 0.6, 0.2),
+ createTrialResult('p3', 0.65, 0.25),
+ ]
+
+ await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+ await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+ const report = await runTrialsCompare({
+ runs: [
+ { label: 'reliable', path: run1Path },
+ { label: 'flaky', path: run2Path },
+ ],
+ strategy: 'statistical',
+ progress: false,
+ })
+
+ // Verify confidence intervals are computed for reliability
+ const reliableRel = report.reliability.reliable
+ expect(reliableRel).toBeDefined()
+ expect(reliableRel?.type).toBe('trial')
+ expect(reliableRel?.confidenceIntervals).toBeDefined()
+ expect(reliableRel?.confidenceIntervals?.avgPassExpK).toBeDefined()
+
+ // CI should be a tuple [lower, upper]
+ const ci = reliableRel?.confidenceIntervals?.avgPassExpK
+ expect(ci).toHaveLength(2)
+ expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0)
+ })
+
+ test('weighted strategy does not compute confidence intervals', async () => {
+ const run1Path = `${tempDir}/no-ci-run1.jsonl`
+ const run2Path = `${tempDir}/no-ci-run2.jsonl`
+
+ const trial1 = createTrialResult('test-001', 0.9, 0.7)
+ const trial2 = createTrialResult('test-001', 0.5, 0.3)
+
+ await Bun.write(run1Path, JSON.stringify(trial1))
+ await Bun.write(run2Path, JSON.stringify(trial2))
+
+ const report = await runTrialsCompare({
+ runs: [
+ { label: 'run1', path: run1Path },
+ { label: 'run2', path: run2Path },
+ ],
+ strategy: 'weighted', // Default strategy
+ progress: false,
+ })
+
+ // Confidence intervals should NOT be present for weighted strategy
+ const cap = report.capability.run1
+ expect(cap?.confidenceIntervals).toBeUndefined()
+
+ const rel = report.reliability.run1
+ expect(rel?.confidenceIntervals).toBeUndefined()
+ })
+
+ test('statistical strategy includes CIs in markdown output', async () => {
+ const run1Path = `${tempDir}/ci-md-run1.jsonl`
+ const run2Path = `${tempDir}/ci-md-run2.jsonl`
+ const outputPath = `${tempDir}/ci-report.md`
+
+ const trials1 = [createTrialResult('p1', 0.9, 0.8), createTrialResult('p2', 0.85, 0.75)]
+ const trials2 = [createTrialResult('p1', 0.6, 0.4), createTrialResult('p2', 0.5, 0.3)]
+
+ await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+ await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+ await runTrialsCompare({
+ runs: [
+ { label: 'agent1', path: run1Path },
+ { label: 'agent2', path: run2Path },
+ ],
+ strategy: 'statistical',
+ outputPath,
+ format: 'markdown',
+ progress: false,
+ })
+
+ const content = await Bun.file(outputPath).text()
+
+ // Markdown should include 95% CI column headers
+ expect(content).toContain('95% CI')
+ // Should contain CI values in bracket format [lower, upper]
+ expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/)
+ })
+
  test('computes correct capability metrics', async () => {
  const run1Path = `${tempDir}/cap-run1.jsonl`
@@ -573,6 +573,17 @@ export type ValidationResult = z.infer<typeof ValidationResultSchema>
  // Comparison Report Schemas
  // ============================================================================

+ /**
+ * Confidence interval schema as [lower, upper] bounds.
+ *
+ * @remarks
+ * Used for bootstrap-computed confidence intervals when strategy=statistical.
+ */
+ export const ConfidenceIntervalSchema = z.tuple([z.number(), z.number()])
+
+ /** Confidence interval type */
+ export type ConfidenceInterval = z.infer<typeof ConfidenceIntervalSchema>
+
  /**
  * Score distribution histogram for quality analysis.
  *
@@ -590,6 +601,19 @@ export const ScoreDistributionSchema = z.object({
  /** Score distribution type */
  export type ScoreDistribution = z.infer<typeof ScoreDistributionSchema>

+ /**
+ * Confidence intervals for quality metrics.
+ */
+ export const QualityConfidenceIntervalsSchema = z.object({
+ /** CI for avgScore */
+ avgScore: ConfidenceIntervalSchema.optional(),
+ /** CI for passRate */
+ passRate: ConfidenceIntervalSchema.optional(),
+ })
+
+ /** Quality confidence intervals type */
+ export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceIntervalsSchema>
+
  /**
  * Quality metrics for a single run in comparison.
  */
@@ -604,6 +628,8 @@ export const QualityMetricsSchema = z.object({
  failCount: z.number(),
  /** Score distribution histogram */
  scoreDistribution: ScoreDistributionSchema,
+ /** Confidence intervals (only with strategy=statistical) */
+ confidenceIntervals: QualityConfidenceIntervalsSchema.optional(),
  })

  /** Quality metrics type */
@@ -630,6 +656,17 @@ export const LatencyStatsSchema = z.object({
  /** Latency stats type */
  export type LatencyStats = z.infer<typeof LatencyStatsSchema>

+ /**
+ * Confidence intervals for performance metrics.
+ */
+ export const PerformanceConfidenceIntervalsSchema = z.object({
+ /** CI for latency mean */
+ latencyMean: ConfidenceIntervalSchema.optional(),
+ })
+
+ /** Performance confidence intervals type */
+ export type PerformanceConfidenceIntervals = z.infer<typeof PerformanceConfidenceIntervalsSchema>
+
  /**
  * Performance metrics for a single run in comparison.
  */
@@ -640,6 +677,8 @@ export const PerformanceMetricsSchema = z.object({
  firstResponse: LatencyStatsSchema.optional(),
  /** Sum of all run durations in milliseconds */
  totalDuration: z.number(),
+ /** Confidence intervals (only with strategy=statistical) */
+ confidenceIntervals: PerformanceConfidenceIntervalsSchema.optional(),
  })

  /** Performance metrics type */
@@ -649,6 +688,8 @@ export type PerformanceMetrics = z.infer<typeof PerformanceMetricsSchema>
  * Reliability metrics for a single run in comparison.
  */
  export const ReliabilityMetricsSchema = z.object({
+ /** Discriminator for run-based reliability metrics */
+ type: z.literal('run'),
  /** Count of runs with toolErrors=true */
  toolErrors: z.number(),
  /** Percentage of runs with tool errors */
@@ -782,6 +823,17 @@ export type ComparisonReport = z.infer<typeof ComparisonReportSchema>
  // Trials Comparison Report Schemas
  // ============================================================================

+ /**
+ * Confidence intervals for trials capability metrics.
+ */
+ export const TrialsCapabilityConfidenceIntervalsSchema = z.object({
+ /** CI for avgPassAtK */
+ avgPassAtK: ConfidenceIntervalSchema.optional(),
+ })
+
+ /** Trials capability confidence intervals type */
+ export type TrialsCapabilityConfidenceIntervals = z.infer<typeof TrialsCapabilityConfidenceIntervalsSchema>
+
  /**
  * Capability metrics for trials comparison (passAtK-based).
  *
@@ -798,11 +850,24 @@ export const TrialsCapabilityMetricsSchema = z.object({
  p25PassAtK: z.number(),
  /** 75th percentile passAtK */
  p75PassAtK: z.number(),
+ /** Confidence intervals (only with strategy=statistical) */
+ confidenceIntervals: TrialsCapabilityConfidenceIntervalsSchema.optional(),
  })

  /** Trials capability metrics type */
  export type TrialsCapabilityMetrics = z.infer<typeof TrialsCapabilityMetricsSchema>

+ /**
+ * Confidence intervals for trials reliability metrics.
+ */
+ export const TrialsReliabilityConfidenceIntervalsSchema = z.object({
+ /** CI for avgPassExpK */
+ avgPassExpK: ConfidenceIntervalSchema.optional(),
+ })
+
+ /** Trials reliability confidence intervals type */
+ export type TrialsReliabilityConfidenceIntervals = z.infer<typeof TrialsReliabilityConfidenceIntervalsSchema>
+
  /**
  * Reliability metrics for trials comparison (passExpK-based).
  *
@@ -811,6 +876,8 @@ export type TrialsCapabilityMetrics = z.infer<typeof TrialsCapabilityMetricsSche
  * Higher passExpK means the agent reliably solves the task every time.
  */
  export const TrialsReliabilityMetricsSchema = z.object({
+ /** Discriminator for trial-based reliability metrics */
+ type: z.literal('trial'),
  /** Average passExpK across all prompts */
  avgPassExpK: z.number(),
  /** Median passExpK */
@@ -819,6 +886,8 @@ export const TrialsReliabilityMetricsSchema = z.object({
  p25PassExpK: z.number(),
  /** 75th percentile passExpK */
  p75PassExpK: z.number(),
+ /** Confidence intervals (only with strategy=statistical) */
+ confidenceIntervals: TrialsReliabilityConfidenceIntervalsSchema.optional(),
  })

  /** Trials reliability metrics type */
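A short validation sketch against the new schemas (the import path is assumed; `z.tuple` rejects anything that is not exactly a two-number array, and both CI fields stay optional so weighted-strategy reports still parse):

```ts
import { ConfidenceIntervalSchema, QualityConfidenceIntervalsSchema } from './schemas.ts'

ConfidenceIntervalSchema.parse([0.81, 0.94])       // ok: [lower, upper] tuple
ConfidenceIntervalSchema.safeParse([0.81]).success // => false (wrong arity)

QualityConfidenceIntervalsSchema.parse({})                         // ok: both fields optional
QualityConfidenceIntervalsSchema.parse({ avgScore: [0.81, 0.94] }) // ok
```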