@plaited/agent-eval-harness 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -3
- package/src/graders/bootstrap.ts +135 -0
- package/src/graders/compare-statistical.ts +14 -86
- package/src/graders/tests/bootstrap.spec.ts +169 -0
- package/src/graders/trials-compare-statistical.ts +6 -11
- package/src/pipeline/compare-trials.ts +66 -12
- package/src/pipeline/compare.ts +69 -12
- package/src/pipeline/tests/compare-statistical.spec.ts +281 -0
- package/src/pipeline/tests/compare-trials.spec.ts +140 -0
- package/src/schemas/schemas.ts +65 -0
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.8.0",
+  "version": "0.8.1",
   "description": "CLI tool for capturing agent trajectories from headless CLI agents",
   "license": "ISC",
   "engines": {
@@ -42,8 +42,10 @@
     "check:types": "tsc --noEmit",
     "check:write": "biome check --write && format-package --write",
     "prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
-    "test": "bun test
-    "test:
+    "test": "bun run test:bin && bun test:src",
+    "test:bin": "bun test bin/tests/*.spec.ts",
+    "test:integration": "bun test ./**/integration_tests/*.spec.ts",
+    "test:src": "bun test src/**/tests/*.spec.ts"
   },
   "lint-staged": {
     "*.{js,cjs,jsx,tsx,ts}": [
package/src/graders/bootstrap.ts
ADDED

@@ -0,0 +1,135 @@
+/**
+ * Shared bootstrap sampling utilities for confidence interval computation.
+ *
+ * @remarks
+ * Bootstrap resampling provides robust confidence intervals without
+ * assuming a specific distribution. For small samples, it's more
+ * reliable than parametric methods.
+ *
+ * Environment variable configuration:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
+ *
+ * @packageDocumentation
+ */
+
+/** Default number of bootstrap iterations */
+export const DEFAULT_ITERATIONS = 1000
+
+/** Default confidence level (95%) */
+export const DEFAULT_CONFIDENCE_LEVEL = 0.95
+
+/**
+ * Confidence interval as [lower, upper] bounds.
+ */
+export type ConfidenceInterval = [number, number]
+
+/**
+ * Bootstrap confidence interval result.
+ */
+export type BootstrapResult = {
+  /** Median of bootstrap sample means (50th percentile) */
+  median: number
+  /** Confidence interval [lower, upper] */
+  ci: ConfidenceInterval
+}
+
+/**
+ * Configuration for bootstrap sampling.
+ */
+export type BootstrapConfig = {
+  /** Number of bootstrap iterations (default: 1000) */
+  iterations?: number
+  /** Confidence level between 0 and 1 (default: 0.95) */
+  confidenceLevel?: number
+}
+
+/**
+ * Compute bootstrap confidence interval for sample mean.
+ *
+ * @remarks
+ * Bootstrap resampling provides robust confidence intervals without
+ * assuming a specific distribution. For small samples, it's more
+ * reliable than parametric methods.
+ *
+ * @param samples - Array of numeric samples
+ * @param config - Optional bootstrap configuration
+ * @returns Bootstrap median and confidence interval
+ *
+ * @public
+ */
+export const bootstrap = (samples: number[], config?: BootstrapConfig): BootstrapResult => {
+  const iterations = config?.iterations ?? DEFAULT_ITERATIONS
+  const confidenceLevel = config?.confidenceLevel ?? DEFAULT_CONFIDENCE_LEVEL
+
+  if (samples.length === 0) {
+    return { median: 0, ci: [0, 0] }
+  }
+
+  if (samples.length === 1) {
+    const value = samples[0] ?? 0
+    return { median: value, ci: [value, value] }
+  }
+
+  const means: number[] = []
+
+  for (let i = 0; i < iterations; i++) {
+    // Resample with replacement - we know samples.length > 1 at this point
+    const resampled = Array.from(
+      { length: samples.length },
+      () => samples[Math.floor(Math.random() * samples.length)] as number,
+    )
+
+    // Compute mean of resampled data
+    const sum = resampled.reduce((acc, val) => acc + val, 0)
+    means.push(sum / resampled.length)
+  }
+
+  // Sort means for percentile calculation
+  means.sort((a, b) => a - b)
+
+  // Compute percentile indices based on confidence level
+  // For 95% CI: lower = 2.5th percentile, upper = 97.5th percentile
+  const alpha = (1 - confidenceLevel) / 2
+  const lowerIdx = Math.floor(iterations * alpha)
+  const upperIdx = Math.floor(iterations * (1 - alpha))
+
+  return {
+    median: means[Math.floor(iterations / 2)] ?? 0,
+    ci: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
+  }
+}
+
+/**
+ * Format confidence interval as string.
+ *
+ * @param ci - Confidence interval [lower, upper]
+ * @param decimals - Number of decimal places (default: 3)
+ * @returns Formatted CI string or empty string if undefined
+ *
+ * @public
+ */
+export const formatCI = (ci: ConfidenceInterval | undefined, decimals: number = 3): string => {
+  if (!ci) return ''
+  return `[${ci[0].toFixed(decimals)}, ${ci[1].toFixed(decimals)}]`
+}
+
+/**
+ * Get bootstrap configuration from environment variables.
+ *
+ * @remarks
+ * Reads configuration from:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS`: Number of iterations (min: 100)
+ *
+ * @returns Bootstrap configuration
+ *
+ * @public
+ */
+export const getBootstrapConfigFromEnv = (): BootstrapConfig => {
+  const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+  if (!envValue) return { iterations: DEFAULT_ITERATIONS }
+
+  const parsed = Number.parseInt(envValue, 10)
+  const iterations = Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+
+  return { iterations }
+}
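Taken together, the new module is used roughly like this (a minimal usage sketch, not from the package; the sample scores are invented and the import path assumes a sibling module):

```ts
import { bootstrap, formatCI, getBootstrapConfigFromEnv } from './bootstrap.ts'

// Per-prompt scores from one run (invented sample data).
const scores = [0.72, 0.81, 0.64, 0.9, 0.77]

// 95% CI with the default 1000 resamples.
const { median, ci } = bootstrap(scores)
console.log(`median=${median.toFixed(3)} ci=${formatCI(ci)}`)

// Honors COMPARE_BOOTSTRAP_ITERATIONS when set; here a 90% interval is requested on top.
const envConfig = getBootstrapConfigFromEnv()
console.log(formatCI(bootstrap(scores, { ...envConfig, confidenceLevel: 0.9 }).ci, 2))
```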
@@ -12,81 +12,7 @@
|
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
14
|
import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
|
|
15
|
-
|
|
16
|
-
/** Default number of bootstrap iterations */
|
|
17
|
-
const DEFAULT_ITERATIONS = 1000
|
|
18
|
-
|
|
19
|
-
/**
|
|
20
|
-
* Bootstrap confidence interval result.
|
|
21
|
-
*/
|
|
22
|
-
type BootstrapResult = {
|
|
23
|
-
/** Estimated mean from bootstrap */
|
|
24
|
-
mean: number
|
|
25
|
-
/** 95% confidence interval [lower, upper] */
|
|
26
|
-
ci95: [number, number]
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
/**
|
|
30
|
-
* Compute bootstrap confidence interval for sample mean.
|
|
31
|
-
*
|
|
32
|
-
* @remarks
|
|
33
|
-
* Bootstrap resampling provides robust confidence intervals without
|
|
34
|
-
* assuming a specific distribution. For small samples, it's more
|
|
35
|
-
* reliable than parametric methods.
|
|
36
|
-
*
|
|
37
|
-
* @param samples - Array of numeric samples
|
|
38
|
-
* @param iterations - Number of bootstrap iterations
|
|
39
|
-
* @returns Bootstrap mean and 95% confidence interval
|
|
40
|
-
*/
|
|
41
|
-
const bootstrap = (samples: number[], iterations: number = DEFAULT_ITERATIONS): BootstrapResult => {
|
|
42
|
-
if (samples.length === 0) {
|
|
43
|
-
return { mean: 0, ci95: [0, 0] }
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
if (samples.length === 1) {
|
|
47
|
-
const value = samples[0] ?? 0
|
|
48
|
-
return { mean: value, ci95: [value, value] }
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
const means: number[] = []
|
|
52
|
-
|
|
53
|
-
for (let i = 0; i < iterations; i++) {
|
|
54
|
-
// Resample with replacement - we know samples.length > 1 at this point
|
|
55
|
-
const resampled = Array.from(
|
|
56
|
-
{ length: samples.length },
|
|
57
|
-
() => samples[Math.floor(Math.random() * samples.length)] as number,
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
// Compute mean of resampled data
|
|
61
|
-
const sum = resampled.reduce((acc, val) => acc + val, 0)
|
|
62
|
-
means.push(sum / resampled.length)
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
// Sort means for percentile calculation
|
|
66
|
-
means.sort((a, b) => a - b)
|
|
67
|
-
|
|
68
|
-
// 95% CI: 2.5th and 97.5th percentiles
|
|
69
|
-
const lowerIdx = Math.floor(iterations * 0.025)
|
|
70
|
-
const upperIdx = Math.floor(iterations * 0.975)
|
|
71
|
-
|
|
72
|
-
return {
|
|
73
|
-
mean: means[Math.floor(iterations / 2)] ?? 0,
|
|
74
|
-
ci95: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
/**
|
|
79
|
-
* Get bootstrap iterations from environment variable.
|
|
80
|
-
*
|
|
81
|
-
* @returns Number of bootstrap iterations
|
|
82
|
-
*/
|
|
83
|
-
const getIterationsFromEnv = (): number => {
|
|
84
|
-
const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
|
|
85
|
-
if (!envValue) return DEFAULT_ITERATIONS
|
|
86
|
-
|
|
87
|
-
const parsed = Number.parseInt(envValue, 10)
|
|
88
|
-
return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
|
|
89
|
-
}
|
|
15
|
+
import { bootstrap, getBootstrapConfigFromEnv } from './bootstrap.ts'
|
|
90
16
|
|
|
91
17
|
/**
|
|
92
18
|
* Statistical significance comparison grader.
|
|
@@ -107,7 +33,7 @@ const getIterationsFromEnv = (): number => {
|
|
|
107
33
|
* @public
|
|
108
34
|
*/
|
|
109
35
|
export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
|
|
110
|
-
const
|
|
36
|
+
const config = getBootstrapConfigFromEnv()
|
|
111
37
|
|
|
112
38
|
// Collect scores for each run
|
|
113
39
|
const runStats = Object.entries(runs).map(([label, run]) => {
|
|
@@ -116,13 +42,13 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
|
|
|
116
42
|
|
|
117
43
|
// For single-prompt comparison, we only have one sample
|
|
118
44
|
// In practice, this grader is most useful when aggregating across prompts
|
|
119
|
-
const stats = bootstrap([score],
|
|
45
|
+
const stats = bootstrap([score], config)
|
|
120
46
|
|
|
121
47
|
return { label, score, stats }
|
|
122
48
|
})
|
|
123
49
|
|
|
124
|
-
// Sort by bootstrap
|
|
125
|
-
const sorted = runStats.sort((a, b) => b.stats.
|
|
50
|
+
// Sort by bootstrap median descending
|
|
51
|
+
const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
|
|
126
52
|
|
|
127
53
|
// Check if winner is statistically significant
|
|
128
54
|
// CIs don't overlap = significant difference (approximately p<0.05)
|
|
@@ -131,7 +57,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
|
|
|
131
57
|
const second = sorted[1]
|
|
132
58
|
if (first && second) {
|
|
133
59
|
// Non-overlapping: first's lower bound > second's upper bound
|
|
134
|
-
isSignificant = first.stats.
|
|
60
|
+
isSignificant = first.stats.ci[0] > second.stats.ci[1]
|
|
135
61
|
}
|
|
136
62
|
|
|
137
63
|
const reasoning = isSignificant
|
|
@@ -142,7 +68,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
|
|
|
142
68
|
rankings: sorted.map((s, i) => ({
|
|
143
69
|
run: s.label,
|
|
144
70
|
rank: i + 1,
|
|
145
|
-
score: s.stats.
|
|
71
|
+
score: s.stats.median,
|
|
146
72
|
})),
|
|
147
73
|
reasoning,
|
|
148
74
|
}
|
|
@@ -156,28 +82,30 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
|
|
|
156
82
|
*
|
|
157
83
|
* @public
|
|
158
84
|
*/
|
|
159
|
-
export const createStatisticalGrader = (iterations
|
|
85
|
+
export const createStatisticalGrader = (iterations?: number): ComparisonGrader => {
|
|
86
|
+
const config = iterations ? { iterations } : getBootstrapConfigFromEnv()
|
|
87
|
+
|
|
160
88
|
return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
|
|
161
89
|
const runStats = Object.entries(runs).map(([label, run]) => {
|
|
162
90
|
const score = run.score?.score ?? 0
|
|
163
|
-
const stats = bootstrap([score],
|
|
91
|
+
const stats = bootstrap([score], config)
|
|
164
92
|
return { label, score, stats }
|
|
165
93
|
})
|
|
166
94
|
|
|
167
|
-
const sorted = runStats.sort((a, b) => b.stats.
|
|
95
|
+
const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
|
|
168
96
|
|
|
169
97
|
let isSignificant = false
|
|
170
98
|
const first = sorted[0]
|
|
171
99
|
const second = sorted[1]
|
|
172
100
|
if (first && second) {
|
|
173
|
-
isSignificant = first.stats.
|
|
101
|
+
isSignificant = first.stats.ci[0] > second.stats.ci[1]
|
|
174
102
|
}
|
|
175
103
|
|
|
176
104
|
return {
|
|
177
105
|
rankings: sorted.map((s, i) => ({
|
|
178
106
|
run: s.label,
|
|
179
107
|
rank: i + 1,
|
|
180
|
-
score: s.stats.
|
|
108
|
+
score: s.stats.median,
|
|
181
109
|
})),
|
|
182
110
|
reasoning: isSignificant
|
|
183
111
|
? `Winner "${first?.label}" is statistically significant (p<0.05)`
|
|
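The significance rule the reworked grader applies is worth spelling out: a winner is declared significant only when its interval clears the runner-up's entirely. A hedged sketch of that check, using the shared module above (scores invented):

```ts
import { bootstrap } from './bootstrap.ts'

const winner = bootstrap([0.9, 0.85, 0.95, 0.8])  // run A scores (invented)
const runnerUp = bootstrap([0.6, 0.5, 0.7, 0.55]) // run B scores (invented)

// Non-overlapping intervals ≈ significant at p < 0.05:
// the winner's lower bound must exceed the runner-up's upper bound.
const isSignificant = winner.ci[0] > runnerUp.ci[1]
```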
package/src/graders/tests/bootstrap.spec.ts
ADDED

@@ -0,0 +1,169 @@
+/**
+ * Unit tests for bootstrap sampling utilities.
+ */
+
+import { afterEach, describe, expect, test } from 'bun:test'
+import { bootstrap, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from '../bootstrap.ts'
+
+describe('bootstrap', () => {
+  describe('edge cases', () => {
+    test('returns {median: 0, ci: [0, 0]} for empty array', () => {
+      const result = bootstrap([])
+      expect(result.median).toBe(0)
+      expect(result.ci).toEqual([0, 0])
+    })
+
+    test('returns {median: value, ci: [value, value]} for single sample', () => {
+      const result = bootstrap([0.75])
+      expect(result.median).toBe(0.75)
+      expect(result.ci).toEqual([0.75, 0.75])
+    })
+
+    test('handles single sample of 0', () => {
+      const result = bootstrap([0])
+      expect(result.median).toBe(0)
+      expect(result.ci).toEqual([0, 0])
+    })
+
+    test('handles single sample of 1', () => {
+      const result = bootstrap([1])
+      expect(result.median).toBe(1)
+      expect(result.ci).toEqual([1, 1])
+    })
+  })
+
+  describe('confidence interval bounds', () => {
+    test('CI lower bound <= median <= CI upper bound', () => {
+      const samples = [0.5, 0.6, 0.7, 0.8, 0.9]
+      const result = bootstrap(samples, { iterations: 1000 })
+
+      expect(result.ci[0]).toBeLessThanOrEqual(result.median)
+      expect(result.median).toBeLessThanOrEqual(result.ci[1])
+    })
+
+    test('CI contains the true median for uniform samples', () => {
+      // For identical samples, CI should collapse to the value
+      const samples = [0.5, 0.5, 0.5, 0.5, 0.5]
+      const result = bootstrap(samples, { iterations: 1000 })
+
+      expect(result.median).toBeCloseTo(0.5, 2)
+      expect(result.ci[0]).toBeCloseTo(0.5, 2)
+      expect(result.ci[1]).toBeCloseTo(0.5, 2)
+    })
+
+    test('CI widens with more variance in samples', () => {
+      const lowVariance = [0.49, 0.5, 0.51]
+      const highVariance = [0.1, 0.5, 0.9]
+
+      const lowResult = bootstrap(lowVariance, { iterations: 1000 })
+      const highResult = bootstrap(highVariance, { iterations: 1000 })
+
+      const lowWidth = lowResult.ci[1] - lowResult.ci[0]
+      const highWidth = highResult.ci[1] - highResult.ci[0]
+
+      expect(highWidth).toBeGreaterThan(lowWidth)
+    })
+  })
+
+  describe('configuration', () => {
+    test('uses default iterations when not specified', () => {
+      // Just verify it runs without error with defaults
+      const result = bootstrap([0.5, 0.6, 0.7])
+      expect(result.median).toBeGreaterThan(0)
+    })
+
+    test('accepts custom iteration count', () => {
+      const result = bootstrap([0.5, 0.6, 0.7], { iterations: 100 })
+      expect(result.median).toBeGreaterThan(0)
+    })
+
+    test('accepts custom confidence level', () => {
+      const samples = [0.3, 0.4, 0.5, 0.6, 0.7]
+
+      // 90% CI should be narrower than 95% CI
+      const ci90 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.9 })
+      const ci95 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.95 })
+
+      const width90 = ci90.ci[1] - ci90.ci[0]
+      const width95 = ci95.ci[1] - ci95.ci[0]
+
+      // 95% CI should generally be wider than 90% CI
+      // Allow some tolerance due to randomness
+      expect(width95).toBeGreaterThanOrEqual(width90 * 0.8)
+    })
+  })
+
+  describe('statistical properties', () => {
+    test('median is close to sample mean', () => {
+      const samples = [0.2, 0.4, 0.6, 0.8, 1.0]
+      const sampleMean = samples.reduce((a, b) => a + b, 0) / samples.length
+
+      const result = bootstrap(samples, { iterations: 10000 })
+
+      // Bootstrap median should be close to sample mean for symmetric distributions
+      expect(result.median).toBeCloseTo(sampleMean, 1)
+    })
+
+    test('is deterministic-ish for large iteration counts', () => {
+      const samples = [0.3, 0.5, 0.7]
+
+      // With many iterations, results should be similar across runs
+      const result1 = bootstrap(samples, { iterations: 10000 })
+      const result2 = bootstrap(samples, { iterations: 10000 })
+
+      expect(result1.median).toBeCloseTo(result2.median, 1)
+    })
+  })
+})
+
+describe('getBootstrapConfigFromEnv', () => {
+  const originalEnv = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+
+  afterEach(() => {
+    if (originalEnv === undefined) {
+      delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
+    } else {
+      process.env.COMPARE_BOOTSTRAP_ITERATIONS = originalEnv
+    }
+  })
+
+  test('returns default iterations when env var not set', () => {
+    delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
+    const config = getBootstrapConfigFromEnv()
+    expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+  })
+
+  test('parses valid iteration count from env', () => {
+    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'
+    const config = getBootstrapConfigFromEnv()
+    expect(config.iterations).toBe(5000)
+  })
+
+  test('returns default for invalid (non-numeric) env value', () => {
+    process.env.COMPARE_BOOTSTRAP_ITERATIONS = 'invalid'
+    const config = getBootstrapConfigFromEnv()
+    expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+  })
+
+  test('returns default for iteration count below minimum (100)', () => {
+    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '50'
+    const config = getBootstrapConfigFromEnv()
+    expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+  })
+
+  test('accepts iteration count at minimum (100)', () => {
+    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '100'
+    const config = getBootstrapConfigFromEnv()
+    expect(config.iterations).toBe(100)
+  })
+})
+
+describe('constants', () => {
+  test('DEFAULT_ITERATIONS is 1000', () => {
+    expect(DEFAULT_ITERATIONS).toBe(1000)
+  })
+
+  test('DEFAULT_CONFIDENCE_LEVEL is 0.95', () => {
+    expect(DEFAULT_CONFIDENCE_LEVEL).toBe(0.95)
+  })
+})
package/src/graders/trials-compare-statistical.ts
CHANGED

@@ -20,9 +20,7 @@ import type {
   TrialsComparisonGrader,
   TrialsComparisonGraderInput,
 } from '../pipeline/pipeline.types.ts'
-
-/** Default number of bootstrap iterations */
-const DEFAULT_ITERATIONS = 1000
+import { DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from './bootstrap.ts'

 /**
  * Bootstrap confidence interval result.
@@ -82,16 +80,13 @@ const bootstrapPassAtK = (trials: number[], k: number, iterations: number): Boot
 }

 /**
- * Get bootstrap iterations from environment
+ * Get bootstrap iterations from environment or use default.
  *
  * @returns Number of bootstrap iterations
  */
-const
-const
-
-
-  const parsed = Number.parseInt(envValue, 10)
-  return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+const getIterations = (): number => {
+  const config = getBootstrapConfigFromEnv()
+  return config.iterations ?? DEFAULT_ITERATIONS
 }

 /**
@@ -109,7 +104,7 @@ const getIterationsFromEnv = (): number => {
 export const grade: TrialsComparisonGrader = async ({
   runs,
 }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
-  const iterations =
+  const iterations = getIterations()

   // Collect pass/fail outcomes for each run
   const runStats = Object.entries(runs).map(([label, run]) => {
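Since the trials grader now routes its iteration count through the shared env reader, the resample count can be raised per invocation. A small sketch of the clamping behavior (values below 100 fall back to the default 1000):

```ts
import { getBootstrapConfigFromEnv } from './bootstrap.ts'

process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'
console.log(getBootstrapConfigFromEnv().iterations) // 5000

process.env.COMPARE_BOOTSTRAP_ITERATIONS = '50'
console.log(getBootstrapConfigFromEnv().iterations) // 1000 (below the 100 minimum)
```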
package/src/pipeline/compare-trials.ts
CHANGED

@@ -16,6 +16,7 @@
  */

 import { logProgress, writeOutput } from '../core.ts'
+import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
 import { grade as statisticalGrade } from '../graders/trials-compare-statistical.ts'
 import { grade as weightedGrade } from '../graders/trials-compare-weighted.ts'
 import type {
@@ -407,6 +408,34 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
     flakiness[label] = computeFlakinessMetrics(results)
   }

+  // Compute confidence intervals when using statistical strategy
+  if (strategy === 'statistical') {
+    const bootstrapConfig = getBootstrapConfigFromEnv()
+
+    for (const label of runLabels) {
+      const resultsMap = runResults[label] ?? new Map()
+      const results = [...resultsMap.values()]
+      const passAtKValues = results.map((r) => r.passAtK ?? 0)
+      const passExpKValues = results.map((r) => r.passExpK ?? 0)
+
+      // Capability CIs
+      const capabilityMetrics = capability[label]
+      if (capabilityMetrics) {
+        capabilityMetrics.confidenceIntervals = {
+          avgPassAtK: bootstrap(passAtKValues, bootstrapConfig).ci,
+        }
+      }
+
+      // Reliability CIs
+      const reliabilityMetrics = reliability[label]
+      if (reliabilityMetrics) {
+        reliabilityMetrics.confidenceIntervals = {
+          avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
+        }
+      }
+    }
+  }
+
   // Compute pairwise comparisons
   const capabilityPairwise: PairwiseComparison[] = []
   const reliabilityPairwise: PairwiseComparison[] = []
@@ -531,27 +560,52 @@ const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string =>
   lines.push(`Prompts: ${report.meta.promptCount} | Trials per prompt: ${report.meta.trialsPerPrompt}`)
   lines.push('')

+  // Check if any run has confidence intervals (statistical strategy was used)
+  const hasCIs = Object.values(report.capability).some((c) => c.confidenceIntervals)
+
   // Capability table
   lines.push('## Capability (passAtK)')
   lines.push('')
-
-
-
-
-
-
+  if (hasCIs) {
+    lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|--------|-----|-----|')
+    for (const [label, c] of Object.entries(report.capability)) {
+      const avgCI = formatCI(c.confidenceIntervals?.avgPassAtK)
+      lines.push(
+        `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${avgCI} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | Avg | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|-----|-----|')
+    for (const [label, c] of Object.entries(report.capability)) {
+      lines.push(
+        `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
+      )
+    }
   }
   lines.push('')

   // Reliability table
   lines.push('## Reliability (passExpK)')
   lines.push('')
-
-
-
-
-
-
+  if (hasCIs) {
+    lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|--------|-----|-----|')
+    for (const [label, r] of Object.entries(report.reliability)) {
+      const avgCI = formatCI(r.confidenceIntervals?.avgPassExpK)
+      lines.push(
+        `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${avgCI} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | Avg | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|-----|-----|')
+    for (const [label, r] of Object.entries(report.reliability)) {
+      lines.push(
+        `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
+      )
+    }
   }
   lines.push('')
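With the statistical strategy, the capability and reliability tables gain a `95% CI` column. An illustrative rendering of the new output (spacing and numbers invented):

```
## Capability (passAtK)

| Run    | Avg   | 95% CI         | Median | P25   | P75   |
|--------|-------|----------------|--------|-------|-------|
| agent1 | 0.875 | [0.850, 0.900] | 0.875  | 0.863 | 0.888 |
| agent2 | 0.550 | [0.500, 0.600] | 0.550  | 0.525 | 0.575 |
```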
package/src/pipeline/compare.ts
CHANGED

@@ -25,6 +25,7 @@
 import { basename, extname } from 'node:path'
 import { parseArgs } from 'node:util'
 import { buildResultsIndex, logProgress, writeOutput } from '../core.ts'
+import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
 import { grade as statisticalGrade } from '../graders/compare-statistical.ts'
 import { grade as weightedGrade } from '../graders/compare-weighted.ts'
 import type {
@@ -471,6 +472,36 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
     }
   }

+  // Compute confidence intervals when using statistical strategy
+  if (strategy === 'statistical') {
+    const bootstrapConfig = getBootstrapConfigFromEnv()
+
+    for (const label of runLabels) {
+      const resultsMap = runResults[label] ?? new Map()
+      const results = [...resultsMap.values()]
+      const scores = results.map((r) => r.score?.score ?? 0)
+      const passes = results.map((r) => (r.score?.pass === true ? 1 : 0))
+      const latencies = results.map((r) => r.timing?.total ?? 0)
+
+      // Quality CIs
+      const qualityMetrics = quality[label]
+      if (qualityMetrics) {
+        qualityMetrics.confidenceIntervals = {
+          avgScore: bootstrap(scores, bootstrapConfig).ci,
+          passRate: bootstrap(passes, bootstrapConfig).ci,
+        }
+      }
+
+      // Performance CIs
+      const performanceMetrics = performance[label]
+      if (performanceMetrics) {
+        performanceMetrics.confidenceIntervals = {
+          latencyMean: bootstrap(latencies, bootstrapConfig).ci,
+        }
+      }
+    }
+  }
+
   // Trajectory info
   const trajectoryInfo: Record<string, TrajectoryInfo> = {}
   for (const label of runLabels) {
@@ -586,27 +617,53 @@ const formatReportAsMarkdown = (report: ComparisonReport): string => {
   lines.push(`Prompts: ${report.meta.promptCount} total, ${report.meta.promptsWithAllRuns} with all runs`)
   lines.push('')

+  // Check if any run has confidence intervals (statistical strategy was used)
+  const hasCIs = Object.values(report.quality).some((q) => q.confidenceIntervals)
+
   // Quality table
   lines.push('## Quality')
   lines.push('')
-
-
-
-
-
-
+  if (hasCIs) {
+    lines.push('| Run | Avg Score | 95% CI | Pass Rate | 95% CI | Pass | Fail |')
+    lines.push('|-----|-----------|--------|-----------|--------|------|------|')
+    for (const [label, q] of Object.entries(report.quality)) {
+      const avgScoreCI = formatCI(q.confidenceIntervals?.avgScore)
+      const passRateCI = formatCI(q.confidenceIntervals?.passRate)
+      lines.push(
+        `| ${label} | ${q.avgScore.toFixed(3)} | ${avgScoreCI} | ${(q.passRate * 100).toFixed(1)}% | ${passRateCI} | ${q.passCount} | ${q.failCount} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |')
+    lines.push('|-----|-----------|-----------|------|------|')
+    for (const [label, q] of Object.entries(report.quality)) {
+      lines.push(
+        `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`,
+      )
+    }
   }
   lines.push('')

   // Performance table
   lines.push('## Performance')
   lines.push('')
-
-
-
-
-
-
+  if (hasCIs) {
+    lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI |')
+    lines.push('|-----|----------|----------|----------|-----------|--------|')
+    for (const [label, p] of Object.entries(report.performance)) {
+      const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0)
+      lines.push(
+        `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |')
+    lines.push('|-----|----------|----------|----------|-----------|')
+    for (const [label, p] of Object.entries(report.performance)) {
+      lines.push(
+        `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`,
+      )
+    }
   }
   lines.push('')
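End to end, the statistical strategy is driven the same way as before; only the strategy flag and the resulting `confidenceIntervals` fields change. A minimal sketch mirroring the tests below (labels and paths invented):

```ts
import { runCompare } from './compare.ts'

const report = await runCompare({
  runs: [
    { label: 'baseline', path: 'runs/baseline.jsonl' },
    { label: 'candidate', path: 'runs/candidate.jsonl' },
  ],
  strategy: 'statistical',
  progress: false,
})

// Populated only when strategy === 'statistical'.
console.log(report.quality.baseline?.confidenceIntervals?.avgScore) // e.g. [0.71, 0.86]
console.log(report.performance.baseline?.confidenceIntervals?.latencyMean)
```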
package/src/pipeline/tests/compare-statistical.spec.ts
ADDED

@@ -0,0 +1,281 @@
+/**
+ * Integration tests for compare command statistical strategy.
+ *
+ * @remarks
+ * Tests verify confidence interval computation for the statistical strategy
+ * in the compare command with CaptureResult format.
+ *
+ * @packageDocumentation
+ */
+
+import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
+import type { CaptureResult } from '../../schemas.ts'
+import { runCompare } from '../compare.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const createCaptureResult = (id: string, score: number, pass: boolean, duration: number = 1000): CaptureResult => ({
+  id,
+  input: `Prompt for ${id}`,
+  output: `Output for ${id}`,
+  trajectory: [{ type: 'message', content: `Output for ${id}`, timestamp: Date.now() }],
+  metadata: {},
+  timing: {
+    start: Date.now(),
+    end: Date.now() + duration,
+    sessionCreation: 100,
+    total: duration,
+  },
+  toolErrors: false,
+  score: {
+    pass,
+    score,
+    reasoning: pass ? 'Passed' : 'Failed',
+  },
+})
+
+const tempDir = `${import.meta.dir}/.test-tmp/compare-statistical`
+
+beforeAll(async () => {
+  await Bun.$`mkdir -p ${tempDir}`
+})
+
+afterAll(async () => {
+  await Bun.$`rm -rf ${tempDir}`
+})
+
+// ============================================================================
+// Statistical Strategy CI Tests
+// ============================================================================
+
+describe('runCompare statistical strategy', () => {
+  test('computes confidence intervals for quality metrics', async () => {
+    const run1Path = `${tempDir}/ci-qual-run1.jsonl`
+    const run2Path = `${tempDir}/ci-qual-run2.jsonl`
+
+    // Create multiple prompts with varying scores for meaningful CI computation
+    const results1 = [
+      createCaptureResult('p1', 0.9, true, 1000),
+      createCaptureResult('p2', 0.85, true, 1100),
+      createCaptureResult('p3', 0.95, true, 900),
+      createCaptureResult('p4', 0.8, true, 1200),
+    ]
+    const results2 = [
+      createCaptureResult('p1', 0.6, false, 2000),
+      createCaptureResult('p2', 0.5, false, 2100),
+      createCaptureResult('p3', 0.7, true, 1900),
+      createCaptureResult('p4', 0.55, false, 2200),
+    ]
+
+    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+    const report = await runCompare({
+      runs: [
+        { label: 'high', path: run1Path },
+        { label: 'low', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Verify confidence intervals are computed for quality
+    const highQuality = report.quality.high
+    expect(highQuality).toBeDefined()
+    expect(highQuality?.confidenceIntervals).toBeDefined()
+    expect(highQuality?.confidenceIntervals?.avgScore).toBeDefined()
+    expect(highQuality?.confidenceIntervals?.passRate).toBeDefined()
+
+    // avgScore CI should be a tuple [lower, upper]
+    const avgScoreCI = highQuality?.confidenceIntervals?.avgScore
+    expect(avgScoreCI).toHaveLength(2)
+    expect(avgScoreCI?.[0]).toBeLessThanOrEqual(avgScoreCI?.[1] ?? 0)
+
+    // CI should contain the average (within reasonable bounds)
+    expect(avgScoreCI?.[0]).toBeLessThanOrEqual(highQuality?.avgScore ?? 0)
+    expect(avgScoreCI?.[1]).toBeGreaterThanOrEqual(highQuality?.avgScore ?? 1)
+
+    // passRate CI should also be valid
+    const passRateCI = highQuality?.confidenceIntervals?.passRate
+    expect(passRateCI).toHaveLength(2)
+    expect(passRateCI?.[0]).toBeLessThanOrEqual(passRateCI?.[1] ?? 0)
+  })
+
+  test('computes confidence intervals for performance metrics', async () => {
+    const run1Path = `${tempDir}/ci-perf-run1.jsonl`
+    const run2Path = `${tempDir}/ci-perf-run2.jsonl`
+
+    // Create results with varying latencies
+    const results1 = [
+      createCaptureResult('p1', 0.9, true, 1000),
+      createCaptureResult('p2', 0.85, true, 1100),
+      createCaptureResult('p3', 0.95, true, 900),
+      createCaptureResult('p4', 0.8, true, 1050),
+    ]
+    const results2 = [
+      createCaptureResult('p1', 0.7, true, 2000),
+      createCaptureResult('p2', 0.65, true, 2200),
+      createCaptureResult('p3', 0.75, true, 1800),
+      createCaptureResult('p4', 0.6, true, 2100),
+    ]
+
+    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+    const report = await runCompare({
+      runs: [
+        { label: 'fast', path: run1Path },
+        { label: 'slow', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Verify confidence intervals are computed for performance
+    const fastPerf = report.performance.fast
+    expect(fastPerf).toBeDefined()
+    expect(fastPerf?.confidenceIntervals).toBeDefined()
+    expect(fastPerf?.confidenceIntervals?.latencyMean).toBeDefined()
+
+    // latencyMean CI should be a tuple [lower, upper]
+    const latencyCI = fastPerf?.confidenceIntervals?.latencyMean
+    expect(latencyCI).toHaveLength(2)
+    expect(latencyCI?.[0]).toBeLessThanOrEqual(latencyCI?.[1] ?? 0)
+
+    // Fast run should have lower latency CI than slow run
+    const slowPerf = report.performance.slow
+    const slowLatencyCI = slowPerf?.confidenceIntervals?.latencyMean
+    expect(latencyCI?.[1]).toBeLessThan(slowLatencyCI?.[0] ?? 0)
+  })
+
+  test('weighted strategy does not compute confidence intervals', async () => {
+    const run1Path = `${tempDir}/no-ci-run1.jsonl`
+    const run2Path = `${tempDir}/no-ci-run2.jsonl`
+
+    const results1 = [createCaptureResult('p1', 0.9, true), createCaptureResult('p2', 0.85, true)]
+    const results2 = [createCaptureResult('p1', 0.6, false), createCaptureResult('p2', 0.5, false)]
+
+    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+    const report = await runCompare({
+      runs: [
+        { label: 'run1', path: run1Path },
+        { label: 'run2', path: run2Path },
+      ],
+      strategy: 'weighted', // Default strategy
+      progress: false,
+    })
+
+    // Confidence intervals should NOT be present for weighted strategy
+    const quality = report.quality.run1
+    expect(quality?.confidenceIntervals).toBeUndefined()
+
+    const perf = report.performance.run1
+    expect(perf?.confidenceIntervals).toBeUndefined()
+  })
+
+  test('statistical strategy includes CIs in markdown output', async () => {
+    const run1Path = `${tempDir}/ci-md-run1.jsonl`
+    const run2Path = `${tempDir}/ci-md-run2.jsonl`
+    const outputPath = `${tempDir}/ci-report.md`
+
+    const results1 = [createCaptureResult('p1', 0.9, true, 1000), createCaptureResult('p2', 0.85, true, 1100)]
+    const results2 = [createCaptureResult('p1', 0.6, false, 2000), createCaptureResult('p2', 0.5, false, 2100)]
+
+    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+    await runCompare({
+      runs: [
+        { label: 'agent1', path: run1Path },
+        { label: 'agent2', path: run2Path },
+      ],
+      strategy: 'statistical',
+      outputPath,
+      format: 'markdown',
+      progress: false,
+    })
+
+    const content = await Bun.file(outputPath).text()
+
+    // Markdown should include 95% CI column headers
+    expect(content).toContain('95% CI')
+    // Should contain CI values in bracket format [lower, upper]
+    expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/)
+  })
+
+  test('handles single sample gracefully with degenerate CI', async () => {
+    const run1Path = `${tempDir}/single-run1.jsonl`
+    const run2Path = `${tempDir}/single-run2.jsonl`
+
+    // Single sample per run
+    const result1 = createCaptureResult('p1', 0.9, true)
+    const result2 = createCaptureResult('p1', 0.5, false)
+
+    await Bun.write(run1Path, JSON.stringify(result1))
+    await Bun.write(run2Path, JSON.stringify(result2))
+
+    const report = await runCompare({
+      runs: [
+        { label: 'single1', path: run1Path },
+        { label: 'single2', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Should still compute CIs (they will be degenerate for single sample)
+    const quality = report.quality.single1
+    expect(quality?.confidenceIntervals).toBeDefined()
+    expect(quality?.confidenceIntervals?.avgScore).toBeDefined()
+
+    // For single sample, CI should collapse to the value
+    const ci = quality?.confidenceIntervals?.avgScore
+    expect(ci?.[0]).toBeCloseTo(ci?.[1] ?? 0, 2)
+    expect(ci?.[0]).toBeCloseTo(quality?.avgScore ?? 0, 2)
+  })
+
+  test('JSON output includes confidence intervals structure', async () => {
+    const run1Path = `${tempDir}/json-ci-run1.jsonl`
+    const run2Path = `${tempDir}/json-ci-run2.jsonl`
+    const outputPath = `${tempDir}/ci-report.json`
+
+    const results1 = [
+      createCaptureResult('p1', 0.9, true),
+      createCaptureResult('p2', 0.85, true),
+      createCaptureResult('p3', 0.95, true),
+    ]
+    const results2 = [
+      createCaptureResult('p1', 0.6, false),
+      createCaptureResult('p2', 0.5, false),
+      createCaptureResult('p3', 0.7, true),
+    ]
+
+    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+    await runCompare({
+      runs: [
+        { label: 'high', path: run1Path },
+        { label: 'low', path: run2Path },
+      ],
+      strategy: 'statistical',
+      outputPath,
+      format: 'json',
+      progress: false,
+    })
+
+    const content = await Bun.file(outputPath).text()
+    const parsed = JSON.parse(content)
+
+    // Verify JSON structure includes confidenceIntervals
+    expect(parsed.quality.high.confidenceIntervals).toBeDefined()
+    expect(parsed.quality.high.confidenceIntervals.avgScore).toBeInstanceOf(Array)
+    expect(parsed.quality.high.confidenceIntervals.avgScore.length).toBe(2)
+    expect(parsed.performance.high.confidenceIntervals).toBeDefined()
+    expect(parsed.performance.high.confidenceIntervals.latencyMean).toBeInstanceOf(Array)
+  })
+})
package/src/pipeline/tests/compare-trials.spec.ts
CHANGED

@@ -210,6 +210,146 @@ describe('runTrialsCompare', () => {
     expect(report.meta.runs).toEqual(['better', 'worse'])
   })

+  test('statistical strategy computes confidence intervals for capability metrics', async () => {
+    const run1Path = `${tempDir}/ci-cap-run1.jsonl`
+    const run2Path = `${tempDir}/ci-cap-run2.jsonl`
+
+    // Create multiple prompts for meaningful CI computation
+    const trials1 = [
+      createTrialResult('p1', 0.9, 0.8),
+      createTrialResult('p2', 0.85, 0.7),
+      createTrialResult('p3', 0.95, 0.9),
+    ]
+    const trials2 = [
+      createTrialResult('p1', 0.6, 0.4),
+      createTrialResult('p2', 0.5, 0.3),
+      createTrialResult('p3', 0.7, 0.5),
+    ]
+
+    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'high', path: run1Path },
+        { label: 'low', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Verify confidence intervals are computed for capability
+    const highCap = report.capability.high
+    expect(highCap).toBeDefined()
+    expect(highCap?.confidenceIntervals).toBeDefined()
+    expect(highCap?.confidenceIntervals?.avgPassAtK).toBeDefined()
+
+    // CI should be a tuple [lower, upper]
+    const ci = highCap?.confidenceIntervals?.avgPassAtK
+    expect(ci).toHaveLength(2)
+    expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0)
+
+    // CI should contain the average (within reasonable bounds)
+    expect(ci?.[0]).toBeLessThanOrEqual(highCap?.avgPassAtK ?? 0)
+    expect(ci?.[1]).toBeGreaterThanOrEqual(highCap?.avgPassAtK ?? 1)
+  })
+
+  test('statistical strategy computes confidence intervals for reliability metrics', async () => {
+    const run1Path = `${tempDir}/ci-rel-run1.jsonl`
+    const run2Path = `${tempDir}/ci-rel-run2.jsonl`
+
+    const trials1 = [
+      createTrialResult('p1', 0.9, 0.85),
+      createTrialResult('p2', 0.8, 0.75),
+      createTrialResult('p3', 0.85, 0.8),
+    ]
+    const trials2 = [
+      createTrialResult('p1', 0.7, 0.3),
+      createTrialResult('p2', 0.6, 0.2),
+      createTrialResult('p3', 0.65, 0.25),
+    ]
+
+    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'reliable', path: run1Path },
+        { label: 'flaky', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Verify confidence intervals are computed for reliability
+    const reliableRel = report.reliability.reliable
+    expect(reliableRel).toBeDefined()
+    expect(reliableRel?.confidenceIntervals).toBeDefined()
+    expect(reliableRel?.confidenceIntervals?.avgPassExpK).toBeDefined()
+
+    // CI should be a tuple [lower, upper]
+    const ci = reliableRel?.confidenceIntervals?.avgPassExpK
+    expect(ci).toHaveLength(2)
+    expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0)
+  })
+
+  test('weighted strategy does not compute confidence intervals', async () => {
+    const run1Path = `${tempDir}/no-ci-run1.jsonl`
+    const run2Path = `${tempDir}/no-ci-run2.jsonl`
+
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.5, 0.3)
+
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'run1', path: run1Path },
+        { label: 'run2', path: run2Path },
+      ],
+      strategy: 'weighted', // Default strategy
+      progress: false,
+    })
+
+    // Confidence intervals should NOT be present for weighted strategy
+    const cap = report.capability.run1
+    expect(cap?.confidenceIntervals).toBeUndefined()
+
+    const rel = report.reliability.run1
+    expect(rel?.confidenceIntervals).toBeUndefined()
+  })
+
+  test('statistical strategy includes CIs in markdown output', async () => {
+    const run1Path = `${tempDir}/ci-md-run1.jsonl`
+    const run2Path = `${tempDir}/ci-md-run2.jsonl`
+    const outputPath = `${tempDir}/ci-report.md`
+
+    const trials1 = [createTrialResult('p1', 0.9, 0.8), createTrialResult('p2', 0.85, 0.75)]
+    const trials2 = [createTrialResult('p1', 0.6, 0.4), createTrialResult('p2', 0.5, 0.3)]
+
+    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+    await runTrialsCompare({
+      runs: [
+        { label: 'agent1', path: run1Path },
+        { label: 'agent2', path: run2Path },
+      ],
+      strategy: 'statistical',
+      outputPath,
+      format: 'markdown',
+      progress: false,
+    })
+
+    const content = await Bun.file(outputPath).text()
+
+    // Markdown should include 95% CI column headers
+    expect(content).toContain('95% CI')
+    // Should contain CI values in bracket format [lower, upper]
+    expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/)
+  })
+
   test('computes correct capability metrics', async () => {
     const run1Path = `${tempDir}/cap-run1.jsonl`
package/src/schemas/schemas.ts
CHANGED

@@ -573,6 +573,17 @@ export type ValidationResult = z.infer<typeof ValidationResultSchema>
 // Comparison Report Schemas
 // ============================================================================

+/**
+ * Confidence interval schema as [lower, upper] bounds.
+ *
+ * @remarks
+ * Used for bootstrap-computed confidence intervals when strategy=statistical.
+ */
+export const ConfidenceIntervalSchema = z.tuple([z.number(), z.number()])
+
+/** Confidence interval type */
+export type ConfidenceInterval = z.infer<typeof ConfidenceIntervalSchema>
+
 /**
  * Score distribution histogram for quality analysis.
  *
@@ -590,6 +601,19 @@ export const ScoreDistributionSchema = z.object({
 /** Score distribution type */
 export type ScoreDistribution = z.infer<typeof ScoreDistributionSchema>

+/**
+ * Confidence intervals for quality metrics.
+ */
+export const QualityConfidenceIntervalsSchema = z.object({
+  /** CI for avgScore */
+  avgScore: ConfidenceIntervalSchema.optional(),
+  /** CI for passRate */
+  passRate: ConfidenceIntervalSchema.optional(),
+})
+
+/** Quality confidence intervals type */
+export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceIntervalsSchema>
+
 /**
  * Quality metrics for a single run in comparison.
  */
@@ -604,6 +628,8 @@ export const QualityMetricsSchema = z.object({
   failCount: z.number(),
   /** Score distribution histogram */
   scoreDistribution: ScoreDistributionSchema,
+  /** Confidence intervals (only with strategy=statistical) */
+  confidenceIntervals: QualityConfidenceIntervalsSchema.optional(),
 })

 /** Quality metrics type */
@@ -630,6 +656,17 @@ export const LatencyStatsSchema = z.object({
 /** Latency stats type */
 export type LatencyStats = z.infer<typeof LatencyStatsSchema>

+/**
+ * Confidence intervals for performance metrics.
+ */
+export const PerformanceConfidenceIntervalsSchema = z.object({
+  /** CI for latency mean */
+  latencyMean: ConfidenceIntervalSchema.optional(),
+})
+
+/** Performance confidence intervals type */
+export type PerformanceConfidenceIntervals = z.infer<typeof PerformanceConfidenceIntervalsSchema>
+
 /**
  * Performance metrics for a single run in comparison.
  */
@@ -640,6 +677,8 @@ export const PerformanceMetricsSchema = z.object({
   firstResponse: LatencyStatsSchema.optional(),
   /** Sum of all run durations in milliseconds */
   totalDuration: z.number(),
+  /** Confidence intervals (only with strategy=statistical) */
+  confidenceIntervals: PerformanceConfidenceIntervalsSchema.optional(),
 })

 /** Performance metrics type */
@@ -782,6 +821,17 @@ export type ComparisonReport = z.infer<typeof ComparisonReportSchema>
 // Trials Comparison Report Schemas
 // ============================================================================

+/**
+ * Confidence intervals for trials capability metrics.
+ */
+export const TrialsCapabilityConfidenceIntervalsSchema = z.object({
+  /** CI for avgPassAtK */
+  avgPassAtK: ConfidenceIntervalSchema.optional(),
+})
+
+/** Trials capability confidence intervals type */
+export type TrialsCapabilityConfidenceIntervals = z.infer<typeof TrialsCapabilityConfidenceIntervalsSchema>
+
 /**
  * Capability metrics for trials comparison (passAtK-based).
  *
@@ -798,11 +848,24 @@ export const TrialsCapabilityMetricsSchema = z.object({
   p25PassAtK: z.number(),
   /** 75th percentile passAtK */
   p75PassAtK: z.number(),
+  /** Confidence intervals (only with strategy=statistical) */
+  confidenceIntervals: TrialsCapabilityConfidenceIntervalsSchema.optional(),
 })

 /** Trials capability metrics type */
 export type TrialsCapabilityMetrics = z.infer<typeof TrialsCapabilityMetricsSchema>

+/**
+ * Confidence intervals for trials reliability metrics.
+ */
+export const TrialsReliabilityConfidenceIntervalsSchema = z.object({
+  /** CI for avgPassExpK */
+  avgPassExpK: ConfidenceIntervalSchema.optional(),
+})
+
+/** Trials reliability confidence intervals type */
+export type TrialsReliabilityConfidenceIntervals = z.infer<typeof TrialsReliabilityConfidenceIntervalsSchema>
+
 /**
  * Reliability metrics for trials comparison (passExpK-based).
  *
@@ -819,6 +882,8 @@ export const TrialsReliabilityMetricsSchema = z.object({
   p25PassExpK: z.number(),
   /** 75th percentile passExpK */
   p75PassExpK: z.number(),
+  /** Confidence intervals (only with strategy=statistical) */
+  confidenceIntervals: TrialsReliabilityConfidenceIntervalsSchema.optional(),
 })

 /** Trials reliability metrics type */