@plaited/agent-eval-harness 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -3
- package/src/graders/bootstrap.ts +135 -0
- package/src/graders/compare-statistical.ts +14 -86
- package/src/graders/tests/bootstrap.spec.ts +169 -0
- package/src/graders/trials-compare-statistical.ts +6 -11
- package/src/pipeline/compare-trials.ts +68 -13
- package/src/pipeline/compare.ts +70 -12
- package/src/pipeline/tests/compare-statistical.spec.ts +285 -0
- package/src/pipeline/tests/compare-trials.spec.ts +143 -0
- package/src/schemas/schemas.ts +69 -0
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.8.0",
+  "version": "0.9.0",
   "description": "CLI tool for capturing agent trajectories from headless CLI agents",
   "license": "ISC",
   "engines": {
@@ -42,8 +42,10 @@
     "check:types": "tsc --noEmit",
     "check:write": "biome check --write && format-package --write",
     "prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
-    "test": "bun test
-    "test:
+    "test": "bun run test:bin && bun test:src",
+    "test:bin": "bun test bin/tests/*.spec.ts",
+    "test:integration": "bun test ./**/integration_tests/*.spec.ts",
+    "test:src": "bun test src/**/tests/*.spec.ts"
   },
   "lint-staged": {
     "*.{js,cjs,jsx,tsx,ts}": [
package/src/graders/bootstrap.ts
ADDED

@@ -0,0 +1,135 @@
+/**
+ * Shared bootstrap sampling utilities for confidence interval computation.
+ *
+ * @remarks
+ * Bootstrap resampling provides robust confidence intervals without
+ * assuming a specific distribution. For small samples, it's more
+ * reliable than parametric methods.
+ *
+ * Environment variable configuration:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
+ *
+ * @packageDocumentation
+ */
+
+/** Default number of bootstrap iterations */
+export const DEFAULT_ITERATIONS = 1000
+
+/** Default confidence level (95%) */
+export const DEFAULT_CONFIDENCE_LEVEL = 0.95
+
+/**
+ * Confidence interval as [lower, upper] bounds.
+ */
+export type ConfidenceInterval = [number, number]
+
+/**
+ * Bootstrap confidence interval result.
+ */
+export type BootstrapResult = {
+  /** Median of bootstrap sample means (50th percentile) */
+  median: number
+  /** Confidence interval [lower, upper] */
+  ci: ConfidenceInterval
+}
+
+/**
+ * Configuration for bootstrap sampling.
+ */
+export type BootstrapConfig = {
+  /** Number of bootstrap iterations (default: 1000) */
+  iterations?: number
+  /** Confidence level between 0 and 1 (default: 0.95) */
+  confidenceLevel?: number
+}
+
+/**
+ * Compute bootstrap confidence interval for sample mean.
+ *
+ * @remarks
+ * Bootstrap resampling provides robust confidence intervals without
+ * assuming a specific distribution. For small samples, it's more
+ * reliable than parametric methods.
+ *
+ * @param samples - Array of numeric samples
+ * @param config - Optional bootstrap configuration
+ * @returns Bootstrap median and confidence interval
+ *
+ * @public
+ */
+export const bootstrap = (samples: number[], config?: BootstrapConfig): BootstrapResult => {
+  const iterations = config?.iterations ?? DEFAULT_ITERATIONS
+  const confidenceLevel = config?.confidenceLevel ?? DEFAULT_CONFIDENCE_LEVEL
+
+  if (samples.length === 0) {
+    return { median: 0, ci: [0, 0] }
+  }
+
+  if (samples.length === 1) {
+    const value = samples[0] ?? 0
+    return { median: value, ci: [value, value] }
+  }
+
+  const means: number[] = []
+
+  for (let i = 0; i < iterations; i++) {
+    // Resample with replacement - we know samples.length > 1 at this point
+    const resampled = Array.from(
+      { length: samples.length },
+      () => samples[Math.floor(Math.random() * samples.length)] as number,
+    )
+
+    // Compute mean of resampled data
+    const sum = resampled.reduce((acc, val) => acc + val, 0)
+    means.push(sum / resampled.length)
+  }
+
+  // Sort means for percentile calculation
+  means.sort((a, b) => a - b)
+
+  // Compute percentile indices based on confidence level
+  // For 95% CI: lower = 2.5th percentile, upper = 97.5th percentile
+  const alpha = (1 - confidenceLevel) / 2
+  const lowerIdx = Math.floor(iterations * alpha)
+  const upperIdx = Math.floor(iterations * (1 - alpha))
+
+  return {
+    median: means[Math.floor(iterations / 2)] ?? 0,
+    ci: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
+  }
+}
+
+/**
+ * Format confidence interval as string.
+ *
+ * @param ci - Confidence interval [lower, upper]
+ * @param decimals - Number of decimal places (default: 3)
+ * @returns Formatted CI string or empty string if undefined
+ *
+ * @public
+ */
+export const formatCI = (ci: ConfidenceInterval | undefined, decimals: number = 3): string => {
+  if (!ci) return ''
+  return `[${ci[0].toFixed(decimals)}, ${ci[1].toFixed(decimals)}]`
+}
+
+/**
+ * Get bootstrap configuration from environment variables.
+ *
+ * @remarks
+ * Reads configuration from:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS`: Number of iterations (min: 100)
+ *
+ * @returns Bootstrap configuration
+ *
+ * @public
+ */
+export const getBootstrapConfigFromEnv = (): BootstrapConfig => {
+  const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+  if (!envValue) return { iterations: DEFAULT_ITERATIONS }
+
+  const parsed = Number.parseInt(envValue, 10)
+  const iterations = Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+
+  return { iterations }
+}
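For orientation, a minimal usage sketch of the new shared module (the sample scores are invented, and the relative import path assumes a sibling module, mirroring how the graders below consume it):

import { bootstrap, formatCI, getBootstrapConfigFromEnv } from './bootstrap.ts'

// Hypothetical per-prompt scores
const scores = [0.72, 0.85, 0.9, 0.64, 0.78]

// Honors COMPARE_BOOTSTRAP_ITERATIONS when set (min 100), else defaults to 1000
const config = getBootstrapConfigFromEnv()

const { median, ci } = bootstrap(scores, config)
console.log(`median=${median.toFixed(3)} 95% CI ${formatCI(ci)}`)
// e.g. "median=0.778 95% CI [0.702, 0.854]" - exact bounds vary per run

DEFAULT_CONFIDENCE_LEVEL (0.95) applies unless confidenceLevel is passed explicitly, so labels that say "95% CI" hold for the defaults.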
package/src/graders/compare-statistical.ts
CHANGED

@@ -12,81 +12,7 @@
  */
 
 import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
-
-/** Default number of bootstrap iterations */
-const DEFAULT_ITERATIONS = 1000
-
-/**
- * Bootstrap confidence interval result.
- */
-type BootstrapResult = {
-  /** Estimated mean from bootstrap */
-  mean: number
-  /** 95% confidence interval [lower, upper] */
-  ci95: [number, number]
-}
-
-/**
- * Compute bootstrap confidence interval for sample mean.
- *
- * @remarks
- * Bootstrap resampling provides robust confidence intervals without
- * assuming a specific distribution. For small samples, it's more
- * reliable than parametric methods.
- *
- * @param samples - Array of numeric samples
- * @param iterations - Number of bootstrap iterations
- * @returns Bootstrap mean and 95% confidence interval
- */
-const bootstrap = (samples: number[], iterations: number = DEFAULT_ITERATIONS): BootstrapResult => {
-  if (samples.length === 0) {
-    return { mean: 0, ci95: [0, 0] }
-  }
-
-  if (samples.length === 1) {
-    const value = samples[0] ?? 0
-    return { mean: value, ci95: [value, value] }
-  }
-
-  const means: number[] = []
-
-  for (let i = 0; i < iterations; i++) {
-    // Resample with replacement - we know samples.length > 1 at this point
-    const resampled = Array.from(
-      { length: samples.length },
-      () => samples[Math.floor(Math.random() * samples.length)] as number,
-    )
-
-    // Compute mean of resampled data
-    const sum = resampled.reduce((acc, val) => acc + val, 0)
-    means.push(sum / resampled.length)
-  }
-
-  // Sort means for percentile calculation
-  means.sort((a, b) => a - b)
-
-  // 95% CI: 2.5th and 97.5th percentiles
-  const lowerIdx = Math.floor(iterations * 0.025)
-  const upperIdx = Math.floor(iterations * 0.975)
-
-  return {
-    mean: means[Math.floor(iterations / 2)] ?? 0,
-    ci95: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
-  }
-}
-
-/**
- * Get bootstrap iterations from environment variable.
- *
- * @returns Number of bootstrap iterations
- */
-const getIterationsFromEnv = (): number => {
-  const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
-  if (!envValue) return DEFAULT_ITERATIONS
-
-  const parsed = Number.parseInt(envValue, 10)
-  return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
-}
+import { bootstrap, getBootstrapConfigFromEnv } from './bootstrap.ts'
 
 /**
  * Statistical significance comparison grader.
@@ -107,7 +33,7 @@ const getIterationsFromEnv = (): number => {
  * @public
  */
 export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
-  const iterations = getIterationsFromEnv()
+  const config = getBootstrapConfigFromEnv()
 
   // Collect scores for each run
   const runStats = Object.entries(runs).map(([label, run]) => {
@@ -116,13 +42,13 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
 
     // For single-prompt comparison, we only have one sample
     // In practice, this grader is most useful when aggregating across prompts
-    const stats = bootstrap([score], iterations)
+    const stats = bootstrap([score], config)
 
     return { label, score, stats }
   })
 
-  // Sort by bootstrap mean descending
-  const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
+  // Sort by bootstrap median descending
+  const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
 
   // Check if winner is statistically significant
   // CIs don't overlap = significant difference (approximately p<0.05)
@@ -131,7 +57,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
   const second = sorted[1]
   if (first && second) {
     // Non-overlapping: first's lower bound > second's upper bound
-    isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+    isSignificant = first.stats.ci[0] > second.stats.ci[1]
   }
 
   const reasoning = isSignificant
@@ -142,7 +68,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
     rankings: sorted.map((s, i) => ({
       run: s.label,
       rank: i + 1,
-      score: s.stats.mean,
+      score: s.stats.median,
     })),
     reasoning,
   }
@@ -156,28 +82,30 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
  *
  * @public
  */
-export const createStatisticalGrader = (iterations
+export const createStatisticalGrader = (iterations?: number): ComparisonGrader => {
+  const config = iterations ? { iterations } : getBootstrapConfigFromEnv()
+
   return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
     const runStats = Object.entries(runs).map(([label, run]) => {
       const score = run.score?.score ?? 0
-      const stats = bootstrap([score], iterations)
+      const stats = bootstrap([score], config)
       return { label, score, stats }
     })
 
-    const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
+    const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
 
     let isSignificant = false
     const first = sorted[0]
    const second = sorted[1]
     if (first && second) {
-      isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+      isSignificant = first.stats.ci[0] > second.stats.ci[1]
     }
 
     return {
       rankings: sorted.map((s, i) => ({
         run: s.label,
         rank: i + 1,
-        score: s.stats.mean,
+        score: s.stats.median,
       })),
       reasoning: isSignificant
         ? `Winner "${first?.label}" is statistically significant (p<0.05)`
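To make the significance rule concrete, a hand-worked sketch (the run labels and numbers are invented): when the winner's CI lower bound clears the runner-up's CI upper bound, the intervals do not overlap and the grader reports approximately p<0.05.

// Two runs' bootstrap results (hypothetical)
const first = { label: 'A', stats: { median: 0.85, ci: [0.8, 0.9] as [number, number] } }
const second = { label: 'B', stats: { median: 0.7, ci: [0.6, 0.79] as [number, number] } }

// 0.8 > 0.79, so the intervals do not overlap and A's win is flagged significant
const isSignificant = first.stats.ci[0] > second.stats.ci[1] // true
// Had B's CI been [0.6, 0.82], the intervals would overlap and no winner is declared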
package/src/graders/tests/bootstrap.spec.ts
ADDED

@@ -0,0 +1,169 @@
+/**
+ * Unit tests for bootstrap sampling utilities.
+ */
+
+import { afterEach, describe, expect, test } from 'bun:test'
+import { bootstrap, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from '../bootstrap.ts'
+
+describe('bootstrap', () => {
+  describe('edge cases', () => {
+    test('returns {median: 0, ci: [0, 0]} for empty array', () => {
+      const result = bootstrap([])
+      expect(result.median).toBe(0)
+      expect(result.ci).toEqual([0, 0])
+    })
+
+    test('returns {median: value, ci: [value, value]} for single sample', () => {
+      const result = bootstrap([0.75])
+      expect(result.median).toBe(0.75)
+      expect(result.ci).toEqual([0.75, 0.75])
+    })
+
+    test('handles single sample of 0', () => {
+      const result = bootstrap([0])
+      expect(result.median).toBe(0)
+      expect(result.ci).toEqual([0, 0])
+    })
+
+    test('handles single sample of 1', () => {
+      const result = bootstrap([1])
+      expect(result.median).toBe(1)
+      expect(result.ci).toEqual([1, 1])
+    })
+  })
+
+  describe('confidence interval bounds', () => {
+    test('CI lower bound <= median <= CI upper bound', () => {
+      const samples = [0.5, 0.6, 0.7, 0.8, 0.9]
+      const result = bootstrap(samples, { iterations: 1000 })
+
+      expect(result.ci[0]).toBeLessThanOrEqual(result.median)
+      expect(result.median).toBeLessThanOrEqual(result.ci[1])
+    })
+
+    test('CI contains the true median for uniform samples', () => {
+      // For identical samples, CI should collapse to the value
+      const samples = [0.5, 0.5, 0.5, 0.5, 0.5]
+      const result = bootstrap(samples, { iterations: 1000 })
+
+      expect(result.median).toBeCloseTo(0.5, 2)
+      expect(result.ci[0]).toBeCloseTo(0.5, 2)
+      expect(result.ci[1]).toBeCloseTo(0.5, 2)
+    })
+
+    test('CI widens with more variance in samples', () => {
+      const lowVariance = [0.49, 0.5, 0.51]
+      const highVariance = [0.1, 0.5, 0.9]
+
+      const lowResult = bootstrap(lowVariance, { iterations: 1000 })
+      const highResult = bootstrap(highVariance, { iterations: 1000 })
+
+      const lowWidth = lowResult.ci[1] - lowResult.ci[0]
+      const highWidth = highResult.ci[1] - highResult.ci[0]
+
+      expect(highWidth).toBeGreaterThan(lowWidth)
+    })
+  })
+
+  describe('configuration', () => {
+    test('uses default iterations when not specified', () => {
+      // Just verify it runs without error with defaults
+      const result = bootstrap([0.5, 0.6, 0.7])
+      expect(result.median).toBeGreaterThan(0)
+    })
+
+    test('accepts custom iteration count', () => {
+      const result = bootstrap([0.5, 0.6, 0.7], { iterations: 100 })
+      expect(result.median).toBeGreaterThan(0)
+    })
+
+    test('accepts custom confidence level', () => {
+      const samples = [0.3, 0.4, 0.5, 0.6, 0.7]
+
+      // 90% CI should be narrower than 95% CI
+      const ci90 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.9 })
+      const ci95 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.95 })
+
+      const width90 = ci90.ci[1] - ci90.ci[0]
+      const width95 = ci95.ci[1] - ci95.ci[0]
+
+      // 95% CI should generally be wider than 90% CI
+      // Allow some tolerance due to randomness
+      expect(width95).toBeGreaterThanOrEqual(width90 * 0.8)
+    })
+  })
+
+  describe('statistical properties', () => {
+    test('median is close to sample mean', () => {
+      const samples = [0.2, 0.4, 0.6, 0.8, 1.0]
+      const sampleMean = samples.reduce((a, b) => a + b, 0) / samples.length
+
+      const result = bootstrap(samples, { iterations: 10000 })
+
+      // Bootstrap median should be close to sample mean for symmetric distributions
+      expect(result.median).toBeCloseTo(sampleMean, 1)
+    })
+
+    test('is deterministic-ish for large iteration counts', () => {
+      const samples = [0.3, 0.5, 0.7]
+
+      // With many iterations, results should be similar across runs
+      const result1 = bootstrap(samples, { iterations: 10000 })
+      const result2 = bootstrap(samples, { iterations: 10000 })
+
+      expect(result1.median).toBeCloseTo(result2.median, 1)
+    })
+  })
+})
+
+describe('getBootstrapConfigFromEnv', () => {
+  const originalEnv = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+
+  afterEach(() => {
+    if (originalEnv === undefined) {
+      delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
+    } else {
+      process.env.COMPARE_BOOTSTRAP_ITERATIONS = originalEnv
+    }
+  })
+
+  test('returns default iterations when env var not set', () => {
+    delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
+    const config = getBootstrapConfigFromEnv()
+    expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+  })
+
+  test('parses valid iteration count from env', () => {
+    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'
+    const config = getBootstrapConfigFromEnv()
+    expect(config.iterations).toBe(5000)
+  })
+
+  test('returns default for invalid (non-numeric) env value', () => {
+    process.env.COMPARE_BOOTSTRAP_ITERATIONS = 'invalid'
+    const config = getBootstrapConfigFromEnv()
+    expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+  })
+
+  test('returns default for iteration count below minimum (100)', () => {
+    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '50'
+    const config = getBootstrapConfigFromEnv()
+    expect(config.iterations).toBe(DEFAULT_ITERATIONS)
+  })
+
+  test('accepts iteration count at minimum (100)', () => {
+    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '100'
+    const config = getBootstrapConfigFromEnv()
+    expect(config.iterations).toBe(100)
+  })
+})
+
+describe('constants', () => {
+  test('DEFAULT_ITERATIONS is 1000', () => {
+    expect(DEFAULT_ITERATIONS).toBe(1000)
+  })
+
+  test('DEFAULT_CONFIDENCE_LEVEL is 0.95', () => {
+    expect(DEFAULT_CONFIDENCE_LEVEL).toBe(0.95)
+  })
+})
package/src/graders/trials-compare-statistical.ts
CHANGED

@@ -20,9 +20,7 @@ import type {
   TrialsComparisonGrader,
   TrialsComparisonGraderInput,
 } from '../pipeline/pipeline.types.ts'
-
-/** Default number of bootstrap iterations */
-const DEFAULT_ITERATIONS = 1000
+import { DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from './bootstrap.ts'
 
 /**
  * Bootstrap confidence interval result.
@@ -82,16 +80,13 @@ const bootstrapPassAtK = (trials: number[], k: number, iterations: number): Boot
 }
 
 /**
- * Get bootstrap iterations from environment variable.
+ * Get bootstrap iterations from environment or use default.
  *
  * @returns Number of bootstrap iterations
  */
-const getIterationsFromEnv = (): number => {
-  const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
-  if (!envValue) return DEFAULT_ITERATIONS
-
-  const parsed = Number.parseInt(envValue, 10)
-  return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+const getIterations = (): number => {
+  const config = getBootstrapConfigFromEnv()
+  return config.iterations ?? DEFAULT_ITERATIONS
 }
 
 /**
@@ -109,7 +104,7 @@ const getIterationsFromEnv = (): number => {
 export const grade: TrialsComparisonGrader = async ({
   runs,
 }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
-  const iterations = getIterationsFromEnv()
+  const iterations = getIterations()
 
   // Collect pass/fail outcomes for each run
   const runStats = Object.entries(runs).map(([label, run]) => {
package/src/pipeline/compare-trials.ts
CHANGED

@@ -16,6 +16,7 @@
  */
 
 import { logProgress, writeOutput } from '../core.ts'
+import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
 import { grade as statisticalGrade } from '../graders/trials-compare-statistical.ts'
 import { grade as weightedGrade } from '../graders/trials-compare-weighted.ts'
 import type {
@@ -194,13 +195,14 @@ const computeReliabilityMetrics = (results: TrialResult[]): TrialsReliabilityMet
   const passExpKValues = results.map((r) => r.passExpK ?? 0)
 
   if (passExpKValues.length === 0) {
-    return { avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
+    return { type: 'trial', avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
   }
 
   const sorted = [...passExpKValues].sort((a, b) => a - b)
   const sum = passExpKValues.reduce((a, b) => a + b, 0)
 
   return {
+    type: 'trial',
     avgPassExpK: sum / passExpKValues.length,
     medianPassExpK: percentile(sorted, 0.5),
     p25PassExpK: percentile(sorted, 0.25),
@@ -407,6 +409,34 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
     flakiness[label] = computeFlakinessMetrics(results)
   }
 
+  // Compute confidence intervals when using statistical strategy
+  if (strategy === 'statistical') {
+    const bootstrapConfig = getBootstrapConfigFromEnv()
+
+    for (const label of runLabels) {
+      const resultsMap = runResults[label] ?? new Map()
+      const results = [...resultsMap.values()]
+      const passAtKValues = results.map((r) => r.passAtK ?? 0)
+      const passExpKValues = results.map((r) => r.passExpK ?? 0)
+
+      // Capability CIs
+      const capabilityMetrics = capability[label]
+      if (capabilityMetrics) {
+        capabilityMetrics.confidenceIntervals = {
+          avgPassAtK: bootstrap(passAtKValues, bootstrapConfig).ci,
+        }
+      }
+
+      // Reliability CIs
+      const reliabilityMetrics = reliability[label]
+      if (reliabilityMetrics) {
+        reliabilityMetrics.confidenceIntervals = {
+          avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
+        }
+      }
+    }
+  }
+
   // Compute pairwise comparisons
   const capabilityPairwise: PairwiseComparison[] = []
   const reliabilityPairwise: PairwiseComparison[] = []
@@ -531,27 +561,52 @@ const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string =>
   lines.push(`Prompts: ${report.meta.promptCount} | Trials per prompt: ${report.meta.trialsPerPrompt}`)
   lines.push('')
 
+  // Check if any run has confidence intervals (statistical strategy was used)
+  const hasCIs = Object.values(report.capability).some((c) => c.confidenceIntervals)
+
   // Capability table
   lines.push('## Capability (passAtK)')
   lines.push('')
-  lines.push('| Run | Avg | Median | P25 | P75 |')
-  lines.push('|-----|-----|--------|-----|-----|')
-  for (const [label, c] of Object.entries(report.capability)) {
-    lines.push(
-      `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
-    )
+  if (hasCIs) {
+    lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|--------|-----|-----|')
+    for (const [label, c] of Object.entries(report.capability)) {
+      const avgCI = formatCI(c.confidenceIntervals?.avgPassAtK)
+      lines.push(
+        `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${avgCI} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | Avg | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|-----|-----|')
+    for (const [label, c] of Object.entries(report.capability)) {
+      lines.push(
+        `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
+      )
+    }
   }
   lines.push('')
 
   // Reliability table
   lines.push('## Reliability (passExpK)')
   lines.push('')
-  lines.push('| Run | Avg | Median | P25 | P75 |')
-  lines.push('|-----|-----|--------|-----|-----|')
-  for (const [label, r] of Object.entries(report.reliability)) {
-    lines.push(
-      `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
-    )
+  if (hasCIs) {
+    lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|--------|-----|-----|')
+    for (const [label, r] of Object.entries(report.reliability)) {
+      const avgCI = formatCI(r.confidenceIntervals?.avgPassExpK)
+      lines.push(
+        `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${avgCI} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | Avg | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|-----|-----|')
+    for (const [label, r] of Object.entries(report.reliability)) {
+      lines.push(
+        `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
+      )
+    }
   }
   lines.push('')
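For reference, a sketch of the capability table this formatter emits when CIs are present (run labels and values invented; formatCI uses three decimals by default):

## Capability (passAtK)

| Run | Avg | 95% CI | Median | P25 | P75 |
|-----|-----|--------|--------|-----|-----|
| baseline | 0.900 | [0.850, 0.950] | 0.900 | 0.875 | 0.925 |
| variant | 0.600 | [0.500, 0.700] | 0.600 | 0.550 | 0.650 |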
package/src/pipeline/compare.ts
CHANGED
@@ -25,6 +25,7 @@
 import { basename, extname } from 'node:path'
 import { parseArgs } from 'node:util'
 import { buildResultsIndex, logProgress, writeOutput } from '../core.ts'
+import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
 import { grade as statisticalGrade } from '../graders/compare-statistical.ts'
 import { grade as weightedGrade } from '../graders/compare-weighted.ts'
 import type {
@@ -463,6 +464,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
   const completedCount = results.filter((r) => r.output && !r.errors?.length).length
 
   reliability[label] = {
+    type: 'run',
     toolErrors: toolErrorCount,
     toolErrorRate: results.length > 0 ? toolErrorCount / results.length : 0,
     timeouts: timeoutCount,
@@ -471,6 +473,36 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
   }
 }
 
+  // Compute confidence intervals when using statistical strategy
+  if (strategy === 'statistical') {
+    const bootstrapConfig = getBootstrapConfigFromEnv()
+
+    for (const label of runLabels) {
+      const resultsMap = runResults[label] ?? new Map()
+      const results = [...resultsMap.values()]
+      const scores = results.map((r) => r.score?.score ?? 0)
+      const passes = results.map((r) => (r.score?.pass === true ? 1 : 0))
+      const latencies = results.map((r) => r.timing?.total ?? 0)
+
+      // Quality CIs
+      const qualityMetrics = quality[label]
+      if (qualityMetrics) {
+        qualityMetrics.confidenceIntervals = {
+          avgScore: bootstrap(scores, bootstrapConfig).ci,
+          passRate: bootstrap(passes, bootstrapConfig).ci,
+        }
+      }
+
+      // Performance CIs
+      const performanceMetrics = performance[label]
+      if (performanceMetrics) {
+        performanceMetrics.confidenceIntervals = {
+          latencyMean: bootstrap(latencies, bootstrapConfig).ci,
+        }
+      }
+    }
+  }
+
   // Trajectory info
   const trajectoryInfo: Record<string, TrajectoryInfo> = {}
   for (const label of runLabels) {
@@ -586,27 +618,53 @@ const formatReportAsMarkdown = (report: ComparisonReport): string => {
   lines.push(`Prompts: ${report.meta.promptCount} total, ${report.meta.promptsWithAllRuns} with all runs`)
   lines.push('')
 
+  // Check if any run has confidence intervals (statistical strategy was used)
+  const hasCIs = Object.values(report.quality).some((q) => q.confidenceIntervals)
+
   // Quality table
   lines.push('## Quality')
   lines.push('')
-  lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |')
-  lines.push('|-----|-----------|-----------|------|------|')
-  for (const [label, q] of Object.entries(report.quality)) {
-    lines.push(
-      `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`,
-    )
+  if (hasCIs) {
+    lines.push('| Run | Avg Score | 95% CI | Pass Rate | 95% CI | Pass | Fail |')
+    lines.push('|-----|-----------|--------|-----------|--------|------|------|')
+    for (const [label, q] of Object.entries(report.quality)) {
+      const avgScoreCI = formatCI(q.confidenceIntervals?.avgScore)
+      const passRateCI = formatCI(q.confidenceIntervals?.passRate)
+      lines.push(
+        `| ${label} | ${q.avgScore.toFixed(3)} | ${avgScoreCI} | ${(q.passRate * 100).toFixed(1)}% | ${passRateCI} | ${q.passCount} | ${q.failCount} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |')
+    lines.push('|-----|-----------|-----------|------|------|')
+    for (const [label, q] of Object.entries(report.quality)) {
+      lines.push(
+        `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`,
+      )
+    }
   }
   lines.push('')
 
   // Performance table
   lines.push('## Performance')
   lines.push('')
-  lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |')
-  lines.push('|-----|----------|----------|----------|-----------|')
-  for (const [label, p] of Object.entries(report.performance)) {
-    lines.push(
-      `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`,
-    )
+  if (hasCIs) {
+    lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI |')
+    lines.push('|-----|----------|----------|----------|-----------|--------|')
+    for (const [label, p] of Object.entries(report.performance)) {
+      const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0)
+      lines.push(
+        `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |')
+    lines.push('|-----|----------|----------|----------|-----------|')
+    for (const [label, p] of Object.entries(report.performance)) {
+      lines.push(
+        `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`,
+      )
+    }
   }
   lines.push('')
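Likewise, a sketch of the performance table under the statistical strategy (values invented): the latency CI is formatted with formatCI(..., 0), so its bounds render as whole milliseconds.

## Performance

| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI |
|-----|----------|----------|----------|-----------|--------|
| fast | 1025 | 1100 | 1100 | 1013 | [950, 1088] |
| slow | 2050 | 2200 | 2200 | 2025 | [1900, 2150] |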
package/src/pipeline/tests/compare-statistical.spec.ts
ADDED

@@ -0,0 +1,285 @@
+/**
+ * Integration tests for compare command statistical strategy.
+ *
+ * @remarks
+ * Tests verify confidence interval computation for the statistical strategy
+ * in the compare command with CaptureResult format.
+ *
+ * @packageDocumentation
+ */
+
+import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
+import type { CaptureResult } from '../../schemas.ts'
+import { runCompare } from '../compare.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const createCaptureResult = (id: string, score: number, pass: boolean, duration: number = 1000): CaptureResult => ({
+  id,
+  input: `Prompt for ${id}`,
+  output: `Output for ${id}`,
+  trajectory: [{ type: 'message', content: `Output for ${id}`, timestamp: Date.now() }],
+  metadata: {},
+  timing: {
+    start: Date.now(),
+    end: Date.now() + duration,
+    sessionCreation: 100,
+    total: duration,
+  },
+  toolErrors: false,
+  score: {
+    pass,
+    score,
+    reasoning: pass ? 'Passed' : 'Failed',
+  },
+})
+
+const tempDir = `${import.meta.dir}/.test-tmp/compare-statistical`
+
+beforeAll(async () => {
+  await Bun.$`mkdir -p ${tempDir}`
+})
+
+afterAll(async () => {
+  await Bun.$`rm -rf ${tempDir}`
+})
+
+// ============================================================================
+// Statistical Strategy CI Tests
+// ============================================================================
+
+describe('runCompare statistical strategy', () => {
+  test('computes confidence intervals for quality metrics', async () => {
+    const run1Path = `${tempDir}/ci-qual-run1.jsonl`
+    const run2Path = `${tempDir}/ci-qual-run2.jsonl`
+
+    // Create multiple prompts with varying scores for meaningful CI computation
+    const results1 = [
+      createCaptureResult('p1', 0.9, true, 1000),
+      createCaptureResult('p2', 0.85, true, 1100),
+      createCaptureResult('p3', 0.95, true, 900),
+      createCaptureResult('p4', 0.8, true, 1200),
+    ]
+    const results2 = [
+      createCaptureResult('p1', 0.6, false, 2000),
+      createCaptureResult('p2', 0.5, false, 2100),
+      createCaptureResult('p3', 0.7, true, 1900),
+      createCaptureResult('p4', 0.55, false, 2200),
+    ]
+
+    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+    const report = await runCompare({
+      runs: [
+        { label: 'high', path: run1Path },
+        { label: 'low', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Verify confidence intervals are computed for quality
+    const highQuality = report.quality.high
+    expect(highQuality).toBeDefined()
+    expect(highQuality?.confidenceIntervals).toBeDefined()
+    expect(highQuality?.confidenceIntervals?.avgScore).toBeDefined()
+    expect(highQuality?.confidenceIntervals?.passRate).toBeDefined()
+
+    // avgScore CI should be a tuple [lower, upper]
+    const avgScoreCI = highQuality?.confidenceIntervals?.avgScore
+    expect(avgScoreCI).toHaveLength(2)
+    expect(avgScoreCI?.[0]).toBeLessThanOrEqual(avgScoreCI?.[1] ?? 0)
+
+    // CI should contain the average (within reasonable bounds)
+    expect(avgScoreCI?.[0]).toBeLessThanOrEqual(highQuality?.avgScore ?? 0)
+    expect(avgScoreCI?.[1]).toBeGreaterThanOrEqual(highQuality?.avgScore ?? 1)
+
+    // passRate CI should also be valid
+    const passRateCI = highQuality?.confidenceIntervals?.passRate
+    expect(passRateCI).toHaveLength(2)
+    expect(passRateCI?.[0]).toBeLessThanOrEqual(passRateCI?.[1] ?? 0)
+
+    // Verify reliability metrics include type discriminator
+    expect(report.reliability.high?.type).toBe('run')
+    expect(report.reliability.low?.type).toBe('run')
+  })
+
+  test('computes confidence intervals for performance metrics', async () => {
+    const run1Path = `${tempDir}/ci-perf-run1.jsonl`
+    const run2Path = `${tempDir}/ci-perf-run2.jsonl`
+
+    // Create results with varying latencies
+    const results1 = [
+      createCaptureResult('p1', 0.9, true, 1000),
+      createCaptureResult('p2', 0.85, true, 1100),
+      createCaptureResult('p3', 0.95, true, 900),
+      createCaptureResult('p4', 0.8, true, 1050),
+    ]
+    const results2 = [
+      createCaptureResult('p1', 0.7, true, 2000),
+      createCaptureResult('p2', 0.65, true, 2200),
+      createCaptureResult('p3', 0.75, true, 1800),
+      createCaptureResult('p4', 0.6, true, 2100),
+    ]
+
+    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+    const report = await runCompare({
+      runs: [
+        { label: 'fast', path: run1Path },
+        { label: 'slow', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Verify confidence intervals are computed for performance
+    const fastPerf = report.performance.fast
+    expect(fastPerf).toBeDefined()
+    expect(fastPerf?.confidenceIntervals).toBeDefined()
+    expect(fastPerf?.confidenceIntervals?.latencyMean).toBeDefined()
+
+    // latencyMean CI should be a tuple [lower, upper]
+    const latencyCI = fastPerf?.confidenceIntervals?.latencyMean
+    expect(latencyCI).toHaveLength(2)
+    expect(latencyCI?.[0]).toBeLessThanOrEqual(latencyCI?.[1] ?? 0)
+
+    // Fast run should have lower latency CI than slow run
+    const slowPerf = report.performance.slow
+    const slowLatencyCI = slowPerf?.confidenceIntervals?.latencyMean
+    expect(latencyCI?.[1]).toBeLessThan(slowLatencyCI?.[0] ?? 0)
+  })
+
+  test('weighted strategy does not compute confidence intervals', async () => {
+    const run1Path = `${tempDir}/no-ci-run1.jsonl`
+    const run2Path = `${tempDir}/no-ci-run2.jsonl`
+
+    const results1 = [createCaptureResult('p1', 0.9, true), createCaptureResult('p2', 0.85, true)]
+    const results2 = [createCaptureResult('p1', 0.6, false), createCaptureResult('p2', 0.5, false)]
+
+    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+    const report = await runCompare({
+      runs: [
+        { label: 'run1', path: run1Path },
+        { label: 'run2', path: run2Path },
+      ],
+      strategy: 'weighted', // Default strategy
+      progress: false,
+    })
+
+    // Confidence intervals should NOT be present for weighted strategy
+    const quality = report.quality.run1
+    expect(quality?.confidenceIntervals).toBeUndefined()
+
+    const perf = report.performance.run1
+    expect(perf?.confidenceIntervals).toBeUndefined()
+  })
+
+  test('statistical strategy includes CIs in markdown output', async () => {
+    const run1Path = `${tempDir}/ci-md-run1.jsonl`
+    const run2Path = `${tempDir}/ci-md-run2.jsonl`
+    const outputPath = `${tempDir}/ci-report.md`
+
+    const results1 = [createCaptureResult('p1', 0.9, true, 1000), createCaptureResult('p2', 0.85, true, 1100)]
+    const results2 = [createCaptureResult('p1', 0.6, false, 2000), createCaptureResult('p2', 0.5, false, 2100)]
+
+    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+    await runCompare({
+      runs: [
+        { label: 'agent1', path: run1Path },
+        { label: 'agent2', path: run2Path },
+      ],
+      strategy: 'statistical',
+      outputPath,
+      format: 'markdown',
+      progress: false,
+    })
+
+    const content = await Bun.file(outputPath).text()
+
+    // Markdown should include 95% CI column headers
+    expect(content).toContain('95% CI')
+    // Should contain CI values in bracket format [lower, upper]
+    expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/)
+  })
+
+  test('handles single sample gracefully with degenerate CI', async () => {
+    const run1Path = `${tempDir}/single-run1.jsonl`
+    const run2Path = `${tempDir}/single-run2.jsonl`
+
+    // Single sample per run
+    const result1 = createCaptureResult('p1', 0.9, true)
+    const result2 = createCaptureResult('p1', 0.5, false)
+
+    await Bun.write(run1Path, JSON.stringify(result1))
+    await Bun.write(run2Path, JSON.stringify(result2))
+
+    const report = await runCompare({
+      runs: [
+        { label: 'single1', path: run1Path },
+        { label: 'single2', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Should still compute CIs (they will be degenerate for single sample)
+    const quality = report.quality.single1
+    expect(quality?.confidenceIntervals).toBeDefined()
+    expect(quality?.confidenceIntervals?.avgScore).toBeDefined()
+
+    // For single sample, CI should collapse to the value
+    const ci = quality?.confidenceIntervals?.avgScore
+    expect(ci?.[0]).toBeCloseTo(ci?.[1] ?? 0, 2)
+    expect(ci?.[0]).toBeCloseTo(quality?.avgScore ?? 0, 2)
+  })
+
+  test('JSON output includes confidence intervals structure', async () => {
+    const run1Path = `${tempDir}/json-ci-run1.jsonl`
+    const run2Path = `${tempDir}/json-ci-run2.jsonl`
+    const outputPath = `${tempDir}/ci-report.json`
+
+    const results1 = [
+      createCaptureResult('p1', 0.9, true),
+      createCaptureResult('p2', 0.85, true),
+      createCaptureResult('p3', 0.95, true),
+    ]
+    const results2 = [
+      createCaptureResult('p1', 0.6, false),
+      createCaptureResult('p2', 0.5, false),
+      createCaptureResult('p3', 0.7, true),
+    ]
+
+    await Bun.write(run1Path, results1.map((r) => JSON.stringify(r)).join('\n'))
+    await Bun.write(run2Path, results2.map((r) => JSON.stringify(r)).join('\n'))
+
+    await runCompare({
+      runs: [
+        { label: 'high', path: run1Path },
+        { label: 'low', path: run2Path },
+      ],
+      strategy: 'statistical',
+      outputPath,
+      format: 'json',
+      progress: false,
+    })
+
+    const content = await Bun.file(outputPath).text()
+    const parsed = JSON.parse(content)
+
+    // Verify JSON structure includes confidenceIntervals
+    expect(parsed.quality.high.confidenceIntervals).toBeDefined()
+    expect(parsed.quality.high.confidenceIntervals.avgScore).toBeInstanceOf(Array)
+    expect(parsed.quality.high.confidenceIntervals.avgScore.length).toBe(2)
+    expect(parsed.performance.high.confidenceIntervals).toBeDefined()
+    expect(parsed.performance.high.confidenceIntervals.latencyMean).toBeInstanceOf(Array)
+  })
+})
package/src/pipeline/tests/compare-trials.spec.ts
CHANGED

@@ -108,6 +108,8 @@ describe('runTrialsCompare', () => {
     expect(report.meta.promptCount).toBe(2)
     expect(report.capability).toBeDefined()
     expect(report.reliability).toBeDefined()
+    expect(report.reliability.baseline?.type).toBe('trial')
+    expect(report.reliability.variant?.type).toBe('trial')
     expect(report.flakiness).toBeDefined()
     expect(report.headToHead.capability.length).toBeGreaterThan(0)
 
@@ -210,6 +212,147 @@ describe('runTrialsCompare', () => {
     expect(report.meta.runs).toEqual(['better', 'worse'])
   })
 
+  test('statistical strategy computes confidence intervals for capability metrics', async () => {
+    const run1Path = `${tempDir}/ci-cap-run1.jsonl`
+    const run2Path = `${tempDir}/ci-cap-run2.jsonl`
+
+    // Create multiple prompts for meaningful CI computation
+    const trials1 = [
+      createTrialResult('p1', 0.9, 0.8),
+      createTrialResult('p2', 0.85, 0.7),
+      createTrialResult('p3', 0.95, 0.9),
+    ]
+    const trials2 = [
+      createTrialResult('p1', 0.6, 0.4),
+      createTrialResult('p2', 0.5, 0.3),
+      createTrialResult('p3', 0.7, 0.5),
+    ]
+
+    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'high', path: run1Path },
+        { label: 'low', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Verify confidence intervals are computed for capability
+    const highCap = report.capability.high
+    expect(highCap).toBeDefined()
+    expect(highCap?.confidenceIntervals).toBeDefined()
+    expect(highCap?.confidenceIntervals?.avgPassAtK).toBeDefined()
+
+    // CI should be a tuple [lower, upper]
+    const ci = highCap?.confidenceIntervals?.avgPassAtK
+    expect(ci).toHaveLength(2)
+    expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0)
+
+    // CI should contain the average (within reasonable bounds)
+    expect(ci?.[0]).toBeLessThanOrEqual(highCap?.avgPassAtK ?? 0)
+    expect(ci?.[1]).toBeGreaterThanOrEqual(highCap?.avgPassAtK ?? 1)
+  })
+
+  test('statistical strategy computes confidence intervals for reliability metrics', async () => {
+    const run1Path = `${tempDir}/ci-rel-run1.jsonl`
+    const run2Path = `${tempDir}/ci-rel-run2.jsonl`
+
+    const trials1 = [
+      createTrialResult('p1', 0.9, 0.85),
+      createTrialResult('p2', 0.8, 0.75),
+      createTrialResult('p3', 0.85, 0.8),
+    ]
+    const trials2 = [
+      createTrialResult('p1', 0.7, 0.3),
+      createTrialResult('p2', 0.6, 0.2),
+      createTrialResult('p3', 0.65, 0.25),
+    ]
+
+    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'reliable', path: run1Path },
+        { label: 'flaky', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Verify confidence intervals are computed for reliability
+    const reliableRel = report.reliability.reliable
+    expect(reliableRel).toBeDefined()
+    expect(reliableRel?.type).toBe('trial')
+    expect(reliableRel?.confidenceIntervals).toBeDefined()
+    expect(reliableRel?.confidenceIntervals?.avgPassExpK).toBeDefined()
+
+    // CI should be a tuple [lower, upper]
+    const ci = reliableRel?.confidenceIntervals?.avgPassExpK
+    expect(ci).toHaveLength(2)
+    expect(ci?.[0]).toBeLessThanOrEqual(ci?.[1] ?? 0)
+  })
+
+  test('weighted strategy does not compute confidence intervals', async () => {
+    const run1Path = `${tempDir}/no-ci-run1.jsonl`
+    const run2Path = `${tempDir}/no-ci-run2.jsonl`
+
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.5, 0.3)
+
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'run1', path: run1Path },
+        { label: 'run2', path: run2Path },
+      ],
+      strategy: 'weighted', // Default strategy
+      progress: false,
+    })
+
+    // Confidence intervals should NOT be present for weighted strategy
+    const cap = report.capability.run1
+    expect(cap?.confidenceIntervals).toBeUndefined()
+
+    const rel = report.reliability.run1
+    expect(rel?.confidenceIntervals).toBeUndefined()
+  })
+
+  test('statistical strategy includes CIs in markdown output', async () => {
+    const run1Path = `${tempDir}/ci-md-run1.jsonl`
+    const run2Path = `${tempDir}/ci-md-run2.jsonl`
+    const outputPath = `${tempDir}/ci-report.md`
+
+    const trials1 = [createTrialResult('p1', 0.9, 0.8), createTrialResult('p2', 0.85, 0.75)]
+    const trials2 = [createTrialResult('p1', 0.6, 0.4), createTrialResult('p2', 0.5, 0.3)]
+
+    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+
+    await runTrialsCompare({
+      runs: [
+        { label: 'agent1', path: run1Path },
+        { label: 'agent2', path: run2Path },
+      ],
+      strategy: 'statistical',
+      outputPath,
+      format: 'markdown',
+      progress: false,
+    })
+
+    const content = await Bun.file(outputPath).text()
+
+    // Markdown should include 95% CI column headers
+    expect(content).toContain('95% CI')
+    // Should contain CI values in bracket format [lower, upper]
+    expect(content).toMatch(/\[\d+\.\d+, \d+\.\d+\]/)
+  })
+
   test('computes correct capability metrics', async () => {
     const run1Path = `${tempDir}/cap-run1.jsonl`
 
package/src/schemas/schemas.ts
CHANGED
@@ -573,6 +573,17 @@ export type ValidationResult = z.infer<typeof ValidationResultSchema>
 // Comparison Report Schemas
 // ============================================================================
 
+/**
+ * Confidence interval schema as [lower, upper] bounds.
+ *
+ * @remarks
+ * Used for bootstrap-computed confidence intervals when strategy=statistical.
+ */
+export const ConfidenceIntervalSchema = z.tuple([z.number(), z.number()])
+
+/** Confidence interval type */
+export type ConfidenceInterval = z.infer<typeof ConfidenceIntervalSchema>
+
 /**
  * Score distribution histogram for quality analysis.
  *
@@ -590,6 +601,19 @@ export const ScoreDistributionSchema = z.object({
 /** Score distribution type */
 export type ScoreDistribution = z.infer<typeof ScoreDistributionSchema>
 
+/**
+ * Confidence intervals for quality metrics.
+ */
+export const QualityConfidenceIntervalsSchema = z.object({
+  /** CI for avgScore */
+  avgScore: ConfidenceIntervalSchema.optional(),
+  /** CI for passRate */
+  passRate: ConfidenceIntervalSchema.optional(),
+})
+
+/** Quality confidence intervals type */
+export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceIntervalsSchema>
+
 /**
  * Quality metrics for a single run in comparison.
  */
@@ -604,6 +628,8 @@ export const QualityMetricsSchema = z.object({
   failCount: z.number(),
   /** Score distribution histogram */
   scoreDistribution: ScoreDistributionSchema,
+  /** Confidence intervals (only with strategy=statistical) */
+  confidenceIntervals: QualityConfidenceIntervalsSchema.optional(),
 })
 
 /** Quality metrics type */
@@ -630,6 +656,17 @@ export const LatencyStatsSchema = z.object({
 /** Latency stats type */
 export type LatencyStats = z.infer<typeof LatencyStatsSchema>
 
+/**
+ * Confidence intervals for performance metrics.
+ */
+export const PerformanceConfidenceIntervalsSchema = z.object({
+  /** CI for latency mean */
+  latencyMean: ConfidenceIntervalSchema.optional(),
+})
+
+/** Performance confidence intervals type */
+export type PerformanceConfidenceIntervals = z.infer<typeof PerformanceConfidenceIntervalsSchema>
+
 /**
  * Performance metrics for a single run in comparison.
  */
@@ -640,6 +677,8 @@ export const PerformanceMetricsSchema = z.object({
   firstResponse: LatencyStatsSchema.optional(),
   /** Sum of all run durations in milliseconds */
   totalDuration: z.number(),
+  /** Confidence intervals (only with strategy=statistical) */
+  confidenceIntervals: PerformanceConfidenceIntervalsSchema.optional(),
 })
 
 /** Performance metrics type */
@@ -649,6 +688,8 @@ export type PerformanceMetrics = z.infer<typeof PerformanceMetricsSchema>
 * Reliability metrics for a single run in comparison.
  */
 export const ReliabilityMetricsSchema = z.object({
+  /** Discriminator for run-based reliability metrics */
+  type: z.literal('run'),
   /** Count of runs with toolErrors=true */
   toolErrors: z.number(),
   /** Percentage of runs with tool errors */
@@ -782,6 +823,17 @@ export type ComparisonReport = z.infer<typeof ComparisonReportSchema>
 // Trials Comparison Report Schemas
 // ============================================================================
 
+/**
+ * Confidence intervals for trials capability metrics.
+ */
+export const TrialsCapabilityConfidenceIntervalsSchema = z.object({
+  /** CI for avgPassAtK */
+  avgPassAtK: ConfidenceIntervalSchema.optional(),
+})
+
+/** Trials capability confidence intervals type */
+export type TrialsCapabilityConfidenceIntervals = z.infer<typeof TrialsCapabilityConfidenceIntervalsSchema>
+
 /**
  * Capability metrics for trials comparison (passAtK-based).
  *
@@ -798,11 +850,24 @@ export const TrialsCapabilityMetricsSchema = z.object({
   p25PassAtK: z.number(),
   /** 75th percentile passAtK */
   p75PassAtK: z.number(),
+  /** Confidence intervals (only with strategy=statistical) */
+  confidenceIntervals: TrialsCapabilityConfidenceIntervalsSchema.optional(),
 })
 
 /** Trials capability metrics type */
 export type TrialsCapabilityMetrics = z.infer<typeof TrialsCapabilityMetricsSchema>
 
+/**
+ * Confidence intervals for trials reliability metrics.
+ */
+export const TrialsReliabilityConfidenceIntervalsSchema = z.object({
+  /** CI for avgPassExpK */
+  avgPassExpK: ConfidenceIntervalSchema.optional(),
+})
+
+/** Trials reliability confidence intervals type */
+export type TrialsReliabilityConfidenceIntervals = z.infer<typeof TrialsReliabilityConfidenceIntervalsSchema>
+
 /**
  * Reliability metrics for trials comparison (passExpK-based).
  *
@@ -811,6 +876,8 @@ export type TrialsCapabilityMetrics = z.infer<typeof TrialsCapabilityMetricsSche
 * Higher passExpK means the agent reliably solves the task every time.
  */
 export const TrialsReliabilityMetricsSchema = z.object({
+  /** Discriminator for trial-based reliability metrics */
+  type: z.literal('trial'),
   /** Average passExpK across all prompts */
   avgPassExpK: z.number(),
   /** Median passExpK */
@@ -819,6 +886,8 @@ export const TrialsReliabilityMetricsSchema = z.object({
   p25PassExpK: z.number(),
   /** 75th percentile passExpK */
   p75PassExpK: z.number(),
+  /** Confidence intervals (only with strategy=statistical) */
+  confidenceIntervals: TrialsReliabilityConfidenceIntervalsSchema.optional(),
 })
 
 /** Trials reliability metrics type */