@plaited/agent-eval-harness 0.7.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/package.json +5 -3
- package/src/graders/bootstrap.ts +135 -0
- package/src/graders/compare-statistical.ts +14 -86
- package/src/graders/tests/bootstrap.spec.ts +169 -0
- package/src/graders/tests/trials-compare-graders.spec.ts +358 -0
- package/src/graders/trials-compare-statistical.ts +183 -0
- package/src/graders/trials-compare-weighted.ts +128 -0
- package/src/graders.ts +21 -1
- package/src/pipeline/compare-format-detection.ts +100 -0
- package/src/pipeline/compare-trials.ts +650 -0
- package/src/pipeline/compare.ts +144 -31
- package/src/pipeline/pipeline.types.ts +52 -1
- package/src/pipeline/tests/compare-format-detection.spec.ts +142 -0
- package/src/pipeline/tests/compare-statistical.spec.ts +281 -0
- package/src/pipeline/tests/compare-trials.spec.ts +417 -0
- package/src/schemas/schemas.ts +216 -0
- package/src/schemas.ts +13 -0
package/README.md
CHANGED
|
@@ -78,6 +78,9 @@ cat prompts.jsonl | \
|
|
|
78
78
|
|
|
79
79
|
# Compare runs (built-in strategies: weighted, statistical, custom)
|
|
80
80
|
bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
|
|
81
|
+
|
|
82
|
+
# Compare trials for pass@k reliability analysis (auto-detects format)
|
|
83
|
+
bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
|
|
81
84
|
```
|
|
82
85
|
|
|
83
86
|
## Skills for AI Agents
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@plaited/agent-eval-harness",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.8.1",
|
|
4
4
|
"description": "CLI tool for capturing agent trajectories from headless CLI agents",
|
|
5
5
|
"license": "ISC",
|
|
6
6
|
"engines": {
|
|
@@ -42,8 +42,10 @@
|
|
|
42
42
|
"check:types": "tsc --noEmit",
|
|
43
43
|
"check:write": "biome check --write && format-package --write",
|
|
44
44
|
"prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
|
|
45
|
-
"test": "bun test
|
|
46
|
-
"test:
|
|
45
|
+
"test": "bun run test:bin && bun test:src",
|
|
46
|
+
"test:bin": "bun test bin/tests/*.spec.ts",
|
|
47
|
+
"test:integration": "bun test ./**/integration_tests/*.spec.ts",
|
|
48
|
+
"test:src": "bun test src/**/tests/*.spec.ts"
|
|
47
49
|
},
|
|
48
50
|
"lint-staged": {
|
|
49
51
|
"*.{js,cjs,jsx,tsx,ts}": [
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared bootstrap sampling utilities for confidence interval computation.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Bootstrap resampling provides robust confidence intervals without
|
|
6
|
+
* assuming a specific distribution. For small samples, it's more
|
|
7
|
+
* reliable than parametric methods.
|
|
8
|
+
*
|
|
9
|
+
* Environment variable configuration:
|
|
10
|
+
* - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
|
|
11
|
+
*
|
|
12
|
+
* @packageDocumentation
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/** Default number of bootstrap iterations */
export const DEFAULT_ITERATIONS = 1000

/** Default confidence level (95%) */
export const DEFAULT_CONFIDENCE_LEVEL = 0.95

/**
 * Confidence interval as [lower, upper] bounds.
 */
export type ConfidenceInterval = [number, number]

/**
 * Bootstrap confidence interval result.
 */
export type BootstrapResult = {
  /** Median of bootstrap sample means (50th percentile) */
  median: number
  /** Confidence interval [lower, upper] */
  ci: ConfidenceInterval
}

/**
 * Configuration for bootstrap sampling.
 */
export type BootstrapConfig = {
  /** Number of bootstrap iterations (default: 1000) */
  iterations?: number
  /** Confidence level between 0 and 1 (default: 0.95) */
  confidenceLevel?: number
}

/**
 * Compute bootstrap confidence interval for sample mean.
 *
 * @remarks
 * Resamples `samples` with replacement `iterations` times, records the mean
 * of every resample, sorts those means and reads the median and the
 * interval bounds off the sorted list as percentiles.
 *
 * Degenerate inputs are handled without resampling: an empty array yields
 * `{ median: 0, ci: [0, 0] }` and a single sample yields that value for the
 * median and both bounds.
 *
 * Invalid configuration falls back to the defaults instead of producing
 * zeros: `iterations` that is not a finite number >= 1 becomes
 * `DEFAULT_ITERATIONS` (fractional values are floored), and a
 * `confidenceLevel` outside [0, 1] (or NaN) becomes
 * `DEFAULT_CONFIDENCE_LEVEL`. Percentile indices are clamped to the range
 * of computed means, so `confidenceLevel: 1` reports the largest resampled
 * mean as the upper bound.
 *
 * @param samples - Array of numeric samples
 * @param config - Optional bootstrap configuration
 * @returns Bootstrap median and confidence interval
 *
 * @public
 */
export const bootstrap = (samples: number[], config?: BootstrapConfig): BootstrapResult => {
  const requestedIterations = config?.iterations ?? DEFAULT_ITERATIONS
  const iterations =
    Number.isFinite(requestedIterations) && requestedIterations >= 1
      ? Math.floor(requestedIterations)
      : DEFAULT_ITERATIONS

  // NaN fails both comparisons, so it also falls back to the default.
  const requestedLevel = config?.confidenceLevel ?? DEFAULT_CONFIDENCE_LEVEL
  const confidenceLevel = requestedLevel >= 0 && requestedLevel <= 1 ? requestedLevel : DEFAULT_CONFIDENCE_LEVEL

  const count = samples.length

  if (count === 0) {
    return { median: 0, ci: [0, 0] }
  }

  if (count === 1) {
    const value = samples[0] ?? 0
    return { median: value, ci: [value, value] }
  }

  const means: number[] = []

  for (let i = 0; i < iterations; i++) {
    // Resample with replacement and accumulate the sum directly; the index is
    // always within [0, count - 1] because Math.random() is strictly below 1.
    let sum = 0
    for (let j = 0; j < count; j++) {
      sum += samples[Math.floor(Math.random() * count)] as number
    }
    means.push(sum / count)
  }

  // Sort means for percentile calculation
  means.sort((a, b) => a - b)

  // Read a percentile position, clamped so it can never fall off either end
  // of the sorted means (e.g. position === iterations when alpha is 0).
  const lastIdx = means.length - 1
  const meanAt = (position: number): number => means[Math.min(lastIdx, Math.max(0, Math.floor(position)))] ?? 0

  // For 95% CI: lower = 2.5th percentile, upper = 97.5th percentile
  const alpha = (1 - confidenceLevel) / 2

  return {
    median: meanAt(iterations / 2),
    ci: [meanAt(iterations * alpha), meanAt(iterations * (1 - alpha))],
  }
}
|
|
101
|
+
|
|
102
|
+
/**
 * Render a confidence interval as a bracketed `[lower, upper]` string.
 *
 * @param ci - Confidence interval [lower, upper]; may be undefined
 * @param decimals - Digits after the decimal point for each bound (default: 3)
 * @returns The formatted interval, or an empty string when `ci` is missing
 *
 * @public
 */
export const formatCI = (ci: ConfidenceInterval | undefined, decimals: number = 3): string => {
  if (!ci) {
    return ''
  }

  const [lower, upper] = ci
  return '[' + lower.toFixed(decimals) + ', ' + upper.toFixed(decimals) + ']'
}
|
|
115
|
+
|
|
116
|
+
/**
 * Build the bootstrap configuration from the process environment.
 *
 * @remarks
 * Reads `COMPARE_BOOTSTRAP_ITERATIONS` and parses it as a base-10 integer.
 * The parsed value is used only when it is a number of at least 100;
 * an unset, empty, non-numeric or smaller value yields `DEFAULT_ITERATIONS`.
 *
 * @returns Bootstrap configuration containing the iteration count
 *
 * @public
 */
export const getBootstrapConfigFromEnv = (): BootstrapConfig => {
  const raw = process.env.COMPARE_BOOTSTRAP_ITERATIONS

  if (raw) {
    const requested = Number.parseInt(raw, 10)
    // Accept only a successfully parsed count that meets the minimum of 100
    if (!Number.isNaN(requested) && requested >= 100) {
      return { iterations: requested }
    }
  }

  return { iterations: DEFAULT_ITERATIONS }
}
|
|
@@ -12,81 +12,7 @@
|
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
14
|
import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
|
|
15
|
-
|
|
16
|
-
/** Default number of bootstrap iterations */
|
|
17
|
-
const DEFAULT_ITERATIONS = 1000
|
|
18
|
-
|
|
19
|
-
/**
|
|
20
|
-
* Bootstrap confidence interval result.
|
|
21
|
-
*/
|
|
22
|
-
type BootstrapResult = {
|
|
23
|
-
/** Estimated mean from bootstrap */
|
|
24
|
-
mean: number
|
|
25
|
-
/** 95% confidence interval [lower, upper] */
|
|
26
|
-
ci95: [number, number]
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
/**
|
|
30
|
-
* Compute bootstrap confidence interval for sample mean.
|
|
31
|
-
*
|
|
32
|
-
* @remarks
|
|
33
|
-
* Bootstrap resampling provides robust confidence intervals without
|
|
34
|
-
* assuming a specific distribution. For small samples, it's more
|
|
35
|
-
* reliable than parametric methods.
|
|
36
|
-
*
|
|
37
|
-
* @param samples - Array of numeric samples
|
|
38
|
-
* @param iterations - Number of bootstrap iterations
|
|
39
|
-
* @returns Bootstrap mean and 95% confidence interval
|
|
40
|
-
*/
|
|
41
|
-
const bootstrap = (samples: number[], iterations: number = DEFAULT_ITERATIONS): BootstrapResult => {
|
|
42
|
-
if (samples.length === 0) {
|
|
43
|
-
return { mean: 0, ci95: [0, 0] }
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
if (samples.length === 1) {
|
|
47
|
-
const value = samples[0] ?? 0
|
|
48
|
-
return { mean: value, ci95: [value, value] }
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
const means: number[] = []
|
|
52
|
-
|
|
53
|
-
for (let i = 0; i < iterations; i++) {
|
|
54
|
-
// Resample with replacement - we know samples.length > 1 at this point
|
|
55
|
-
const resampled = Array.from(
|
|
56
|
-
{ length: samples.length },
|
|
57
|
-
() => samples[Math.floor(Math.random() * samples.length)] as number,
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
// Compute mean of resampled data
|
|
61
|
-
const sum = resampled.reduce((acc, val) => acc + val, 0)
|
|
62
|
-
means.push(sum / resampled.length)
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
// Sort means for percentile calculation
|
|
66
|
-
means.sort((a, b) => a - b)
|
|
67
|
-
|
|
68
|
-
// 95% CI: 2.5th and 97.5th percentiles
|
|
69
|
-
const lowerIdx = Math.floor(iterations * 0.025)
|
|
70
|
-
const upperIdx = Math.floor(iterations * 0.975)
|
|
71
|
-
|
|
72
|
-
return {
|
|
73
|
-
mean: means[Math.floor(iterations / 2)] ?? 0,
|
|
74
|
-
ci95: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
/**
|
|
79
|
-
* Get bootstrap iterations from environment variable.
|
|
80
|
-
*
|
|
81
|
-
* @returns Number of bootstrap iterations
|
|
82
|
-
*/
|
|
83
|
-
const getIterationsFromEnv = (): number => {
|
|
84
|
-
const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
|
|
85
|
-
if (!envValue) return DEFAULT_ITERATIONS
|
|
86
|
-
|
|
87
|
-
const parsed = Number.parseInt(envValue, 10)
|
|
88
|
-
return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
|
|
89
|
-
}
|
|
15
|
+
import { bootstrap, getBootstrapConfigFromEnv } from './bootstrap.ts'
|
|
90
16
|
|
|
91
17
|
/**
|
|
92
18
|
* Statistical significance comparison grader.
|
|
@@ -107,7 +33,7 @@ const getIterationsFromEnv = (): number => {
|
|
|
107
33
|
* @public
|
|
108
34
|
*/
|
|
109
35
|
export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
|
|
110
|
-
const iterations = getIterationsFromEnv()
|
|
36
|
+
const config = getBootstrapConfigFromEnv()
|
|
111
37
|
|
|
112
38
|
// Collect scores for each run
|
|
113
39
|
const runStats = Object.entries(runs).map(([label, run]) => {
|
|
@@ -116,13 +42,13 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
|
|
|
116
42
|
|
|
117
43
|
// For single-prompt comparison, we only have one sample
|
|
118
44
|
// In practice, this grader is most useful when aggregating across prompts
|
|
119
|
-
const stats = bootstrap([score], iterations)
|
|
45
|
+
const stats = bootstrap([score], config)
|
|
120
46
|
|
|
121
47
|
return { label, score, stats }
|
|
122
48
|
})
|
|
123
49
|
|
|
124
|
-
// Sort by bootstrap mean descending
|
|
125
|
-
const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
|
|
50
|
+
// Sort by bootstrap median descending
|
|
51
|
+
const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
|
|
126
52
|
|
|
127
53
|
// Check if winner is statistically significant
|
|
128
54
|
// CIs don't overlap = significant difference (approximately p<0.05)
|
|
@@ -131,7 +57,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
|
|
|
131
57
|
const second = sorted[1]
|
|
132
58
|
if (first && second) {
|
|
133
59
|
// Non-overlapping: first's lower bound > second's upper bound
|
|
134
|
-
isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
|
|
60
|
+
isSignificant = first.stats.ci[0] > second.stats.ci[1]
|
|
135
61
|
}
|
|
136
62
|
|
|
137
63
|
const reasoning = isSignificant
|
|
@@ -142,7 +68,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
|
|
|
142
68
|
rankings: sorted.map((s, i) => ({
|
|
143
69
|
run: s.label,
|
|
144
70
|
rank: i + 1,
|
|
145
|
-
score: s.stats.mean,
|
|
71
|
+
score: s.stats.median,
|
|
146
72
|
})),
|
|
147
73
|
reasoning,
|
|
148
74
|
}
|
|
@@ -156,28 +82,30 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
|
|
|
156
82
|
*
|
|
157
83
|
* @public
|
|
158
84
|
*/
|
|
159
|
-
export const createStatisticalGrader = (iterations
|
|
85
|
+
export const createStatisticalGrader = (iterations?: number): ComparisonGrader => {
|
|
86
|
+
const config = iterations ? { iterations } : getBootstrapConfigFromEnv()
|
|
87
|
+
|
|
160
88
|
return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
|
|
161
89
|
const runStats = Object.entries(runs).map(([label, run]) => {
|
|
162
90
|
const score = run.score?.score ?? 0
|
|
163
|
-
const stats = bootstrap([score], iterations)
|
|
91
|
+
const stats = bootstrap([score], config)
|
|
164
92
|
return { label, score, stats }
|
|
165
93
|
})
|
|
166
94
|
|
|
167
|
-
const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
|
|
95
|
+
const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
|
|
168
96
|
|
|
169
97
|
let isSignificant = false
|
|
170
98
|
const first = sorted[0]
|
|
171
99
|
const second = sorted[1]
|
|
172
100
|
if (first && second) {
|
|
173
|
-
isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
|
|
101
|
+
isSignificant = first.stats.ci[0] > second.stats.ci[1]
|
|
174
102
|
}
|
|
175
103
|
|
|
176
104
|
return {
|
|
177
105
|
rankings: sorted.map((s, i) => ({
|
|
178
106
|
run: s.label,
|
|
179
107
|
rank: i + 1,
|
|
180
|
-
score: s.stats.mean,
|
|
108
|
+
score: s.stats.median,
|
|
181
109
|
})),
|
|
182
110
|
reasoning: isSignificant
|
|
183
111
|
? `Winner "${first?.label}" is statistically significant (p<0.05)`
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for bootstrap sampling utilities.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { afterEach, describe, expect, test } from 'bun:test'
|
|
6
|
+
import { bootstrap, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from '../bootstrap.ts'
|
|
7
|
+
|
|
8
|
+
// Behavioural tests for bootstrap(): degenerate inputs, interval ordering,
// configuration handling and loose statistical sanity checks.
describe('bootstrap', () => {
  describe('edge cases', () => {
    test('returns {median: 0, ci: [0, 0]} for empty array', () => {
      const { median, ci } = bootstrap([])
      expect(median).toBe(0)
      expect(ci).toEqual([0, 0])
    })

    test('returns {median: value, ci: [value, value]} for single sample', () => {
      const { median, ci } = bootstrap([0.75])
      expect(median).toBe(0.75)
      expect(ci).toEqual([0.75, 0.75])
    })

    test('handles single sample of 0', () => {
      const { median, ci } = bootstrap([0])
      expect(median).toBe(0)
      expect(ci).toEqual([0, 0])
    })

    test('handles single sample of 1', () => {
      const { median, ci } = bootstrap([1])
      expect(median).toBe(1)
      expect(ci).toEqual([1, 1])
    })
  })

  describe('confidence interval bounds', () => {
    test('CI lower bound <= median <= CI upper bound', () => {
      const { median, ci } = bootstrap([0.5, 0.6, 0.7, 0.8, 0.9], { iterations: 1000 })
      const [lower, upper] = ci

      expect(lower).toBeLessThanOrEqual(median)
      expect(median).toBeLessThanOrEqual(upper)
    })

    test('CI contains the true median for uniform samples', () => {
      // Every resample of identical values has the same mean, so the
      // interval should collapse onto that value
      const { median, ci } = bootstrap([0.5, 0.5, 0.5, 0.5, 0.5], { iterations: 1000 })

      expect(median).toBeCloseTo(0.5, 2)
      expect(ci[0]).toBeCloseTo(0.5, 2)
      expect(ci[1]).toBeCloseTo(0.5, 2)
    })

    test('CI widens with more variance in samples', () => {
      // Width of the interval produced for a given sample set
      const intervalWidth = (values: number[]): number => {
        const { ci } = bootstrap(values, { iterations: 1000 })
        return ci[1] - ci[0]
      }

      const narrow = intervalWidth([0.49, 0.5, 0.51])
      const wide = intervalWidth([0.1, 0.5, 0.9])

      expect(wide).toBeGreaterThan(narrow)
    })
  })

  describe('configuration', () => {
    test('uses default iterations when not specified', () => {
      // Smoke test: the default configuration runs and yields a positive median
      const { median } = bootstrap([0.5, 0.6, 0.7])
      expect(median).toBeGreaterThan(0)
    })

    test('accepts custom iteration count', () => {
      const { median } = bootstrap([0.5, 0.6, 0.7], { iterations: 100 })
      expect(median).toBeGreaterThan(0)
    })

    test('accepts custom confidence level', () => {
      const values = [0.3, 0.4, 0.5, 0.6, 0.7]

      // A 90% interval is expected to be narrower than a 95% interval
      const at90 = bootstrap(values, { iterations: 1000, confidenceLevel: 0.9 })
      const at95 = bootstrap(values, { iterations: 1000, confidenceLevel: 0.95 })

      const narrowWidth = at90.ci[1] - at90.ci[0]
      const wideWidth = at95.ci[1] - at95.ci[0]

      // The two runs use independent random draws, so only require the 95%
      // width to reach 80% of the 90% width rather than strictly exceed it
      expect(wideWidth).toBeGreaterThanOrEqual(narrowWidth * 0.8)
    })
  })

  describe('statistical properties', () => {
    test('median is close to sample mean', () => {
      const values = [0.2, 0.4, 0.6, 0.8, 1.0]
      let total = 0
      for (const value of values) {
        total += value
      }
      const expectedMean = total / values.length

      const { median } = bootstrap(values, { iterations: 10000 })

      // For a symmetric sample the median of resampled means sits near the mean
      expect(median).toBeCloseTo(expectedMean, 1)
    })

    test('is deterministic-ish for large iteration counts', () => {
      const values = [0.3, 0.5, 0.7]

      // Independent runs with many iterations should agree to one decimal place
      const firstRun = bootstrap(values, { iterations: 10000 })
      const secondRun = bootstrap(values, { iterations: 10000 })

      expect(firstRun.median).toBeCloseTo(secondRun.median, 1)
    })
  })
})
|
|
118
|
+
|
|
119
|
+
// Tests for reading COMPARE_BOOTSTRAP_ITERATIONS; the variable's value at
// suite-definition time is restored after every test.
describe('getBootstrapConfigFromEnv', () => {
  const savedValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS

  afterEach(() => {
    if (savedValue !== undefined) {
      process.env.COMPARE_BOOTSTRAP_ITERATIONS = savedValue
    } else {
      delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
    }
  })

  test('returns default iterations when env var not set', () => {
    delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
    const { iterations } = getBootstrapConfigFromEnv()
    expect(iterations).toBe(DEFAULT_ITERATIONS)
  })

  test('parses valid iteration count from env', () => {
    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'
    const { iterations } = getBootstrapConfigFromEnv()
    expect(iterations).toBe(5000)
  })

  test('returns default for invalid (non-numeric) env value', () => {
    process.env.COMPARE_BOOTSTRAP_ITERATIONS = 'invalid'
    const { iterations } = getBootstrapConfigFromEnv()
    expect(iterations).toBe(DEFAULT_ITERATIONS)
  })

  test('returns default for iteration count below minimum (100)', () => {
    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '50'
    const { iterations } = getBootstrapConfigFromEnv()
    expect(iterations).toBe(DEFAULT_ITERATIONS)
  })

  test('accepts iteration count at minimum (100)', () => {
    process.env.COMPARE_BOOTSTRAP_ITERATIONS = '100'
    const { iterations } = getBootstrapConfigFromEnv()
    expect(iterations).toBe(100)
  })
})
|
|
160
|
+
|
|
161
|
+
// Pins the exported default values so an accidental change is caught.
describe('constants', () => {
  test('DEFAULT_ITERATIONS is 1000', () => {
    const expected = 1000
    expect(DEFAULT_ITERATIONS).toBe(expected)
  })

  test('DEFAULT_CONFIDENCE_LEVEL is 0.95', () => {
    const expected = 0.95
    expect(DEFAULT_CONFIDENCE_LEVEL).toBe(expected)
  })
})
|