@plaited/agent-eval-harness 0.7.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -78,6 +78,9 @@ cat prompts.jsonl | \
78
78
 
79
79
  # Compare runs (built-in strategies: weighted, statistical, custom)
80
80
  bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
81
+
82
+ # Compare trials for pass@k reliability analysis (auto-detects format)
83
+ bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
81
84
  ```
82
85
 
83
86
  ## Skills for AI Agents
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.7.0",
3
+ "version": "0.8.1",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -42,8 +42,10 @@
42
42
  "check:types": "tsc --noEmit",
43
43
  "check:write": "biome check --write && format-package --write",
44
44
  "prepare": "git rev-parse --git-dir > /dev/null 2>&1 && git config core.hooksPath .hooks || true",
45
- "test": "bun test ./**/tests/*.spec.ts",
46
- "test:integration": "bun test ./**/integration_tests/*.spec.ts"
45
+ "test": "bun run test:bin && bun test:src",
46
+ "test:bin": "bun test bin/tests/*.spec.ts",
47
+ "test:integration": "bun test ./**/integration_tests/*.spec.ts",
48
+ "test:src": "bun test src/**/tests/*.spec.ts"
47
49
  },
48
50
  "lint-staged": {
49
51
  "*.{js,cjs,jsx,tsx,ts}": [
@@ -0,0 +1,135 @@
/**
 * Shared bootstrap sampling utilities for confidence interval computation.
 *
 * @remarks
 * Bootstrap resampling provides robust confidence intervals without
 * assuming a specific distribution. For small samples, it's more
 * reliable than parametric methods.
 *
 * Environment variable configuration:
 * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000, minimum: 100)
 *
 * @packageDocumentation
 */

/** Default number of bootstrap iterations */
export const DEFAULT_ITERATIONS = 1000

/** Default confidence level (95%) */
export const DEFAULT_CONFIDENCE_LEVEL = 0.95

/** Smallest iteration count accepted from `COMPARE_BOOTSTRAP_ITERATIONS` */
const MIN_ENV_ITERATIONS = 100

/**
 * Confidence interval as [lower, upper] bounds.
 */
export type ConfidenceInterval = [number, number]

/**
 * Bootstrap confidence interval result.
 */
export type BootstrapResult = {
  /** Median of bootstrap sample means (50th percentile) */
  median: number
  /** Confidence interval [lower, upper] */
  ci: ConfidenceInterval
}

/**
 * Configuration for bootstrap sampling.
 */
export type BootstrapConfig = {
  /** Number of bootstrap iterations (default: 1000) */
  iterations?: number
  /** Confidence level between 0 and 1 (default: 0.95) */
  confidenceLevel?: number
}

/**
 * Compute a percentile bootstrap confidence interval for the sample mean.
 *
 * @remarks
 * The samples are resampled with replacement `iterations` times; the mean of
 * each resample is recorded, the means are sorted, and the interval bounds are
 * read off at the `alpha` and `1 - alpha` positions, where
 * `alpha = (1 - confidenceLevel) / 2`.
 *
 * Edge cases:
 * - Empty input returns `{ median: 0, ci: [0, 0] }`.
 * - A single sample returns that value as both the median and the bounds.
 * - An `iterations` value that is not a finite number >= 1 falls back to
 *   {@link DEFAULT_ITERATIONS}; fractional values are rounded down.
 * - A `confidenceLevel` outside `[0, 1]` (or NaN) falls back to
 *   {@link DEFAULT_CONFIDENCE_LEVEL}.
 *
 * Resampling uses `Math.random()`, so results vary slightly between calls.
 *
 * @param samples - Array of numeric samples
 * @param config - Optional bootstrap configuration
 * @returns Bootstrap median and confidence interval
 *
 * @public
 */
export const bootstrap = (samples: number[], config?: BootstrapConfig): BootstrapResult => {
  const sampleCount = samples.length

  if (sampleCount === 0) {
    return { median: 0, ci: [0, 0] }
  }

  if (sampleCount === 1) {
    const value = samples[0] ?? 0
    return { median: value, ci: [value, value] }
  }

  // Sanitize the configuration: a non-positive or non-finite iteration count
  // would leave no resampled means to read percentiles from.
  const requestedIterations = config?.iterations ?? DEFAULT_ITERATIONS
  const iterations =
    Number.isFinite(requestedIterations) && requestedIterations >= 1
      ? Math.floor(requestedIterations)
      : DEFAULT_ITERATIONS

  // NaN fails both comparisons, so it also falls back to the default.
  const requestedLevel = config?.confidenceLevel ?? DEFAULT_CONFIDENCE_LEVEL
  const confidenceLevel = requestedLevel >= 0 && requestedLevel <= 1 ? requestedLevel : DEFAULT_CONFIDENCE_LEVEL

  const means: number[] = []

  for (let i = 0; i < iterations; i++) {
    // Resample with replacement and accumulate the sum directly;
    // sampleCount > 1 here, so every drawn index is in range.
    let sum = 0
    for (let j = 0; j < sampleCount; j++) {
      sum += samples[Math.floor(Math.random() * sampleCount)] as number
    }
    means.push(sum / sampleCount)
  }

  // Sort means for percentile calculation
  means.sort((a, b) => a - b)

  // For 95% CI: lower = 2.5th percentile, upper = 97.5th percentile.
  // Indices are clamped because `iterations * (1 - alpha)` equals
  // `iterations` (one past the end) when confidenceLevel is 1.
  const lastIdx = means.length - 1
  const clampIdx = (idx: number): number => Math.min(lastIdx, Math.max(0, idx))
  const alpha = (1 - confidenceLevel) / 2
  const lowerIdx = clampIdx(Math.floor(iterations * alpha))
  const upperIdx = clampIdx(Math.floor(iterations * (1 - alpha)))
  const medianIdx = clampIdx(Math.floor(iterations / 2))

  return {
    median: means[medianIdx] ?? 0,
    ci: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
  }
}

/**
 * Format confidence interval as string.
 *
 * @param ci - Confidence interval [lower, upper]
 * @param decimals - Number of decimal places (default: 3)
 * @returns Formatted CI string such as `[0.123, 0.988]`, or an empty string
 * when `ci` is undefined
 *
 * @public
 */
export const formatCI = (ci: ConfidenceInterval | undefined, decimals: number = 3): string => {
  if (!ci) return ''
  return `[${ci[0].toFixed(decimals)}, ${ci[1].toFixed(decimals)}]`
}

/**
 * Get bootstrap configuration from environment variables.
 *
 * @remarks
 * Reads configuration from:
 * - `COMPARE_BOOTSTRAP_ITERATIONS`: Number of iterations (min: 100)
 *
 * The value must be a plain decimal integer (surrounding whitespace and a
 * leading `+` are tolerated). Unset, empty, malformed (e.g. `500abc`,
 * `1000.7`) or below-minimum values fall back to {@link DEFAULT_ITERATIONS}.
 *
 * @returns Bootstrap configuration
 *
 * @public
 */
export const getBootstrapConfigFromEnv = (): BootstrapConfig => {
  const raw = process.env.COMPARE_BOOTSTRAP_ITERATIONS?.trim()
  if (!raw) return { iterations: DEFAULT_ITERATIONS }

  // parseInt alone would silently accept trailing garbage ("500abc" -> 500),
  // so require the whole value to be decimal digits first.
  if (!/^\+?\d+$/.test(raw)) return { iterations: DEFAULT_ITERATIONS }

  const parsed = Number.parseInt(raw, 10)
  const isUsable = Number.isSafeInteger(parsed) && parsed >= MIN_ENV_ITERATIONS

  return { iterations: isUsable ? parsed : DEFAULT_ITERATIONS }
}
@@ -12,81 +12,7 @@
12
12
  */
13
13
 
14
14
  import type { ComparisonGrader, ComparisonGraderInput, ComparisonGraderResult } from '../pipeline/pipeline.types.ts'
15
-
16
- /** Default number of bootstrap iterations */
17
- const DEFAULT_ITERATIONS = 1000
18
-
19
- /**
20
- * Bootstrap confidence interval result.
21
- */
22
- type BootstrapResult = {
23
- /** Estimated mean from bootstrap */
24
- mean: number
25
- /** 95% confidence interval [lower, upper] */
26
- ci95: [number, number]
27
- }
28
-
29
- /**
30
- * Compute bootstrap confidence interval for sample mean.
31
- *
32
- * @remarks
33
- * Bootstrap resampling provides robust confidence intervals without
34
- * assuming a specific distribution. For small samples, it's more
35
- * reliable than parametric methods.
36
- *
37
- * @param samples - Array of numeric samples
38
- * @param iterations - Number of bootstrap iterations
39
- * @returns Bootstrap mean and 95% confidence interval
40
- */
41
- const bootstrap = (samples: number[], iterations: number = DEFAULT_ITERATIONS): BootstrapResult => {
42
- if (samples.length === 0) {
43
- return { mean: 0, ci95: [0, 0] }
44
- }
45
-
46
- if (samples.length === 1) {
47
- const value = samples[0] ?? 0
48
- return { mean: value, ci95: [value, value] }
49
- }
50
-
51
- const means: number[] = []
52
-
53
- for (let i = 0; i < iterations; i++) {
54
- // Resample with replacement - we know samples.length > 1 at this point
55
- const resampled = Array.from(
56
- { length: samples.length },
57
- () => samples[Math.floor(Math.random() * samples.length)] as number,
58
- )
59
-
60
- // Compute mean of resampled data
61
- const sum = resampled.reduce((acc, val) => acc + val, 0)
62
- means.push(sum / resampled.length)
63
- }
64
-
65
- // Sort means for percentile calculation
66
- means.sort((a, b) => a - b)
67
-
68
- // 95% CI: 2.5th and 97.5th percentiles
69
- const lowerIdx = Math.floor(iterations * 0.025)
70
- const upperIdx = Math.floor(iterations * 0.975)
71
-
72
- return {
73
- mean: means[Math.floor(iterations / 2)] ?? 0,
74
- ci95: [means[lowerIdx] ?? 0, means[upperIdx] ?? 0],
75
- }
76
- }
77
-
78
- /**
79
- * Get bootstrap iterations from environment variable.
80
- *
81
- * @returns Number of bootstrap iterations
82
- */
83
- const getIterationsFromEnv = (): number => {
84
- const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
85
- if (!envValue) return DEFAULT_ITERATIONS
86
-
87
- const parsed = Number.parseInt(envValue, 10)
88
- return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
89
- }
15
+ import { bootstrap, getBootstrapConfigFromEnv } from './bootstrap.ts'
90
16
 
91
17
  /**
92
18
  * Statistical significance comparison grader.
@@ -107,7 +33,7 @@ const getIterationsFromEnv = (): number => {
107
33
  * @public
108
34
  */
109
35
  export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
110
- const iterations = getIterationsFromEnv()
36
+ const config = getBootstrapConfigFromEnv()
111
37
 
112
38
  // Collect scores for each run
113
39
  const runStats = Object.entries(runs).map(([label, run]) => {
@@ -116,13 +42,13 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
116
42
 
117
43
  // For single-prompt comparison, we only have one sample
118
44
  // In practice, this grader is most useful when aggregating across prompts
119
- const stats = bootstrap([score], iterations)
45
+ const stats = bootstrap([score], config)
120
46
 
121
47
  return { label, score, stats }
122
48
  })
123
49
 
124
- // Sort by bootstrap mean descending
125
- const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
50
+ // Sort by bootstrap median descending
51
+ const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
126
52
 
127
53
  // Check if winner is statistically significant
128
54
  // CIs don't overlap = significant difference (approximately p<0.05)
@@ -131,7 +57,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
131
57
  const second = sorted[1]
132
58
  if (first && second) {
133
59
  // Non-overlapping: first's lower bound > second's upper bound
134
- isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
60
+ isSignificant = first.stats.ci[0] > second.stats.ci[1]
135
61
  }
136
62
 
137
63
  const reasoning = isSignificant
@@ -142,7 +68,7 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
142
68
  rankings: sorted.map((s, i) => ({
143
69
  run: s.label,
144
70
  rank: i + 1,
145
- score: s.stats.mean,
71
+ score: s.stats.median,
146
72
  })),
147
73
  reasoning,
148
74
  }
@@ -156,28 +82,30 @@ export const grade: ComparisonGrader = async ({ runs }: ComparisonGraderInput):
156
82
  *
157
83
  * @public
158
84
  */
159
- export const createStatisticalGrader = (iterations: number = DEFAULT_ITERATIONS): ComparisonGrader => {
85
+ export const createStatisticalGrader = (iterations?: number): ComparisonGrader => {
86
+ const config = iterations ? { iterations } : getBootstrapConfigFromEnv()
87
+
160
88
  return async ({ runs }: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
161
89
  const runStats = Object.entries(runs).map(([label, run]) => {
162
90
  const score = run.score?.score ?? 0
163
- const stats = bootstrap([score], iterations)
91
+ const stats = bootstrap([score], config)
164
92
  return { label, score, stats }
165
93
  })
166
94
 
167
- const sorted = runStats.sort((a, b) => b.stats.mean - a.stats.mean)
95
+ const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
168
96
 
169
97
  let isSignificant = false
170
98
  const first = sorted[0]
171
99
  const second = sorted[1]
172
100
  if (first && second) {
173
- isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
101
+ isSignificant = first.stats.ci[0] > second.stats.ci[1]
174
102
  }
175
103
 
176
104
  return {
177
105
  rankings: sorted.map((s, i) => ({
178
106
  run: s.label,
179
107
  rank: i + 1,
180
- score: s.stats.mean,
108
+ score: s.stats.median,
181
109
  })),
182
110
  reasoning: isSignificant
183
111
  ? `Winner "${first?.label}" is statistically significant (p<0.05)`
@@ -0,0 +1,169 @@
1
+ /**
2
+ * Unit tests for bootstrap sampling utilities.
3
+ */
4
+
5
+ import { afterEach, describe, expect, test } from 'bun:test'
6
+ import { bootstrap, DEFAULT_CONFIDENCE_LEVEL, DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from '../bootstrap.ts'
7
+
8
+ describe('bootstrap', () => {
9
+ describe('edge cases', () => {
10
+ test('returns {median: 0, ci: [0, 0]} for empty array', () => {
11
+ const result = bootstrap([])
12
+ expect(result.median).toBe(0)
13
+ expect(result.ci).toEqual([0, 0])
14
+ })
15
+
16
+ test('returns {median: value, ci: [value, value]} for single sample', () => {
17
+ const result = bootstrap([0.75])
18
+ expect(result.median).toBe(0.75)
19
+ expect(result.ci).toEqual([0.75, 0.75])
20
+ })
21
+
22
+ test('handles single sample of 0', () => {
23
+ const result = bootstrap([0])
24
+ expect(result.median).toBe(0)
25
+ expect(result.ci).toEqual([0, 0])
26
+ })
27
+
28
+ test('handles single sample of 1', () => {
29
+ const result = bootstrap([1])
30
+ expect(result.median).toBe(1)
31
+ expect(result.ci).toEqual([1, 1])
32
+ })
33
+ })
34
+
35
+ describe('confidence interval bounds', () => {
36
+ test('CI lower bound <= median <= CI upper bound', () => {
37
+ const samples = [0.5, 0.6, 0.7, 0.8, 0.9]
38
+ const result = bootstrap(samples, { iterations: 1000 })
39
+
40
+ expect(result.ci[0]).toBeLessThanOrEqual(result.median)
41
+ expect(result.median).toBeLessThanOrEqual(result.ci[1])
42
+ })
43
+
44
+ test('CI contains the true median for uniform samples', () => {
45
+ // For identical samples, CI should collapse to the value
46
+ const samples = [0.5, 0.5, 0.5, 0.5, 0.5]
47
+ const result = bootstrap(samples, { iterations: 1000 })
48
+
49
+ expect(result.median).toBeCloseTo(0.5, 2)
50
+ expect(result.ci[0]).toBeCloseTo(0.5, 2)
51
+ expect(result.ci[1]).toBeCloseTo(0.5, 2)
52
+ })
53
+
54
+ test('CI widens with more variance in samples', () => {
55
+ const lowVariance = [0.49, 0.5, 0.51]
56
+ const highVariance = [0.1, 0.5, 0.9]
57
+
58
+ const lowResult = bootstrap(lowVariance, { iterations: 1000 })
59
+ const highResult = bootstrap(highVariance, { iterations: 1000 })
60
+
61
+ const lowWidth = lowResult.ci[1] - lowResult.ci[0]
62
+ const highWidth = highResult.ci[1] - highResult.ci[0]
63
+
64
+ expect(highWidth).toBeGreaterThan(lowWidth)
65
+ })
66
+ })
67
+
68
+ describe('configuration', () => {
69
+ test('uses default iterations when not specified', () => {
70
+ // Just verify it runs without error with defaults
71
+ const result = bootstrap([0.5, 0.6, 0.7])
72
+ expect(result.median).toBeGreaterThan(0)
73
+ })
74
+
75
+ test('accepts custom iteration count', () => {
76
+ const result = bootstrap([0.5, 0.6, 0.7], { iterations: 100 })
77
+ expect(result.median).toBeGreaterThan(0)
78
+ })
79
+
80
+ test('accepts custom confidence level', () => {
81
+ const samples = [0.3, 0.4, 0.5, 0.6, 0.7]
82
+
83
+ // 90% CI should be narrower than 95% CI
84
+ const ci90 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.9 })
85
+ const ci95 = bootstrap(samples, { iterations: 1000, confidenceLevel: 0.95 })
86
+
87
+ const width90 = ci90.ci[1] - ci90.ci[0]
88
+ const width95 = ci95.ci[1] - ci95.ci[0]
89
+
90
+ // 95% CI should generally be wider than 90% CI
91
+ // Allow some tolerance due to randomness
92
+ expect(width95).toBeGreaterThanOrEqual(width90 * 0.8)
93
+ })
94
+ })
95
+
96
+ describe('statistical properties', () => {
97
+ test('median is close to sample mean', () => {
98
+ const samples = [0.2, 0.4, 0.6, 0.8, 1.0]
99
+ const sampleMean = samples.reduce((a, b) => a + b, 0) / samples.length
100
+
101
+ const result = bootstrap(samples, { iterations: 10000 })
102
+
103
+ // Bootstrap median should be close to sample mean for symmetric distributions
104
+ expect(result.median).toBeCloseTo(sampleMean, 1)
105
+ })
106
+
107
+ test('is deterministic-ish for large iteration counts', () => {
108
+ const samples = [0.3, 0.5, 0.7]
109
+
110
+ // With many iterations, results should be similar across runs
111
+ const result1 = bootstrap(samples, { iterations: 10000 })
112
+ const result2 = bootstrap(samples, { iterations: 10000 })
113
+
114
+ expect(result1.median).toBeCloseTo(result2.median, 1)
115
+ })
116
+ })
117
+ })
118
+
119
+ describe('getBootstrapConfigFromEnv', () => {
120
+ const originalEnv = process.env.COMPARE_BOOTSTRAP_ITERATIONS
121
+
122
+ afterEach(() => {
123
+ if (originalEnv === undefined) {
124
+ delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
125
+ } else {
126
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = originalEnv
127
+ }
128
+ })
129
+
130
+ test('returns default iterations when env var not set', () => {
131
+ delete process.env.COMPARE_BOOTSTRAP_ITERATIONS
132
+ const config = getBootstrapConfigFromEnv()
133
+ expect(config.iterations).toBe(DEFAULT_ITERATIONS)
134
+ })
135
+
136
+ test('parses valid iteration count from env', () => {
137
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = '5000'
138
+ const config = getBootstrapConfigFromEnv()
139
+ expect(config.iterations).toBe(5000)
140
+ })
141
+
142
+ test('returns default for invalid (non-numeric) env value', () => {
143
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = 'invalid'
144
+ const config = getBootstrapConfigFromEnv()
145
+ expect(config.iterations).toBe(DEFAULT_ITERATIONS)
146
+ })
147
+
148
+ test('returns default for iteration count below minimum (100)', () => {
149
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = '50'
150
+ const config = getBootstrapConfigFromEnv()
151
+ expect(config.iterations).toBe(DEFAULT_ITERATIONS)
152
+ })
153
+
154
+ test('accepts iteration count at minimum (100)', () => {
155
+ process.env.COMPARE_BOOTSTRAP_ITERATIONS = '100'
156
+ const config = getBootstrapConfigFromEnv()
157
+ expect(config.iterations).toBe(100)
158
+ })
159
+ })
160
+
161
+ describe('constants', () => {
162
+ test('DEFAULT_ITERATIONS is 1000', () => {
163
+ expect(DEFAULT_ITERATIONS).toBe(1000)
164
+ })
165
+
166
+ test('DEFAULT_CONFIDENCE_LEVEL is 0.95', () => {
167
+ expect(DEFAULT_CONFIDENCE_LEVEL).toBe(0.95)
168
+ })
169
+ })