@plaited/agent-eval-harness 0.7.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,358 @@
1
+ /**
2
+ * Unit tests for built-in trials comparison graders.
3
+ *
4
+ * @remarks
5
+ * Tests for:
6
+ * - trials-compare-weighted: Configurable weighted grader for trials
7
+ * - trials-compare-statistical: Bootstrap confidence interval grader for trials
8
+ *
9
+ * @packageDocumentation
10
+ */
11
+
12
+ import { describe, expect, test } from 'bun:test'
13
+ import type { TrialsComparisonGraderInput, TrialsComparisonRunData } from '../../pipeline/pipeline.types.ts'
14
+ import { createTrialsStatisticalGrader, grade as statisticalGrade } from '../trials-compare-statistical.ts'
15
+ import { createTrialsWeightedGrader, DEFAULT_TRIALS_WEIGHTS, type TrialsWeights } from '../trials-compare-weighted.ts'
16
+
17
+ // ============================================================================
18
+ // Test Fixtures
19
+ // ============================================================================
20
+
21
+ const createMockTrialRuns = (
22
+ overrides: Partial<Record<string, Partial<TrialsComparisonRunData>>> = {},
23
+ ): Record<string, TrialsComparisonRunData> => ({
24
+ baseline: {
25
+ passRate: 0.67,
26
+ passAtK: 0.9,
27
+ passExpK: 0.3,
28
+ k: 3,
29
+ trials: [
30
+ { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true, score: 1.0 },
31
+ { trialNum: 2, output: 'B', trajectory: [], duration: 110, pass: true, score: 0.9 },
32
+ { trialNum: 3, output: 'C', trajectory: [], duration: 120, pass: false, score: 0.2 },
33
+ ],
34
+ ...overrides.baseline,
35
+ },
36
+ variant: {
37
+ passRate: 1.0,
38
+ passAtK: 1.0,
39
+ passExpK: 1.0,
40
+ k: 3,
41
+ trials: [
42
+ { trialNum: 1, output: 'X', trajectory: [], duration: 150, pass: true, score: 1.0 },
43
+ { trialNum: 2, output: 'Y', trajectory: [], duration: 160, pass: true, score: 1.0 },
44
+ { trialNum: 3, output: 'Z', trajectory: [], duration: 170, pass: true, score: 1.0 },
45
+ ],
46
+ ...overrides.variant,
47
+ },
48
+ })
49
+
50
+ const createMockTrialInput = (runs: Record<string, TrialsComparisonRunData>): TrialsComparisonGraderInput => ({
51
+ id: 'test-001',
52
+ input: 'Test prompt',
53
+ hint: 'Expected output',
54
+ runs,
55
+ })
56
+
57
+ // ============================================================================
58
+ // Weighted Grader Tests
59
+ // ============================================================================
60
+
61
+ describe('trials-compare-weighted grader', () => {
62
+ describe('DEFAULT_TRIALS_WEIGHTS', () => {
63
+ test('has expected default values', () => {
64
+ expect(DEFAULT_TRIALS_WEIGHTS.capability).toBe(0.4)
65
+ expect(DEFAULT_TRIALS_WEIGHTS.reliability).toBe(0.4)
66
+ expect(DEFAULT_TRIALS_WEIGHTS.consistency).toBe(0.2)
67
+ })
68
+
69
+ test('weights sum to 1.0', () => {
70
+ const sum =
71
+ DEFAULT_TRIALS_WEIGHTS.capability + DEFAULT_TRIALS_WEIGHTS.reliability + DEFAULT_TRIALS_WEIGHTS.consistency
72
+ expect(sum).toBe(1.0)
73
+ })
74
+ })
75
+
76
+ describe('createTrialsWeightedGrader', () => {
77
+ test('returns higher rank for better passAtK when capability weight is high', async () => {
78
+ const grader = createTrialsWeightedGrader({ capability: 1.0, reliability: 0.0, consistency: 0.0 })
79
+ const runs = createMockTrialRuns({
80
+ baseline: { passAtK: 0.7 },
81
+ variant: { passAtK: 0.95 },
82
+ })
83
+ const input = createMockTrialInput(runs)
84
+
85
+ const result = await grader(input)
86
+
87
+ expect(result.rankings.length).toBe(2)
88
+ expect(result.rankings[0]?.run).toBe('variant')
89
+ expect(result.rankings[0]?.rank).toBe(1)
90
+ })
91
+
92
+ test('returns higher rank for better passExpK when reliability weight is high', async () => {
93
+ const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 1.0, consistency: 0.0 })
94
+ const runs = createMockTrialRuns({
95
+ baseline: { passExpK: 0.9 },
96
+ variant: { passExpK: 0.3 },
97
+ })
98
+ const input = createMockTrialInput(runs)
99
+
100
+ const result = await grader(input)
101
+
102
+ expect(result.rankings[0]?.run).toBe('baseline')
103
+ })
104
+
105
+ test('penalizes flaky runs when consistency weight is high', async () => {
106
+ const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
107
+ const runs = createMockTrialRuns({
108
+ // baseline: passAtK=0.9, passExpK=0.3, flakiness=0.6
109
+ baseline: { passAtK: 0.9, passExpK: 0.3 },
110
+ // variant: passAtK=0.8, passExpK=0.8, flakiness=0.0
111
+ variant: { passAtK: 0.8, passExpK: 0.8 },
112
+ })
113
+ const input = createMockTrialInput(runs)
114
+
115
+ const result = await grader(input)
116
+
117
+ // Variant should win due to lower flakiness (higher consistency)
118
+ expect(result.rankings[0]?.run).toBe('variant')
119
+ })
120
+
121
+ test('includes weights in reasoning', async () => {
122
+ const weights: TrialsWeights = { capability: 0.5, reliability: 0.3, consistency: 0.2 }
123
+ const grader = createTrialsWeightedGrader(weights)
124
+ const input = createMockTrialInput(createMockTrialRuns())
125
+
126
+ const result = await grader(input)
127
+
128
+ expect(result.reasoning).toContain('capability=0.5')
129
+ expect(result.reasoning).toContain('reliability=0.3')
130
+ expect(result.reasoning).toContain('consistency=0.2')
131
+ })
132
+
133
+ test('handles missing passAtK gracefully (treats as 0)', async () => {
134
+ const grader = createTrialsWeightedGrader()
135
+ const runs: Record<string, TrialsComparisonRunData> = {
136
+ baseline: {
137
+ k: 3,
138
+ trials: [],
139
+ },
140
+ variant: {
141
+ passAtK: 0.8,
142
+ passExpK: 0.5,
143
+ k: 3,
144
+ trials: [],
145
+ },
146
+ }
147
+ const input = createMockTrialInput(runs)
148
+
149
+ const result = await grader(input)
150
+
151
+ // Should not throw; variant should rank higher
152
+ expect(result.rankings.length).toBe(2)
153
+ expect(result.rankings[0]?.run).toBe('variant')
154
+ })
155
+
156
+ test('handles three or more runs', async () => {
157
+ const grader = createTrialsWeightedGrader()
158
+ const runs: Record<string, TrialsComparisonRunData> = {
159
+ a: { passAtK: 0.9, passExpK: 0.8, k: 3, trials: [] },
160
+ b: { passAtK: 0.7, passExpK: 0.7, k: 3, trials: [] },
161
+ c: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
162
+ }
163
+ const input = createMockTrialInput(runs)
164
+
165
+ const result = await grader(input)
166
+
167
+ expect(result.rankings.length).toBe(3)
168
+ // Ranks should be 1, 2, 3
169
+ expect(result.rankings.map((r) => r.rank)).toEqual([1, 2, 3])
170
+ })
171
+ })
172
+ })
173
+
174
+ // ============================================================================
175
+ // Statistical Grader Tests
176
+ // ============================================================================
177
+
178
+ describe('trials-compare-statistical grader', () => {
179
+ describe('createTrialsStatisticalGrader', () => {
180
+ test('returns rankings based on bootstrapped passAtK', async () => {
181
+ const grader = createTrialsStatisticalGrader(100)
182
+ const runs = createMockTrialRuns({
183
+ baseline: { passAtK: 0.6 },
184
+ variant: { passAtK: 0.95 },
185
+ })
186
+ const input = createMockTrialInput(runs)
187
+
188
+ const result = await grader(input)
189
+
190
+ expect(result.rankings.length).toBe(2)
191
+ expect(result.rankings[0]?.run).toBe('variant')
192
+ })
193
+
194
+ test('uses trial outcomes for bootstrap variance estimation', async () => {
195
+ const grader = createTrialsStatisticalGrader(100)
196
+ // All trials pass for variant, mixed for baseline
197
+ const runs: Record<string, TrialsComparisonRunData> = {
198
+ baseline: {
199
+ passAtK: 0.9,
200
+ passExpK: 0.3,
201
+ k: 5,
202
+ trials: [
203
+ { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true },
204
+ { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: true },
205
+ { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
206
+ { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: true },
207
+ { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
208
+ ],
209
+ },
210
+ variant: {
211
+ passAtK: 1.0,
212
+ passExpK: 1.0,
213
+ k: 5,
214
+ trials: [
215
+ { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
216
+ { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
217
+ { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
218
+ { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
219
+ { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
220
+ ],
221
+ },
222
+ }
223
+ const input = createMockTrialInput(runs)
224
+
225
+ const result = await grader(input)
226
+
227
+ // Variant with 100% pass rate should rank higher
228
+ expect(result.rankings[0]?.run).toBe('variant')
229
+ })
230
+
231
+ test('indicates significance when passAtK differs substantially', async () => {
232
+ const grader = createTrialsStatisticalGrader(500)
233
+ // Strong difference: all pass vs all fail
234
+ const runs: Record<string, TrialsComparisonRunData> = {
235
+ baseline: {
236
+ passAtK: 0,
237
+ k: 5,
238
+ trials: [
239
+ { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: false },
240
+ { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: false },
241
+ { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
242
+ { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: false },
243
+ { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
244
+ ],
245
+ },
246
+ variant: {
247
+ passAtK: 1.0,
248
+ k: 5,
249
+ trials: [
250
+ { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
251
+ { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
252
+ { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
253
+ { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
254
+ { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
255
+ ],
256
+ },
257
+ }
258
+ const input = createMockTrialInput(runs)
259
+
260
+ const result = await grader(input)
261
+
262
+ expect(result.reasoning).toContain('clear separation')
263
+ })
264
+
265
+ test('handles empty trials array', async () => {
266
+ const grader = createTrialsStatisticalGrader(100)
267
+ const runs: Record<string, TrialsComparisonRunData> = {
268
+ baseline: { k: 3, trials: [] },
269
+ variant: {
270
+ k: 3,
271
+ trials: [{ trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true }],
272
+ },
273
+ }
274
+ const input = createMockTrialInput(runs)
275
+
276
+ const result = await grader(input)
277
+
278
+ // Should not throw
279
+ expect(result.rankings.length).toBe(2)
280
+ })
281
+ })
282
+
283
+ describe('grade function', () => {
284
+ test('works with default iterations', async () => {
285
+ const runs = createMockTrialRuns()
286
+ const input = createMockTrialInput(runs)
287
+
288
+ const result = await statisticalGrade(input)
289
+
290
+ expect(result.rankings).toBeDefined()
291
+ expect(result.rankings.length).toBe(2)
292
+ })
293
+ })
294
+ })
295
+
296
+ // ============================================================================
297
+ // Edge Case Tests
298
+ // ============================================================================
299
+
300
+ describe('trials comparison grader edge cases', () => {
301
+ test('handles single run gracefully', async () => {
302
+ const grader = createTrialsWeightedGrader()
303
+ const runs: Record<string, TrialsComparisonRunData> = {
304
+ only: { passAtK: 1.0, passExpK: 0.8, k: 3, trials: [] },
305
+ }
306
+ const input = createMockTrialInput(runs)
307
+
308
+ const result = await grader(input)
309
+
310
+ expect(result.rankings.length).toBe(1)
311
+ expect(result.rankings[0]?.rank).toBe(1)
312
+ })
313
+
314
+ test('handles zero passAtK and passExpK', async () => {
315
+ const grader = createTrialsWeightedGrader()
316
+ const runs: Record<string, TrialsComparisonRunData> = {
317
+ baseline: { passAtK: 0, passExpK: 0, k: 3, trials: [] },
318
+ variant: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
319
+ }
320
+ const input = createMockTrialInput(runs)
321
+
322
+ const result = await grader(input)
323
+
324
+ expect(result.rankings[0]?.run).toBe('variant')
325
+ })
326
+
327
+ test('deterministic ordering for equal scores', async () => {
328
+ const grader = createTrialsWeightedGrader()
329
+ const runs = createMockTrialRuns({
330
+ baseline: { passAtK: 0.8, passExpK: 0.6 },
331
+ variant: { passAtK: 0.8, passExpK: 0.6 },
332
+ })
333
+ const input = createMockTrialInput(runs)
334
+
335
+ // Run multiple times to check stability
336
+ const results = await Promise.all([grader(input), grader(input), grader(input)])
337
+
338
+ // All should have same ordering
339
+ const orders = results.map((r) => r.rankings.map((rank) => rank.run).join(','))
340
+ expect(new Set(orders).size).toBe(1)
341
+ })
342
+
343
+ test('flakiness is clamped to non-negative', async () => {
344
+ // Edge case: passExpK > passAtK shouldn't happen but handle gracefully
345
+ const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
346
+ const runs: Record<string, TrialsComparisonRunData> = {
347
+ baseline: { passAtK: 0.5, passExpK: 0.7, k: 3, trials: [] }, // Invalid but should work
348
+ variant: { passAtK: 0.8, passExpK: 0.8, k: 3, trials: [] },
349
+ }
350
+ const input = createMockTrialInput(runs)
351
+
352
+ const result = await grader(input)
353
+
354
+ // Both should have flakiness 0, so consistency score should be 1.0 for both
355
+ // With flakiness clamped to 0, the consistency scores tie, so we only assert that rankings are produced
356
+ expect(result.rankings).toBeDefined()
357
+ })
358
+ })
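
Worked example: applying the default weights from trials-compare-weighted.ts (further down in this diff) to the baseline/variant fixture used throughout these tests yields the ranking the suite asserts. A minimal sketch of the arithmetic, using only values and formulas that appear in the diff:

const weights = { capability: 0.4, reliability: 0.4, consistency: 0.2 }
// baseline fixture: passAtK = 0.9, passExpK = 0.3
const baselineFlakiness = Math.max(0, 0.9 - 0.3) // 0.6
const baselineScore =
  0.9 * weights.capability + 0.3 * weights.reliability + (1 - baselineFlakiness) * weights.consistency // 0.56
// variant fixture: passAtK = passExpK = 1.0, so flakiness = 0 and consistency = 1.0
const variantScore = 1.0 * weights.capability + 1.0 * weights.reliability + 1.0 * weights.consistency // 1.0
// variantScore > baselineScore, so the variant run is ranked first
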
@@ -0,0 +1,183 @@
1
+ /**
2
+ * Built-in statistical significance comparison grader for trials data.
3
+ *
4
+ * @remarks
5
+ * Uses bootstrap sampling to compute confidence intervals for passAtK and passExpK.
6
+ * Flags whether the winner's lead is statistically significant (non-overlapping 95% CIs, approximately p<0.05).
7
+ *
8
+ * Unlike the capture statistical grader, which has only one score per prompt,
9
+ * trials data has multiple trial results per prompt, enabling proper bootstrap
10
+ * variance estimation.
11
+ *
12
+ * Bootstrap iterations can be customized via an environment variable:
13
+ * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
14
+ *
15
+ * @packageDocumentation
16
+ */
17
+
18
+ import type {
19
+ ComparisonGraderResult,
20
+ TrialsComparisonGrader,
21
+ TrialsComparisonGraderInput,
22
+ } from '../pipeline/pipeline.types.ts'
23
+ import { DEFAULT_ITERATIONS, getBootstrapConfigFromEnv } from './bootstrap.ts'
24
+
25
+ /**
26
+ * Bootstrap confidence interval result.
27
+ */
28
+ type BootstrapResult = {
29
+ /** Median estimate from bootstrap samples (more robust than mean) */
30
+ median: number
31
+ /** 95% confidence interval [lower, upper] */
32
+ ci95: [number, number]
33
+ }
34
+
35
+ /**
36
+ * Compute passAtK estimate from trial pass/fail samples via bootstrap.
37
+ *
38
+ * @remarks
39
+ * passAtK = 1 - (1 - p)^k where p is estimated pass rate.
40
+ * We bootstrap the pass rate and compute passAtK from each bootstrap sample.
41
+ *
42
+ * @param trials - Array of 0/1 values (0=fail, 1=pass)
43
+ * @param k - Number of attempts (the k used in computing passAtK = 1 - (1 - p)^k)
44
+ * @param iterations - Number of bootstrap iterations
45
+ * @returns Bootstrap estimate and CI for passAtK
46
+ */
47
+ const bootstrapPassAtK = (trials: number[], k: number, iterations: number): BootstrapResult => {
48
+ if (trials.length === 0) {
49
+ return { median: 0, ci95: [0, 0] }
50
+ }
51
+
52
+ const passAtKValues: number[] = []
53
+
54
+ for (let i = 0; i < iterations; i++) {
55
+ // Resample with replacement
56
+ const resampled = Array.from(
57
+ { length: trials.length },
58
+ () => trials[Math.floor(Math.random() * trials.length)] as number,
59
+ )
60
+
61
+ // Compute pass rate from resample
62
+ const passRate = resampled.reduce((acc, val) => acc + val, 0) / resampled.length
63
+
64
+ // Compute passAtK: probability of at least one pass in k samples
65
+ // passAtK = 1 - (1 - p)^k
66
+ const passAtK = 1 - (1 - passRate) ** k
67
+ passAtKValues.push(passAtK)
68
+ }
69
+
70
+ // Sort for percentile calculation
71
+ passAtKValues.sort((a, b) => a - b)
72
+
73
+ const lowerIdx = Math.floor(iterations * 0.025)
74
+ const upperIdx = Math.floor(iterations * 0.975)
75
+
76
+ return {
77
+ median: passAtKValues[Math.floor(iterations / 2)] ?? 0,
78
+ ci95: [passAtKValues[lowerIdx] ?? 0, passAtKValues[upperIdx] ?? 0],
79
+ }
80
+ }
81
+
82
+ /**
83
+ * Get bootstrap iterations from environment or use default.
84
+ *
85
+ * @returns Number of bootstrap iterations
86
+ */
87
+ const getIterations = (): number => {
88
+ const config = getBootstrapConfigFromEnv()
89
+ return config.iterations ?? DEFAULT_ITERATIONS
90
+ }
91
+
92
+ /**
93
+ * Statistical significance trials comparison grader.
94
+ *
95
+ * @remarks
96
+ * Compares runs using bootstrap sampling on trial outcomes to determine
97
+ * if differences in passAtK are statistically significant.
98
+ *
99
+ * Unlike single-sample comparisons, trials data provides multiple samples
100
+ * per prompt (k trials), enabling meaningful variance estimation.
101
+ *
102
+ * @public
103
+ */
104
+ export const grade: TrialsComparisonGrader = async ({
105
+ runs,
106
+ }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
107
+ const iterations = getIterations()
108
+
109
+ // Collect pass/fail outcomes for each run
110
+ const runStats = Object.entries(runs).map(([label, run]) => {
111
+ // Convert trials to 0/1 array
112
+ const trialOutcomes = run.trials.map((t) => (t.pass ? 1 : 0))
113
+
114
+ // Bootstrap passAtK estimate
115
+ const stats = bootstrapPassAtK(trialOutcomes, run.k, iterations)
116
+
117
+ return { label, passAtK: run.passAtK ?? 0, stats }
118
+ })
119
+
120
+ // Sort by bootstrap median passAtK descending
121
+ const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
122
+
123
+ // Check if winner is statistically significant
124
+ // CIs don't overlap = significant difference (approximately p<0.05)
125
+ let isSignificant = false
126
+ const first = sorted[0]
127
+ const second = sorted[1]
128
+ if (first && second) {
129
+ // Non-overlapping: first's lower bound > second's upper bound
130
+ isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
131
+ }
132
+
133
+ const reasoning = isSignificant
134
+ ? `Winner "${first?.label}" shows clear separation (non-overlapping 95% CIs for passAtK)`
135
+ : 'No clear winner - confidence intervals overlap between top runs'
136
+
137
+ return {
138
+ rankings: sorted.map((s, i) => ({
139
+ run: s.label,
140
+ rank: i + 1,
141
+ score: s.stats.median,
142
+ })),
143
+ reasoning,
144
+ }
145
+ }
146
+
147
+ /**
148
+ * Create a statistical grader with custom iteration count.
149
+ *
150
+ * @param iterations - Number of bootstrap iterations
151
+ * @returns Trials comparison grader function
152
+ *
153
+ * @public
154
+ */
155
+ export const createTrialsStatisticalGrader = (iterations: number = DEFAULT_ITERATIONS): TrialsComparisonGrader => {
156
+ return async ({ runs }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
157
+ const runStats = Object.entries(runs).map(([label, run]) => {
158
+ const trialOutcomes = run.trials.map((t) => (t.pass ? 1 : 0))
159
+ const stats = bootstrapPassAtK(trialOutcomes, run.k, iterations)
160
+ return { label, passAtK: run.passAtK ?? 0, stats }
161
+ })
162
+
163
+ const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
164
+
165
+ let isSignificant = false
166
+ const first = sorted[0]
167
+ const second = sorted[1]
168
+ if (first && second) {
169
+ isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
170
+ }
171
+
172
+ return {
173
+ rankings: sorted.map((s, i) => ({
174
+ run: s.label,
175
+ rank: i + 1,
176
+ score: s.stats.median,
177
+ })),
178
+ reasoning: isSignificant
179
+ ? `Winner "${first?.label}" shows clear separation (non-overlapping 95% CIs)`
180
+ : 'No clear winner - confidence intervals overlap between top runs',
181
+ }
182
+ }
183
+ }
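
A minimal usage sketch for the statistical grader. The id, prompt, and fixture values are illustrative, and the relative import path assumes a module sitting next to trials-compare-statistical.ts (as the tests above do); the input shape matches TrialsComparisonGraderInput:

import { createTrialsStatisticalGrader } from './trials-compare-statistical.ts'

const grader = createTrialsStatisticalGrader(500) // 500 bootstrap iterations instead of the default

const result = await grader({
  id: 'example-001',
  input: 'Example prompt',
  hint: 'Expected output',
  runs: {
    baseline: {
      passAtK: 0.6,
      passExpK: 0.4,
      k: 3,
      trials: [
        { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true },
        { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: false },
        { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: true },
      ],
    },
    variant: {
      passAtK: 1.0,
      passExpK: 1.0,
      k: 3,
      trials: [
        { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
        { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
        { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
      ],
    },
  },
})

// result.rankings is ordered by bootstrap median passAtK (rank 1 first);
// result.reasoning reports whether the top two runs' 95% CIs are non-overlapping.
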
@@ -0,0 +1,128 @@
1
+ /**
2
+ * Built-in weighted comparison grader for trials data.
3
+ *
4
+ * @remarks
5
+ * Configurable weights for capability (passAtK), reliability (passExpK),
6
+ * and consistency (1 - flakiness) dimensions.
7
+ *
8
+ * Weights can be customized via environment variables:
9
+ * - `COMPARE_CAPABILITY` (default: 0.4)
10
+ * - `COMPARE_RELIABILITY` (default: 0.4)
11
+ * - `COMPARE_CONSISTENCY` (default: 0.2)
12
+ *
13
+ * @packageDocumentation
14
+ */
15
+
16
+ import type {
17
+ ComparisonGraderResult,
18
+ TrialsComparisonGrader,
19
+ TrialsComparisonGraderInput,
20
+ } from '../pipeline/pipeline.types.ts'
21
+
22
+ /**
23
+ * Weight configuration for trials comparison dimensions.
24
+ */
25
+ export type TrialsWeights = {
26
+ /** Weight for capability (passAtK) - can the agent solve this at least once? */
27
+ capability: number
28
+ /** Weight for reliability (passExpK) - does the agent solve this consistently? */
29
+ reliability: number
30
+ /** Weight for consistency (1 - flakiness) - low gap between capability and reliability */
31
+ consistency: number
32
+ }
33
+
34
+ /** Default weights: capability=0.4, reliability=0.4, consistency=0.2 */
35
+ export const DEFAULT_TRIALS_WEIGHTS: TrialsWeights = {
36
+ capability: 0.4,
37
+ reliability: 0.4,
38
+ consistency: 0.2,
39
+ }
40
+
41
+ /**
42
+ * Read weights from environment variables with fallback to defaults.
43
+ *
44
+ * @remarks
45
+ * Validates that weights are non-negative. Invalid or negative values
46
+ * fall back to defaults.
47
+ *
48
+ * @returns TrialsWeights configuration
49
+ *
50
+ * @public
51
+ */
52
+ export const getTrialsWeightsFromEnv = (): TrialsWeights => {
53
+ const parseWeight = (envVar: string | undefined, defaultValue: number): number => {
54
+ if (!envVar) return defaultValue
55
+ const parsed = Number.parseFloat(envVar)
56
+ // Must be a valid non-negative number
57
+ if (Number.isNaN(parsed) || parsed < 0) return defaultValue
58
+ return parsed
59
+ }
60
+
61
+ return {
62
+ capability: parseWeight(process.env.COMPARE_CAPABILITY, DEFAULT_TRIALS_WEIGHTS.capability),
63
+ reliability: parseWeight(process.env.COMPARE_RELIABILITY, DEFAULT_TRIALS_WEIGHTS.reliability),
64
+ consistency: parseWeight(process.env.COMPARE_CONSISTENCY, DEFAULT_TRIALS_WEIGHTS.consistency),
65
+ }
66
+ }
67
+
68
+ /**
69
+ * Create a weighted trials comparison grader with custom weights.
70
+ *
71
+ * @param weights - Weight configuration for comparison dimensions
72
+ * @returns Trials comparison grader function
73
+ *
74
+ * @public
75
+ */
76
+ export const createTrialsWeightedGrader = (weights: TrialsWeights = DEFAULT_TRIALS_WEIGHTS): TrialsComparisonGrader => {
77
+ return async ({ runs }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
78
+ const scores = Object.entries(runs).map(([label, run]) => {
79
+ // Capability score: passAtK (0-1)
80
+ const capabilityScore = run.passAtK ?? 0
81
+
82
+ // Reliability score: passExpK (0-1)
83
+ const reliabilityScore = run.passExpK ?? 0
84
+
85
+ // Consistency score: 1 - flakiness
86
+ // Flakiness = passAtK - passExpK (the gap between capability and reliability)
87
+ const flakiness = Math.max(0, capabilityScore - reliabilityScore)
88
+ const consistencyScore = 1 - flakiness
89
+
90
+ // Weighted combination
91
+ const weighted =
92
+ capabilityScore * weights.capability +
93
+ reliabilityScore * weights.reliability +
94
+ consistencyScore * weights.consistency
95
+
96
+ return { label, weighted, capabilityScore, reliabilityScore, consistencyScore, flakiness }
97
+ })
98
+
99
+ // Sort by weighted score descending (highest = best)
100
+ const sorted = scores.sort((a, b) => b.weighted - a.weighted)
101
+
102
+ return {
103
+ rankings: sorted.map((s, i) => ({
104
+ run: s.label,
105
+ rank: i + 1,
106
+ score: s.weighted,
107
+ })),
108
+ reasoning: `Weighted trials: capability=${weights.capability}, reliability=${weights.reliability}, consistency=${weights.consistency}`,
109
+ }
110
+ }
111
+ }
112
+
113
+ /**
114
+ * Default weighted trials comparison grader using environment or default weights.
115
+ *
116
+ * @remarks
117
+ * This is the default grader used when `--strategy weighted` is specified
118
+ * for trials format comparison.
119
+ *
120
+ * @public
121
+ */
122
+ export const grade: TrialsComparisonGrader = async (
123
+ input: TrialsComparisonGraderInput,
124
+ ): Promise<ComparisonGraderResult> => {
125
+ const weights = getTrialsWeightsFromEnv()
126
+ const grader = createTrialsWeightedGrader(weights)
127
+ return grader(input)
128
+ }
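
A sketch of the environment-driven configuration described in the remarks above. The override values and the relative import path are illustrative; the variable names and fallback behavior come from getTrialsWeightsFromEnv:

import { getTrialsWeightsFromEnv, grade } from './trials-compare-weighted.ts'

// Override the defaults (0.4 / 0.4 / 0.2) before the grader reads the environment
process.env.COMPARE_CAPABILITY = '0.6'
process.env.COMPARE_RELIABILITY = '0.3'
process.env.COMPARE_CONSISTENCY = '0.1'

const weights = getTrialsWeightsFromEnv()
// weights is { capability: 0.6, reliability: 0.3, consistency: 0.1 };
// a negative or non-numeric value would fall back to that dimension's default.

// The default `grade` export calls getTrialsWeightsFromEnv() on every invocation,
// so these overrides also apply when the pipeline runs it via --strategy weighted.
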