@plaited/agent-eval-harness 0.7.0 → 0.8.0

package/README.md CHANGED
@@ -78,6 +78,9 @@ cat prompts.jsonl | \
 
 # Compare runs (built-in strategies: weighted, statistical, custom)
 bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
+
+# Compare trials for pass@k reliability analysis (auto-detects format)
+bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
 ```
 
 ## Skills for AI Agents
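
For context on what the new trials comparison measures: the grader source added in 0.8.0 (below) defines passAtK as `1 - (1 - p)^k`, and the new test fixtures (passRate 0.67, k 3, passExpK ≈ 0.3) suggest passExpK is `p^k`, the probability that all k trials pass. The passExpK reading is an inference from the fixtures, not stated in the diff. A minimal sketch of the arithmetic:

```ts
// passAtK: chance of at least one pass in k trials (formula from the grader below).
// passExpK: assumed to be p ** k (all k trials pass) — inferred from the test fixtures.
const p = 0.67 // per-trial pass rate
const k = 3
const passAtK = 1 - (1 - p) ** k // ≈ 0.964 (capability)
const passExpK = p ** k // ≈ 0.301 (reliability)
const flakiness = passAtK - passExpK // the gap the weighted grader penalizes
console.log({ passAtK, passExpK, flakiness })
```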
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.7.0",
+  "version": "0.8.0",
   "description": "CLI tool for capturing agent trajectories from headless CLI agents",
   "license": "ISC",
   "engines": {
@@ -0,0 +1,358 @@
+/**
+ * Unit tests for built-in trials comparison graders.
+ *
+ * @remarks
+ * Tests for:
+ * - trials-compare-weighted: Configurable weight grader for trials
+ * - trials-compare-statistical: Bootstrap confidence interval grader for trials
+ *
+ * @packageDocumentation
+ */
+
+import { describe, expect, test } from 'bun:test'
+import type { TrialsComparisonGraderInput, TrialsComparisonRunData } from '../../pipeline/pipeline.types.ts'
+import { createTrialsStatisticalGrader, grade as statisticalGrade } from '../trials-compare-statistical.ts'
+import { createTrialsWeightedGrader, DEFAULT_TRIALS_WEIGHTS, type TrialsWeights } from '../trials-compare-weighted.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const createMockTrialRuns = (
+  overrides: Partial<Record<string, Partial<TrialsComparisonRunData>>> = {},
+): Record<string, TrialsComparisonRunData> => ({
+  baseline: {
+    passRate: 0.67,
+    passAtK: 0.9,
+    passExpK: 0.3,
+    k: 3,
+    trials: [
+      { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true, score: 1.0 },
+      { trialNum: 2, output: 'B', trajectory: [], duration: 110, pass: true, score: 0.9 },
+      { trialNum: 3, output: 'C', trajectory: [], duration: 120, pass: false, score: 0.2 },
+    ],
+    ...overrides.baseline,
+  },
+  variant: {
+    passRate: 1.0,
+    passAtK: 1.0,
+    passExpK: 1.0,
+    k: 3,
+    trials: [
+      { trialNum: 1, output: 'X', trajectory: [], duration: 150, pass: true, score: 1.0 },
+      { trialNum: 2, output: 'Y', trajectory: [], duration: 160, pass: true, score: 1.0 },
+      { trialNum: 3, output: 'Z', trajectory: [], duration: 170, pass: true, score: 1.0 },
+    ],
+    ...overrides.variant,
+  },
+})
+
+const createMockTrialInput = (runs: Record<string, TrialsComparisonRunData>): TrialsComparisonGraderInput => ({
+  id: 'test-001',
+  input: 'Test prompt',
+  hint: 'Expected output',
+  runs,
+})
+
+// ============================================================================
+// Weighted Grader Tests
+// ============================================================================
+
+describe('trials-compare-weighted grader', () => {
+  describe('DEFAULT_TRIALS_WEIGHTS', () => {
+    test('has expected default values', () => {
+      expect(DEFAULT_TRIALS_WEIGHTS.capability).toBe(0.4)
+      expect(DEFAULT_TRIALS_WEIGHTS.reliability).toBe(0.4)
+      expect(DEFAULT_TRIALS_WEIGHTS.consistency).toBe(0.2)
+    })
+
+    test('weights sum to 1.0', () => {
+      const sum =
+        DEFAULT_TRIALS_WEIGHTS.capability + DEFAULT_TRIALS_WEIGHTS.reliability + DEFAULT_TRIALS_WEIGHTS.consistency
+      expect(sum).toBe(1.0)
+    })
+  })
+
+  describe('createTrialsWeightedGrader', () => {
+    test('returns higher rank for better passAtK when capability weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 1.0, reliability: 0.0, consistency: 0.0 })
+      const runs = createMockTrialRuns({
+        baseline: { passAtK: 0.7 },
+        variant: { passAtK: 0.95 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+      expect(result.rankings[0]?.rank).toBe(1)
+    })
+
+    test('returns higher rank for better passExpK when reliability weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 1.0, consistency: 0.0 })
+      const runs = createMockTrialRuns({
+        baseline: { passExpK: 0.9 },
+        variant: { passExpK: 0.3 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings[0]?.run).toBe('baseline')
+    })
+
+    test('penalizes flaky runs when consistency weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
+      const runs = createMockTrialRuns({
+        // baseline: passAtK=0.9, passExpK=0.3, flakiness=0.6
+        baseline: { passAtK: 0.9, passExpK: 0.3 },
+        // variant: passAtK=0.8, passExpK=0.8, flakiness=0.0
+        variant: { passAtK: 0.8, passExpK: 0.8 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Variant should win due to lower flakiness (higher consistency)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('includes weights in reasoning', async () => {
+      const weights: TrialsWeights = { capability: 0.5, reliability: 0.3, consistency: 0.2 }
+      const grader = createTrialsWeightedGrader(weights)
+      const input = createMockTrialInput(createMockTrialRuns())
+
+      const result = await grader(input)
+
+      expect(result.reasoning).toContain('capability=0.5')
+      expect(result.reasoning).toContain('reliability=0.3')
+      expect(result.reasoning).toContain('consistency=0.2')
+    })
+
+    test('handles missing passAtK gracefully (treats as 0)', async () => {
+      const grader = createTrialsWeightedGrader()
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          k: 3,
+          trials: [],
+        },
+        variant: {
+          passAtK: 0.8,
+          passExpK: 0.5,
+          k: 3,
+          trials: [],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Should not throw, variant should rank higher
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('handles three or more runs', async () => {
+      const grader = createTrialsWeightedGrader()
+      const runs: Record<string, TrialsComparisonRunData> = {
+        a: { passAtK: 0.9, passExpK: 0.8, k: 3, trials: [] },
+        b: { passAtK: 0.7, passExpK: 0.7, k: 3, trials: [] },
+        c: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(3)
+      // Ranks should be 1, 2, 3
+      expect(result.rankings.map((r) => r.rank)).toEqual([1, 2, 3])
+    })
+  })
+})
+
+// ============================================================================
+// Statistical Grader Tests
+// ============================================================================
+
+describe('trials-compare-statistical grader', () => {
+  describe('createTrialsStatisticalGrader', () => {
+    test('returns rankings based on bootstrapped passAtK', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      const runs = createMockTrialRuns({
+        baseline: { passAtK: 0.6 },
+        variant: { passAtK: 0.95 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('uses trial outcomes for bootstrap variance estimation', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      // All trials pass for variant, mixed for baseline
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          passAtK: 0.9,
+          passExpK: 0.3,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
+            { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
+          ],
+        },
+        variant: {
+          passAtK: 1.0,
+          passExpK: 1.0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
+            { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
+          ],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Variant with 100% pass rate should rank higher
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('indicates significance when passAtK differs substantially', async () => {
+      const grader = createTrialsStatisticalGrader(500)
+      // Strong difference: all pass vs all fail
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          passAtK: 0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: false },
+            { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: false },
+            { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
+            { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: false },
+            { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
+          ],
+        },
+        variant: {
+          passAtK: 1.0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
+            { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
+          ],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.reasoning).toContain('clear separation')
+    })
+
+    test('handles empty trials array', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: { k: 3, trials: [] },
+        variant: {
+          k: 3,
+          trials: [{ trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true }],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Should not throw
+      expect(result.rankings.length).toBe(2)
+    })
+  })
+
+  describe('grade function', () => {
+    test('works with default iterations', async () => {
+      const runs = createMockTrialRuns()
+      const input = createMockTrialInput(runs)
+
+      const result = await statisticalGrade(input)
+
+      expect(result.rankings).toBeDefined()
+      expect(result.rankings.length).toBe(2)
+    })
+  })
+})
+
+// ============================================================================
+// Edge Case Tests
+// ============================================================================
+
+describe('trials comparison grader edge cases', () => {
+  test('handles single run gracefully', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs: Record<string, TrialsComparisonRunData> = {
+      only: { passAtK: 1.0, passExpK: 0.8, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    expect(result.rankings.length).toBe(1)
+    expect(result.rankings[0]?.rank).toBe(1)
+  })
+
+  test('handles zero passAtK and passExpK', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs: Record<string, TrialsComparisonRunData> = {
+      baseline: { passAtK: 0, passExpK: 0, k: 3, trials: [] },
+      variant: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    expect(result.rankings[0]?.run).toBe('variant')
+  })
+
+  test('deterministic ordering for equal scores', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs = createMockTrialRuns({
+      baseline: { passAtK: 0.8, passExpK: 0.6 },
+      variant: { passAtK: 0.8, passExpK: 0.6 },
+    })
+    const input = createMockTrialInput(runs)
+
+    // Run multiple times to check stability
+    const results = await Promise.all([grader(input), grader(input), grader(input)])
+
+    // All should have same ordering
+    const orders = results.map((r) => r.rankings.map((rank) => rank.run).join(','))
+    expect(new Set(orders).size).toBe(1)
+  })
+
+  test('flakiness is clamped to non-negative', async () => {
+    // Edge case: passExpK > passAtK shouldn't happen but must be handled gracefully
+    const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
+    const runs: Record<string, TrialsComparisonRunData> = {
+      baseline: { passAtK: 0.5, passExpK: 0.7, k: 3, trials: [] }, // Invalid but should work
+      variant: { passAtK: 0.8, passExpK: 0.8, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    // Both have flakiness clamped to 0, so consistency is 1.0 for both and the
+    // weighted scores tie; the test only asserts the grader returns rankings
+    expect(result.rankings).toBeDefined()
+  })
+})
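
The tests above exercise the built-in strategies; the README also lists `custom` among the comparison strategies. Assuming a custom trials grader only needs to export a `grade` function satisfying the `TrialsComparisonGrader` shape imported above (how the CLI loads it is not shown in this diff, and the import path here is illustrative), a minimal sketch that ranks runs by raw per-trial pass rate:

```ts
import type {
  ComparisonGraderResult,
  TrialsComparisonGrader,
  TrialsComparisonGraderInput,
} from './pipeline/pipeline.types.ts' // illustrative path

// Rank runs by raw per-trial pass rate, ignoring pass@k entirely.
export const grade: TrialsComparisonGrader = async ({
  runs,
}: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
  const scored = Object.entries(runs).map(([label, run]) => {
    const passes = run.trials.filter((t) => t.pass).length
    const rate = run.trials.length > 0 ? passes / run.trials.length : 0
    return { label, rate }
  })
  scored.sort((a, b) => b.rate - a.rate)
  return {
    rankings: scored.map((s, i) => ({ run: s.label, rank: i + 1, score: s.rate })),
    reasoning: 'Ranked by raw per-trial pass rate',
  }
}
```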
@@ -0,0 +1,188 @@
+/**
+ * Built-in statistical significance comparison grader for trials data.
+ *
+ * @remarks
+ * Uses bootstrap sampling to compute confidence intervals for passAtK.
+ * Flags the winner as statistically significant when the 95% CIs do not
+ * overlap (approximately p<0.05).
+ *
+ * Unlike the capture statistical grader which only has one score per prompt,
+ * trials data has multiple trial results per prompt, enabling proper bootstrap
+ * variance estimation.
+ *
+ * Bootstrap iterations can be customized via environment variable:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
+ *
+ * @packageDocumentation
+ */
+
+import type {
+  ComparisonGraderResult,
+  TrialsComparisonGrader,
+  TrialsComparisonGraderInput,
+} from '../pipeline/pipeline.types.ts'
+
+/** Default number of bootstrap iterations */
+const DEFAULT_ITERATIONS = 1000
+
+/**
+ * Bootstrap confidence interval result.
+ */
+type BootstrapResult = {
+  /** Median estimate from bootstrap samples (more robust than mean) */
+  median: number
+  /** 95% confidence interval [lower, upper] */
+  ci95: [number, number]
+}
+
+/**
+ * Compute passAtK estimate from trial pass/fail samples via bootstrap.
+ *
+ * @remarks
+ * passAtK = 1 - (1 - p)^k where p is the estimated pass rate.
+ * We bootstrap the pass rate and compute passAtK from each bootstrap sample.
+ *
+ * @param trials - Array of 0/1 values (0=fail, 1=pass)
+ * @param k - The k in passAtK
+ * @param iterations - Number of bootstrap iterations
+ * @returns Bootstrap estimate and CI for passAtK
+ */
+const bootstrapPassAtK = (trials: number[], k: number, iterations: number): BootstrapResult => {
+  if (trials.length === 0) {
+    return { median: 0, ci95: [0, 0] }
+  }
+
+  const passAtKValues: number[] = []
+
+  for (let i = 0; i < iterations; i++) {
+    // Resample with replacement
+    const resampled = Array.from(
+      { length: trials.length },
+      () => trials[Math.floor(Math.random() * trials.length)] as number,
+    )
+
+    // Compute pass rate from resample
+    const passRate = resampled.reduce((acc, val) => acc + val, 0) / resampled.length
+
+    // Compute passAtK: probability of at least one pass in k samples
+    // passAtK = 1 - (1 - p)^k
+    const passAtK = 1 - (1 - passRate) ** k
+    passAtKValues.push(passAtK)
+  }
+
+  // Sort for percentile calculation
+  passAtKValues.sort((a, b) => a - b)
+
+  const lowerIdx = Math.floor(iterations * 0.025)
+  const upperIdx = Math.floor(iterations * 0.975)
+
+  return {
+    median: passAtKValues[Math.floor(iterations / 2)] ?? 0,
+    ci95: [passAtKValues[lowerIdx] ?? 0, passAtKValues[upperIdx] ?? 0],
+  }
+}
+
+/**
+ * Get bootstrap iterations from environment variable.
+ *
+ * @returns Number of bootstrap iterations
+ */
+const getIterationsFromEnv = (): number => {
+  const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+  if (!envValue) return DEFAULT_ITERATIONS
+
+  const parsed = Number.parseInt(envValue, 10)
+  return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+}
+
+/**
+ * Statistical significance trials comparison grader.
+ *
+ * @remarks
+ * Compares runs using bootstrap sampling on trial outcomes to determine
+ * if differences in passAtK are statistically significant.
+ *
+ * Unlike single-sample comparisons, trials data provides multiple samples
+ * per prompt (k trials), enabling meaningful variance estimation.
+ *
+ * @public
+ */
+export const grade: TrialsComparisonGrader = async ({
+  runs,
+}: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
+  const iterations = getIterationsFromEnv()
+
+  // Collect pass/fail outcomes for each run
+  const runStats = Object.entries(runs).map(([label, run]) => {
+    // Convert trials to 0/1 array
+    const trialOutcomes = run.trials.map((t) => (t.pass ? 1 : 0))
+
+    // Bootstrap passAtK estimate
+    const stats = bootstrapPassAtK(trialOutcomes, run.k, iterations)
+
+    return { label, passAtK: run.passAtK ?? 0, stats }
+  })
+
+  // Sort by bootstrap median passAtK descending
+  const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
+
+  // Check if winner is statistically significant
+  // CIs don't overlap = significant difference (approximately p<0.05)
+  let isSignificant = false
+  const first = sorted[0]
+  const second = sorted[1]
+  if (first && second) {
+    // Non-overlapping: first's lower bound > second's upper bound
+    isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+  }
+
+  const reasoning = isSignificant
+    ? `Winner "${first?.label}" shows clear separation (non-overlapping 95% CIs for passAtK)`
+    : 'No clear winner - confidence intervals overlap between top runs'
+
+  return {
+    rankings: sorted.map((s, i) => ({
+      run: s.label,
+      rank: i + 1,
+      score: s.stats.median,
+    })),
+    reasoning,
+  }
+}
+
+/**
+ * Create a statistical grader with custom iteration count.
+ *
+ * @param iterations - Number of bootstrap iterations
+ * @returns Trials comparison grader function
+ *
+ * @public
+ */
+export const createTrialsStatisticalGrader = (iterations: number = DEFAULT_ITERATIONS): TrialsComparisonGrader => {
+  return async ({ runs }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
+    const runStats = Object.entries(runs).map(([label, run]) => {
+      const trialOutcomes = run.trials.map((t) => (t.pass ? 1 : 0))
+      const stats = bootstrapPassAtK(trialOutcomes, run.k, iterations)
+      return { label, passAtK: run.passAtK ?? 0, stats }
+    })
+
+    const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
+
+    let isSignificant = false
+    const first = sorted[0]
+    const second = sorted[1]
+    if (first && second) {
+      isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+    }
+
+    return {
+      rankings: sorted.map((s, i) => ({
+        run: s.label,
+        rank: i + 1,
+        score: s.stats.median,
+      })),
+      reasoning: isSignificant
+        ? `Winner "${first?.label}" shows clear separation (non-overlapping 95% CIs)`
+        : 'No clear winner - confidence intervals overlap between top runs',
+    }
+  }
+}
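
A usage sketch for `createTrialsStatisticalGrader`, constructing the input shape the new tests use (field names copied from the fixtures; the import path and values are illustrative):

```ts
import { createTrialsStatisticalGrader } from './trials-compare-statistical.ts' // illustrative path

const grader = createTrialsStatisticalGrader(200) // small count for a quick check; default is 1000

const result = await grader({
  id: 'prompt-001',
  input: 'Test prompt',
  hint: 'Expected output',
  runs: {
    baseline: {
      k: 3,
      trials: [
        { trialNum: 1, output: 'a', trajectory: [], duration: 90, pass: false },
        { trialNum: 2, output: 'b', trajectory: [], duration: 95, pass: true },
        { trialNum: 3, output: 'c', trajectory: [], duration: 92, pass: false },
      ],
    },
    candidate: {
      k: 3,
      trials: [
        { trialNum: 1, output: 'x', trajectory: [], duration: 110, pass: true },
        { trialNum: 2, output: 'y', trajectory: [], duration: 105, pass: true },
        { trialNum: 3, output: 'z', trajectory: [], duration: 108, pass: true },
      ],
    },
  },
})

// candidate ranks first; with only 3 trials per run the bootstrap CIs are wide,
// so reasoning will usually report overlapping intervals rather than clear separation.
console.log(result.rankings, result.reasoning)
```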
@@ -0,0 +1,128 @@
+/**
+ * Built-in weighted comparison grader for trials data.
+ *
+ * @remarks
+ * Configurable weights for capability (passAtK), reliability (passExpK),
+ * and consistency (1 - flakiness) dimensions.
+ *
+ * Weights can be customized via environment variables:
+ * - `COMPARE_CAPABILITY` (default: 0.4)
+ * - `COMPARE_RELIABILITY` (default: 0.4)
+ * - `COMPARE_CONSISTENCY` (default: 0.2)
+ *
+ * @packageDocumentation
+ */
+
+import type {
+  ComparisonGraderResult,
+  TrialsComparisonGrader,
+  TrialsComparisonGraderInput,
+} from '../pipeline/pipeline.types.ts'
+
+/**
+ * Weight configuration for trials comparison dimensions.
+ */
+export type TrialsWeights = {
+  /** Weight for capability (passAtK) - can the agent solve this at least once? */
+  capability: number
+  /** Weight for reliability (passExpK) - does the agent solve this consistently? */
+  reliability: number
+  /** Weight for consistency (1 - flakiness) - low gap between capability and reliability */
+  consistency: number
+}
+
+/** Default weights: capability=0.4, reliability=0.4, consistency=0.2 */
+export const DEFAULT_TRIALS_WEIGHTS: TrialsWeights = {
+  capability: 0.4,
+  reliability: 0.4,
+  consistency: 0.2,
+}
+
+/**
+ * Read weights from environment variables with fallback to defaults.
+ *
+ * @remarks
+ * Validates that weights are non-negative. Invalid or negative values
+ * fall back to defaults.
+ *
+ * @returns TrialsWeights configuration
+ *
+ * @public
+ */
+export const getTrialsWeightsFromEnv = (): TrialsWeights => {
+  const parseWeight = (envVar: string | undefined, defaultValue: number): number => {
+    if (!envVar) return defaultValue
+    const parsed = Number.parseFloat(envVar)
+    // Must be a valid non-negative number
+    if (Number.isNaN(parsed) || parsed < 0) return defaultValue
+    return parsed
+  }
+
+  return {
+    capability: parseWeight(process.env.COMPARE_CAPABILITY, DEFAULT_TRIALS_WEIGHTS.capability),
+    reliability: parseWeight(process.env.COMPARE_RELIABILITY, DEFAULT_TRIALS_WEIGHTS.reliability),
+    consistency: parseWeight(process.env.COMPARE_CONSISTENCY, DEFAULT_TRIALS_WEIGHTS.consistency),
+  }
+}
+
+/**
+ * Create a weighted trials comparison grader with custom weights.
+ *
+ * @param weights - Weight configuration for comparison dimensions
+ * @returns Trials comparison grader function
+ *
+ * @public
+ */
+export const createTrialsWeightedGrader = (weights: TrialsWeights = DEFAULT_TRIALS_WEIGHTS): TrialsComparisonGrader => {
+  return async ({ runs }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
+    const scores = Object.entries(runs).map(([label, run]) => {
+      // Capability score: passAtK (0-1)
+      const capabilityScore = run.passAtK ?? 0
+
+      // Reliability score: passExpK (0-1)
+      const reliabilityScore = run.passExpK ?? 0
+
+      // Consistency score: 1 - flakiness
+      // Flakiness = passAtK - passExpK (the gap between capability and reliability)
+      const flakiness = Math.max(0, capabilityScore - reliabilityScore)
+      const consistencyScore = 1 - flakiness
+
+      // Weighted combination
+      const weighted =
+        capabilityScore * weights.capability +
+        reliabilityScore * weights.reliability +
+        consistencyScore * weights.consistency
+
+      return { label, weighted, capabilityScore, reliabilityScore, consistencyScore, flakiness }
+    })
+
+    // Sort by weighted score descending (highest = best)
+    const sorted = scores.sort((a, b) => b.weighted - a.weighted)
+
+    return {
+      rankings: sorted.map((s, i) => ({
+        run: s.label,
+        rank: i + 1,
+        score: s.weighted,
+      })),
+      reasoning: `Weighted trials: capability=${weights.capability}, reliability=${weights.reliability}, consistency=${weights.consistency}`,
+    }
+  }
+}
+
+/**
+ * Default weighted trials comparison grader using environment or default weights.
+ *
+ * @remarks
+ * This is the default grader used when `--strategy weighted` is specified
+ * for trials format comparison.
+ *
+ * @public
+ */
+export const grade: TrialsComparisonGrader = async (
+  input: TrialsComparisonGraderInput,
+): Promise<ComparisonGraderResult> => {
+  const weights = getTrialsWeightsFromEnv()
+  const grader = createTrialsWeightedGrader(weights)
+  return grader(input)
+}
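
To make the weighting concrete, here is the default-weight arithmetic applied to the baseline fixture from the tests above (passAtK 0.9, passExpK 0.3):

```ts
// Default weights from DEFAULT_TRIALS_WEIGHTS: 0.4 / 0.4 / 0.2.
const capability = 0.9 // passAtK
const reliability = 0.3 // passExpK
const flakiness = Math.max(0, capability - reliability) // 0.6
const consistency = 1 - flakiness // 0.4

const weighted = capability * 0.4 + reliability * 0.4 + consistency * 0.2
console.log(weighted) // 0.56 — versus 1.0 for a run with passAtK = passExpK = 1.0
```

Because the default `grade` export reads `getTrialsWeightsFromEnv`, the same rebalancing should be reachable from the CLI by exporting `COMPARE_CAPABILITY`, `COMPARE_RELIABILITY`, and `COMPARE_CONSISTENCY` before running `compare`.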