@plaited/agent-eval-harness 0.7.0 → 0.8.0
- package/README.md +3 -0
- package/package.json +1 -1
- package/src/graders/tests/trials-compare-graders.spec.ts +358 -0
- package/src/graders/trials-compare-statistical.ts +188 -0
- package/src/graders/trials-compare-weighted.ts +128 -0
- package/src/graders.ts +21 -1
- package/src/pipeline/compare-format-detection.ts +100 -0
- package/src/pipeline/compare-trials.ts +596 -0
- package/src/pipeline/compare.ts +75 -19
- package/src/pipeline/pipeline.types.ts +52 -1
- package/src/pipeline/tests/compare-format-detection.spec.ts +142 -0
- package/src/pipeline/tests/compare-trials.spec.ts +277 -0
- package/src/schemas/schemas.ts +151 -0
- package/src/schemas.ts +13 -0
package/README.md
CHANGED
@@ -78,6 +78,9 @@ cat prompts.jsonl | \
 
 # Compare runs (built-in strategies: weighted, statistical, custom)
 bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
+
+# Compare trials for pass@k reliability analysis (auto-detects format)
+bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
 ```
 
 ## Skills for AI Agents
package/package.json
CHANGED
-  "version": "0.7.0",
+  "version": "0.8.0",

package/src/graders/tests/trials-compare-graders.spec.ts
ADDED
@@ -0,0 +1,358 @@
+/**
+ * Unit tests for built-in trials comparison graders.
+ *
+ * @remarks
+ * Tests for:
+ * - trials-compare-weighted: Configurable weight grader for trials
+ * - trials-compare-statistical: Bootstrap confidence interval grader for trials
+ *
+ * @packageDocumentation
+ */
+
+import { describe, expect, test } from 'bun:test'
+import type { TrialsComparisonGraderInput, TrialsComparisonRunData } from '../../pipeline/pipeline.types.ts'
+import { createTrialsStatisticalGrader, grade as statisticalGrade } from '../trials-compare-statistical.ts'
+import { createTrialsWeightedGrader, DEFAULT_TRIALS_WEIGHTS, type TrialsWeights } from '../trials-compare-weighted.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const createMockTrialRuns = (
+  overrides: Partial<Record<string, Partial<TrialsComparisonRunData>>> = {},
+): Record<string, TrialsComparisonRunData> => ({
+  baseline: {
+    passRate: 0.67,
+    passAtK: 0.9,
+    passExpK: 0.3,
+    k: 3,
+    trials: [
+      { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true, score: 1.0 },
+      { trialNum: 2, output: 'B', trajectory: [], duration: 110, pass: true, score: 0.9 },
+      { trialNum: 3, output: 'C', trajectory: [], duration: 120, pass: false, score: 0.2 },
+    ],
+    ...overrides.baseline,
+  },
+  variant: {
+    passRate: 1.0,
+    passAtK: 1.0,
+    passExpK: 1.0,
+    k: 3,
+    trials: [
+      { trialNum: 1, output: 'X', trajectory: [], duration: 150, pass: true, score: 1.0 },
+      { trialNum: 2, output: 'Y', trajectory: [], duration: 160, pass: true, score: 1.0 },
+      { trialNum: 3, output: 'Z', trajectory: [], duration: 170, pass: true, score: 1.0 },
+    ],
+    ...overrides.variant,
+  },
+})
+
+const createMockTrialInput = (runs: Record<string, TrialsComparisonRunData>): TrialsComparisonGraderInput => ({
+  id: 'test-001',
+  input: 'Test prompt',
+  hint: 'Expected output',
+  runs,
+})
+
+// ============================================================================
+// Weighted Grader Tests
+// ============================================================================
+
+describe('trials-compare-weighted grader', () => {
+  describe('DEFAULT_TRIALS_WEIGHTS', () => {
+    test('has expected default values', () => {
+      expect(DEFAULT_TRIALS_WEIGHTS.capability).toBe(0.4)
+      expect(DEFAULT_TRIALS_WEIGHTS.reliability).toBe(0.4)
+      expect(DEFAULT_TRIALS_WEIGHTS.consistency).toBe(0.2)
+    })
+
+    test('weights sum to 1.0', () => {
+      const sum =
+        DEFAULT_TRIALS_WEIGHTS.capability + DEFAULT_TRIALS_WEIGHTS.reliability + DEFAULT_TRIALS_WEIGHTS.consistency
+      expect(sum).toBe(1.0)
+    })
+  })
+
+  describe('createTrialsWeightedGrader', () => {
+    test('returns higher rank for better passAtK when capability weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 1.0, reliability: 0.0, consistency: 0.0 })
+      const runs = createMockTrialRuns({
+        baseline: { passAtK: 0.7 },
+        variant: { passAtK: 0.95 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+      expect(result.rankings[0]?.rank).toBe(1)
+    })
+
+    test('returns higher rank for better passExpK when reliability weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 1.0, consistency: 0.0 })
+      const runs = createMockTrialRuns({
+        baseline: { passExpK: 0.9 },
+        variant: { passExpK: 0.3 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings[0]?.run).toBe('baseline')
+    })
+
+    test('penalizes flaky runs when consistency weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
+      const runs = createMockTrialRuns({
+        // baseline: passAtK=0.9, passExpK=0.3, flakiness=0.6
+        baseline: { passAtK: 0.9, passExpK: 0.3 },
+        // variant: passAtK=0.8, passExpK=0.8, flakiness=0.0
+        variant: { passAtK: 0.8, passExpK: 0.8 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Variant should win due to lower flakiness (higher consistency)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('includes weights in reasoning', async () => {
+      const weights: TrialsWeights = { capability: 0.5, reliability: 0.3, consistency: 0.2 }
+      const grader = createTrialsWeightedGrader(weights)
+      const input = createMockTrialInput(createMockTrialRuns())
+
+      const result = await grader(input)
+
+      expect(result.reasoning).toContain('capability=0.5')
+      expect(result.reasoning).toContain('reliability=0.3')
+      expect(result.reasoning).toContain('consistency=0.2')
+    })
+
+    test('handles missing passAtK gracefully (treats as 0)', async () => {
+      const grader = createTrialsWeightedGrader()
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          k: 3,
+          trials: [],
+        },
+        variant: {
+          passAtK: 0.8,
+          passExpK: 0.5,
+          k: 3,
+          trials: [],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Should not throw, variant should rank higher
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('handles three or more runs', async () => {
+      const grader = createTrialsWeightedGrader()
+      const runs: Record<string, TrialsComparisonRunData> = {
+        a: { passAtK: 0.9, passExpK: 0.8, k: 3, trials: [] },
+        b: { passAtK: 0.7, passExpK: 0.7, k: 3, trials: [] },
+        c: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(3)
+      // Ranks should be 1, 2, 3
+      expect(result.rankings.map((r) => r.rank)).toEqual([1, 2, 3])
+    })
+  })
+})
+
+// ============================================================================
+// Statistical Grader Tests
+// ============================================================================
+
+describe('trials-compare-statistical grader', () => {
+  describe('createTrialsStatisticalGrader', () => {
+    test('returns rankings based on bootstrapped passAtK', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      const runs = createMockTrialRuns({
+        baseline: { passAtK: 0.6 },
+        variant: { passAtK: 0.95 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('uses trial outcomes for bootstrap variance estimation', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      // All trials pass for variant, mixed for baseline
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          passAtK: 0.9,
+          passExpK: 0.3,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
+            { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
+          ],
+        },
+        variant: {
+          passAtK: 1.0,
+          passExpK: 1.0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
+            { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
+          ],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Variant with 100% pass rate should rank higher
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('indicates significance when passAtK differs substantially', async () => {
+      const grader = createTrialsStatisticalGrader(500)
+      // Strong difference: all pass vs all fail
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          passAtK: 0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: false },
+            { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: false },
+            { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
+            { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: false },
+            { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
+          ],
+        },
+        variant: {
+          passAtK: 1.0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
+            { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
+          ],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.reasoning).toContain('clear separation')
+    })
+
+    test('handles empty trials array', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: { k: 3, trials: [] },
+        variant: {
+          k: 3,
+          trials: [{ trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true }],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Should not throw
+      expect(result.rankings.length).toBe(2)
+    })
+  })
+
+  describe('grade function', () => {
+    test('works with default iterations', async () => {
+      const runs = createMockTrialRuns()
+      const input = createMockTrialInput(runs)
+
+      const result = await statisticalGrade(input)
+
+      expect(result.rankings).toBeDefined()
+      expect(result.rankings.length).toBe(2)
+    })
+  })
+})
+
+// ============================================================================
+// Edge Case Tests
+// ============================================================================
+
+describe('trials comparison grader edge cases', () => {
+  test('handles single run gracefully', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs: Record<string, TrialsComparisonRunData> = {
+      only: { passAtK: 1.0, passExpK: 0.8, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    expect(result.rankings.length).toBe(1)
+    expect(result.rankings[0]?.rank).toBe(1)
+  })
+
+  test('handles zero passAtK and passExpK', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs: Record<string, TrialsComparisonRunData> = {
+      baseline: { passAtK: 0, passExpK: 0, k: 3, trials: [] },
+      variant: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    expect(result.rankings[0]?.run).toBe('variant')
+  })
+
+  test('deterministic ordering for equal scores', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs = createMockTrialRuns({
+      baseline: { passAtK: 0.8, passExpK: 0.6 },
+      variant: { passAtK: 0.8, passExpK: 0.6 },
+    })
+    const input = createMockTrialInput(runs)
+
+    // Run multiple times to check stability
+    const results = await Promise.all([grader(input), grader(input), grader(input)])
+
+    // All should have same ordering
+    const orders = results.map((r) => r.rankings.map((rank) => rank.run).join(','))
+    expect(new Set(orders).size).toBe(1)
+  })
+
+  test('flakiness is clamped to non-negative', async () => {
+    // Edge case: passExpK > passAtK shouldn't happen but handle gracefully
+    const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
+    const runs: Record<string, TrialsComparisonRunData> = {
+      baseline: { passAtK: 0.5, passExpK: 0.7, k: 3, trials: [] }, // Invalid but should work
+      variant: { passAtK: 0.8, passExpK: 0.8, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    // Both should have flakiness 0, so consistency score should be 1.0 for both
+    // Variant has higher capability/reliability so it wins on tiebreaker
+    expect(result.rankings).toBeDefined()
+  })
+})
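To make the weighted scoring behind these fixtures concrete, here is a hand-worked application of the default weights to the mock baseline and variant above, using the scoring formula implemented in trials-compare-weighted.ts (shown later in this diff); this is an editorial sketch, not part of the package:

// Hand-worked check of the default fixtures under DEFAULT_TRIALS_WEIGHTS
// (capability=0.4, reliability=0.4, consistency=0.2):
//
// baseline: passAtK = 0.9, passExpK = 0.3
//   flakiness   = max(0, 0.9 - 0.3)           = 0.6
//   consistency = 1 - 0.6                     = 0.4
//   weighted    = 0.9*0.4 + 0.3*0.4 + 0.4*0.2 = 0.36 + 0.12 + 0.08 = 0.56
//
// variant: passAtK = 1.0, passExpK = 1.0
//   flakiness   = max(0, 1.0 - 1.0)           = 0.0
//   consistency = 1 - 0.0                     = 1.0
//   weighted    = 1.0*0.4 + 1.0*0.4 + 1.0*0.2 = 1.0
//
// So with default weights the variant ranks first (score 1.0 vs 0.56).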
package/src/graders/trials-compare-statistical.ts
ADDED
@@ -0,0 +1,188 @@
+/**
+ * Built-in statistical significance comparison grader for trials data.
+ *
+ * @remarks
+ * Uses bootstrap sampling to compute confidence intervals for passAtK and passExpK.
+ * Flags when the winner is statistically significant (p<0.05, non-overlapping CIs).
+ *
+ * Unlike the capture statistical grader which only has one score per prompt,
+ * trials data has multiple trial results per prompt, enabling proper bootstrap
+ * variance estimation.
+ *
+ * Bootstrap iterations can be customized via environment variable:
+ * - `COMPARE_BOOTSTRAP_ITERATIONS` (default: 1000)
+ *
+ * @packageDocumentation
+ */
+
+import type {
+  ComparisonGraderResult,
+  TrialsComparisonGrader,
+  TrialsComparisonGraderInput,
+} from '../pipeline/pipeline.types.ts'
+
+/** Default number of bootstrap iterations */
+const DEFAULT_ITERATIONS = 1000
+
+/**
+ * Bootstrap confidence interval result.
+ */
+type BootstrapResult = {
+  /** Median estimate from bootstrap samples (more robust than mean) */
+  median: number
+  /** 95% confidence interval [lower, upper] */
+  ci95: [number, number]
+}
+
+/**
+ * Compute passAtK estimate from trial pass/fail samples via bootstrap.
+ *
+ * @remarks
+ * passAtK = 1 - (1 - p)^k where p is estimated pass rate.
+ * We bootstrap the pass rate and compute passAtK from each bootstrap sample.
+ *
+ * @param trials - Array of 0/1 values (0=fail, 1=pass)
+ * @param k - Number of trials
+ * @param iterations - Number of bootstrap iterations
+ * @returns Bootstrap estimate and CI for passAtK
+ */
+const bootstrapPassAtK = (trials: number[], k: number, iterations: number): BootstrapResult => {
+  if (trials.length === 0) {
+    return { median: 0, ci95: [0, 0] }
+  }
+
+  const passAtKValues: number[] = []
+
+  for (let i = 0; i < iterations; i++) {
+    // Resample with replacement
+    const resampled = Array.from(
+      { length: trials.length },
+      () => trials[Math.floor(Math.random() * trials.length)] as number,
+    )
+
+    // Compute pass rate from resample
+    const passRate = resampled.reduce((acc, val) => acc + val, 0) / resampled.length
+
+    // Compute passAtK: probability of at least one pass in k samples
+    // passAtK = 1 - (1 - p)^k
+    const passAtK = 1 - (1 - passRate) ** k
+    passAtKValues.push(passAtK)
+  }
+
+  // Sort for percentile calculation
+  passAtKValues.sort((a, b) => a - b)
+
+  const lowerIdx = Math.floor(iterations * 0.025)
+  const upperIdx = Math.floor(iterations * 0.975)
+
+  return {
+    median: passAtKValues[Math.floor(iterations / 2)] ?? 0,
+    ci95: [passAtKValues[lowerIdx] ?? 0, passAtKValues[upperIdx] ?? 0],
+  }
+}
+
+/**
+ * Get bootstrap iterations from environment variable.
+ *
+ * @returns Number of bootstrap iterations
+ */
+const getIterationsFromEnv = (): number => {
+  const envValue = process.env.COMPARE_BOOTSTRAP_ITERATIONS
+  if (!envValue) return DEFAULT_ITERATIONS
+
+  const parsed = Number.parseInt(envValue, 10)
+  return Number.isNaN(parsed) || parsed < 100 ? DEFAULT_ITERATIONS : parsed
+}
+
+/**
+ * Statistical significance trials comparison grader.
+ *
+ * @remarks
+ * Compares runs using bootstrap sampling on trial outcomes to determine
+ * if differences in passAtK are statistically significant.
+ *
+ * Unlike single-sample comparisons, trials data provides multiple samples
+ * per prompt (k trials), enabling meaningful variance estimation.
+ *
+ * @public
+ */
+export const grade: TrialsComparisonGrader = async ({
+  runs,
+}: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
+  const iterations = getIterationsFromEnv()
+
+  // Collect pass/fail outcomes for each run
+  const runStats = Object.entries(runs).map(([label, run]) => {
+    // Convert trials to 0/1 array
+    const trialOutcomes = run.trials.map((t) => (t.pass ? 1 : 0))
+
+    // Bootstrap passAtK estimate
+    const stats = bootstrapPassAtK(trialOutcomes, run.k, iterations)
+
+    return { label, passAtK: run.passAtK ?? 0, stats }
+  })
+
+  // Sort by bootstrap median passAtK descending
+  const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
+
+  // Check if winner is statistically significant
+  // CIs don't overlap = significant difference (approximately p<0.05)
+  let isSignificant = false
+  const first = sorted[0]
+  const second = sorted[1]
+  if (first && second) {
+    // Non-overlapping: first's lower bound > second's upper bound
+    isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+  }
+
+  const reasoning = isSignificant
+    ? `Winner "${first?.label}" shows clear separation (non-overlapping 95% CIs for passAtK)`
+    : 'No clear winner - confidence intervals overlap between top runs'
+
+  return {
+    rankings: sorted.map((s, i) => ({
+      run: s.label,
+      rank: i + 1,
+      score: s.stats.median,
+    })),
+    reasoning,
+  }
+}
+
+/**
+ * Create a statistical grader with custom iteration count.
+ *
+ * @param iterations - Number of bootstrap iterations
+ * @returns Trials comparison grader function
+ *
+ * @public
+ */
+export const createTrialsStatisticalGrader = (iterations: number = DEFAULT_ITERATIONS): TrialsComparisonGrader => {
+  return async ({ runs }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
+    const runStats = Object.entries(runs).map(([label, run]) => {
+      const trialOutcomes = run.trials.map((t) => (t.pass ? 1 : 0))
+      const stats = bootstrapPassAtK(trialOutcomes, run.k, iterations)
+      return { label, passAtK: run.passAtK ?? 0, stats }
+    })
+
+    const sorted = runStats.sort((a, b) => b.stats.median - a.stats.median)
+
+    let isSignificant = false
+    const first = sorted[0]
+    const second = sorted[1]
+    if (first && second) {
+      isSignificant = first.stats.ci95[0] > second.stats.ci95[1]
+    }
+
+    return {
+      rankings: sorted.map((s, i) => ({
+        run: s.label,
+        rank: i + 1,
+        score: s.stats.median,
+      })),
+      reasoning: isSignificant
+        ? `Winner "${first?.label}" shows clear separation (non-overlapping 95% CIs)`
+        : 'No clear winner - confidence intervals overlap between top runs',
+    }
+  }
+}
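A minimal usage sketch for the statistical grader. The relative import path, prompt, and run payloads are illustrative; the input shapes mirror the spec fixtures above, and top-level await assumes an ESM/Bun module:

// Rank two runs by bootstrap-median passAtK with 95% CIs.
import { createTrialsStatisticalGrader } from './trials-compare-statistical.ts' // path assumed

// More iterations than the 1000 default for tighter percentile estimates.
const grader = createTrialsStatisticalGrader(2000)

const result = await grader({
  id: 'prompt-001', // illustrative prompt record
  input: 'Reverse a string',
  hint: 'Expected output',
  runs: {
    baseline: {
      k: 3,
      passAtK: 0.7,
      trials: [
        { trialNum: 1, output: 'ok', trajectory: [], duration: 900, pass: true },
        { trialNum: 2, output: 'bad', trajectory: [], duration: 950, pass: false },
        { trialNum: 3, output: 'ok', trajectory: [], duration: 870, pass: true },
      ],
    },
    variant: {
      k: 3,
      passAtK: 1.0,
      trials: [
        { trialNum: 1, output: 'ok', trajectory: [], duration: 800, pass: true },
        { trialNum: 2, output: 'ok', trajectory: [], duration: 820, pass: true },
        { trialNum: 3, output: 'ok', trajectory: [], duration: 790, pass: true },
      ],
    },
  },
})

// rankings are sorted by bootstrap-median passAtK; reasoning reports
// whether the top two runs' 95% CIs overlap.
console.log(result.rankings)
console.log(result.reasoning)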
package/src/graders/trials-compare-weighted.ts
ADDED
@@ -0,0 +1,128 @@
+/**
+ * Built-in weighted comparison grader for trials data.
+ *
+ * @remarks
+ * Configurable weights for capability (passAtK), reliability (passExpK),
+ * and consistency (1 - flakiness) dimensions.
+ *
+ * Weights can be customized via environment variables:
+ * - `COMPARE_CAPABILITY` (default: 0.4)
+ * - `COMPARE_RELIABILITY` (default: 0.4)
+ * - `COMPARE_CONSISTENCY` (default: 0.2)
+ *
+ * @packageDocumentation
+ */
+
+import type {
+  ComparisonGraderResult,
+  TrialsComparisonGrader,
+  TrialsComparisonGraderInput,
+} from '../pipeline/pipeline.types.ts'
+
+/**
+ * Weight configuration for trials comparison dimensions.
+ */
+export type TrialsWeights = {
+  /** Weight for capability (passAtK) - can the agent solve this at least once? */
+  capability: number
+  /** Weight for reliability (passExpK) - does the agent solve this consistently? */
+  reliability: number
+  /** Weight for consistency (1 - flakiness) - low gap between capability and reliability */
+  consistency: number
+}
+
+/** Default weights: capability=0.4, reliability=0.4, consistency=0.2 */
+export const DEFAULT_TRIALS_WEIGHTS: TrialsWeights = {
+  capability: 0.4,
+  reliability: 0.4,
+  consistency: 0.2,
+}
+
+/**
+ * Read weights from environment variables with fallback to defaults.
+ *
+ * @remarks
+ * Validates that weights are non-negative. Invalid or negative values
+ * fall back to defaults.
+ *
+ * @returns TrialsWeights configuration
+ *
+ * @public
+ */
+export const getTrialsWeightsFromEnv = (): TrialsWeights => {
+  const parseWeight = (envVar: string | undefined, defaultValue: number): number => {
+    if (!envVar) return defaultValue
+    const parsed = Number.parseFloat(envVar)
+    // Must be a valid non-negative number
+    if (Number.isNaN(parsed) || parsed < 0) return defaultValue
+    return parsed
+  }
+
+  return {
+    capability: parseWeight(process.env.COMPARE_CAPABILITY, DEFAULT_TRIALS_WEIGHTS.capability),
+    reliability: parseWeight(process.env.COMPARE_RELIABILITY, DEFAULT_TRIALS_WEIGHTS.reliability),
+    consistency: parseWeight(process.env.COMPARE_CONSISTENCY, DEFAULT_TRIALS_WEIGHTS.consistency),
+  }
+}
+
+/**
+ * Create a weighted trials comparison grader with custom weights.
+ *
+ * @param weights - Weight configuration for comparison dimensions
+ * @returns Trials comparison grader function
+ *
+ * @public
+ */
+export const createTrialsWeightedGrader = (weights: TrialsWeights = DEFAULT_TRIALS_WEIGHTS): TrialsComparisonGrader => {
+  return async ({ runs }: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
+    const scores = Object.entries(runs).map(([label, run]) => {
+      // Capability score: passAtK (0-1)
+      const capabilityScore = run.passAtK ?? 0
+
+      // Reliability score: passExpK (0-1)
+      const reliabilityScore = run.passExpK ?? 0
+
+      // Consistency score: 1 - flakiness
+      // Flakiness = passAtK - passExpK (how much gap between capability and reliability)
+      const flakiness = Math.max(0, capabilityScore - reliabilityScore)
+      const consistencyScore = 1 - flakiness
+
+      // Weighted combination
+      const weighted =
+        capabilityScore * weights.capability +
+        reliabilityScore * weights.reliability +
+        consistencyScore * weights.consistency
+
+      return { label, weighted, capabilityScore, reliabilityScore, consistencyScore, flakiness }
+    })
+
+    // Sort by weighted score descending (highest = best)
+    const sorted = scores.sort((a, b) => b.weighted - a.weighted)
+
+    return {
+      rankings: sorted.map((s, i) => ({
+        run: s.label,
+        rank: i + 1,
+        score: s.weighted,
+      })),
+      reasoning: `Weighted trials: capability=${weights.capability}, reliability=${weights.reliability}, consistency=${weights.consistency}`,
+    }
+  }
+}
+
+/**
+ * Default weighted trials comparison grader using environment or default weights.
+ *
+ * @remarks
+ * This is the default grader used when `--strategy weighted` is specified
+ * for trials format comparison.
+ *
+ * @public
+ */
+export const grade: TrialsComparisonGrader = async (
+  input: TrialsComparisonGraderInput,
+): Promise<ComparisonGraderResult> => {
+  const weights = getTrialsWeightsFromEnv()
+  const grader = createTrialsWeightedGrader(weights)
+  return grader(input)
+}
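Finally, a sketch of the two ways to configure the weighted grader: explicit weights, or the documented environment variables. The import path and weight values are illustrative:

import { createTrialsWeightedGrader, getTrialsWeightsFromEnv } from './trials-compare-weighted.ts' // path assumed

// Option 1: explicit weights. Keeping them summing to 1.0 keeps weighted
// scores on a 0-1 scale (the spec above asserts the defaults sum to 1.0).
const capabilityHeavy = createTrialsWeightedGrader({ capability: 0.6, reliability: 0.3, consistency: 0.1 })

// Option 2: environment variables, the same path the default `grade` export takes.
// Invalid or negative values fall back to the 0.4/0.4/0.2 defaults.
process.env.COMPARE_CAPABILITY = '0.6'
process.env.COMPARE_RELIABILITY = '0.3'
process.env.COMPARE_CONSISTENCY = '0.1'
const fromEnv = createTrialsWeightedGrader(getTrialsWeightsFromEnv())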