@plaited/agent-eval-harness 0.6.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +146 -6
- package/package.json +1 -1
- package/src/commands/capture.ts +9 -1
- package/src/commands/trials.ts +6 -0
- package/src/graders/tests/trials-compare-graders.spec.ts +358 -0
- package/src/graders/trials-compare-statistical.ts +188 -0
- package/src/graders/trials-compare-weighted.ts +128 -0
- package/src/graders.ts +21 -1
- package/src/pipeline/compare-format-detection.ts +100 -0
- package/src/pipeline/compare-trials.ts +596 -0
- package/src/pipeline/compare.ts +75 -19
- package/src/pipeline/grade.ts +6 -0
- package/src/pipeline/pipeline.types.ts +57 -1
- package/src/pipeline/tests/compare-format-detection.spec.ts +142 -0
- package/src/pipeline/tests/compare-trials.spec.ts +277 -0
- package/src/schemas/grader-loader.ts +4 -0
- package/src/schemas/schemas.ts +161 -0
- package/src/schemas/tests/fixtures/grader-git.ts +116 -0
- package/src/schemas/tests/grader-git.spec.ts +222 -0
- package/src/schemas.ts +13 -0
package/README.md
CHANGED
@@ -78,6 +78,9 @@ cat prompts.jsonl | \
 
 # Compare runs (built-in strategies: weighted, statistical, custom)
 bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
+
+# Compare trials for pass@k reliability analysis (auto-detects format)
+bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
 ```
 
 ## Skills for AI Agents
@@ -184,11 +187,68 @@ Key fields:
 
 ## Graders
 
-Graders score agent outputs. The harness supports two types:
+Graders score agent outputs. The harness supports two types and two grading approaches:
+
+### Git-Based Outcome Grading (Recommended for Coding Agents)
+
+**Grade outcomes, not paths.** Use git to detect actual environmental changes:
+
+```typescript
+import type { Grader } from '@plaited/agent-eval-harness/schemas'
+import { resolve } from 'node:path'
+
+export const grade: Grader = async ({ output, hint, cwd }) => {
+  // Validate cwd to prevent command injection
+  const isValidPath = (path: string): boolean => {
+    const dangerousChars = /[;&|`$(){}[\]<>'"\\]/
+    if (dangerousChars.test(path)) return false
+    if (path.includes('..') || path.startsWith('-')) return false
+    return true
+  }
+
+  if (!cwd || !isValidPath(cwd)) {
+    return {
+      pass: false,
+      score: 0,
+      reasoning: 'Invalid working directory path'
+    }
+  }
+
+  const safeCwd = resolve(cwd)
+
+  // Detect file changes using git
+  const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text()
+  const filesCreated = status
+    .split('\n')
+    .filter(line => line.startsWith('??'))
+    .map(line => line.slice(3).trim())
+
+  // Run tests to verify outcome
+  const testResult = await Bun.$`cd ${safeCwd} && bun test`.nothrow()
+  const testsPassed = testResult.exitCode === 0
+
+  return {
+    pass: filesCreated.length > 0 && testsPassed,
+    score: testsPassed ? 1.0 : 0.0,
+    reasoning: `Files created: ${filesCreated.join(', ')}. Tests: ${testsPassed ? 'pass' : 'fail'}`,
+    outcome: { // Optional: structured data for analysis
+      filesCreated,
+      testsPassed,
+      type: 'file_creation_with_tests'
+    }
+  }
+}
+```
+
+**Benefits:**
+- Detects actual file changes, test results, build success
+- Works universally in any git repo, any language
+- Returns structured `outcome` data for downstream analysis
+- Zero configuration required
 
-###
+### Output-Based Grading (General Purpose)
 
-
+For non-coding tasks or when git is unavailable:
 
 ```typescript
 import type { Grader } from '@plaited/agent-eval-harness/schemas'
@@ -215,11 +275,62 @@ Any executable script using stdin/stdout JSON protocol:
 #!/usr/bin/env python3
 import json
 import sys
+import subprocess
+import re
+import os
 
 data = json.load(sys.stdin)
 output = data["output"].lower()
 hint = (data.get("hint") or "").lower()
-
+cwd = data.get("cwd")
+
+# Validate cwd to prevent command injection
+def is_valid_path(path):
+    if not path:
+        return False
+    # Reject shell metacharacters
+    if re.search(r'[;&|`$(){}\[\]<>\'"\\]', path):
+        return False
+    # Reject directory traversal and option injection
+    if '..' in path or path.startswith('-'):
+        return False
+    return True
+
+# Git-based grading if cwd is provided
+if cwd:
+    if not is_valid_path(cwd):
+        print(json.dumps({
+            "pass": False,
+            "score": 0.0,
+            "reasoning": "Invalid working directory path"
+        }))
+        sys.exit(0)
+
+    safe_cwd = os.path.abspath(cwd)
+
+    try:
+        result = subprocess.run(
+            ["git", "-C", safe_cwd, "status", "--porcelain"],
+            capture_output=True, text=True, check=True
+        )
+        files_created = [
+            line[3:].strip()
+            for line in result.stdout.split('\n')
+            if line.startswith('??')
+        ]
+        has_changes = len(files_created) > 0
+        print(json.dumps({
+            "pass": has_changes,
+            "score": 1.0 if has_changes else 0.0,
+            "reasoning": f"Files created: {', '.join(files_created)}",
+            "outcome": {"filesCreated": files_created, "type": "git_check"}
+        }))
+        sys.exit(0)
+    except subprocess.CalledProcessError:
+        # Fall back to output-based grading
+        pass
+
+# Output-based grading fallback
 pass_result = hint in output if hint else True
 print(json.dumps({
     "pass": pass_result,
@@ -234,11 +345,14 @@ agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grade
 ```
 
 **Protocol:**
-- Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
-- Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "..."}`
+- Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...], "cwd": "/path/to/dir"}`
+- Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "...", "outcome": {...}}`
+- `cwd` and `outcome` are optional fields
 
 ## Downstream Integration
 
+The harness outputs standard JSONL. When graders return the optional `outcome` field, it's merged onto results for powerful downstream analysis:
+
 ```bash
 # Filter failures
 cat results.jsonl | jq 'select(.score.pass == false)'
@@ -246,10 +360,36 @@ cat results.jsonl | jq 'select(.score.pass == false)'
 # Extract tool usage patterns
 cat results.jsonl | jq '.trajectory[] | select(.type == "tool_call") | .name'
 
+# Analyze outcomes from git-based graders
+cat results.jsonl | jq 'select(.outcome.type == "test_execution")'
+cat results.jsonl | jq -s 'map(select(.outcome.testsPassed)) | length'
+cat results.jsonl | jq 'select(.outcome.touchedCriticalFiles == true)'
+
 # Use with your scoring pipeline
 cat results.jsonl | your-scoring-script.ts
 ```
 
+### Outcome Field
+
+Git-based graders can return structured `outcome` data:
+
+```jsonl
+{
+  "id": "fix-tests",
+  "input": "Fix the failing authentication tests",
+  "output": "I fixed the auth tests by...",
+  "score": {"pass": true, "score": 1.0, "reasoning": "Tests pass"},
+  "outcome": {
+    "testsPassed": true,
+    "filesModified": ["src/auth.ts", "src/auth.spec.ts"],
+    "exitCode": 0,
+    "type": "test_execution"
+  }
+}
+```
+
+This enables rich analysis across evaluations without re-parsing trajectories.
+
 ## Development
 
 ```bash
package/package.json
CHANGED
package/src/commands/capture.ts
CHANGED
@@ -225,13 +225,21 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
 
     // Apply grader if provided
     if (grader) {
-      result.score = await grader({
+      const graderResult = await grader({
         input: promptCase.input,
         output,
         hint: promptCase.hint,
         trajectory,
         metadata: promptCase.metadata,
+        cwd: session.cwd,
       })
+
+      result.score = graderResult
+
+      // Merge outcome from grader if present
+      if (graderResult.outcome) {
+        result.outcome = graderResult.outcome
+      }
     }
 
     // Clean up session
package/src/commands/trials.ts
CHANGED
@@ -217,10 +217,16 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
         hint: promptCase.hint,
         trajectory,
         metadata: promptCase.metadata,
+        cwd: session.cwd,
       })
       entry.pass = graderResult.pass
       entry.score = graderResult.score
       entry.reasoning = graderResult.reasoning
+
+      // Merge outcome from grader if present
+      if (graderResult.outcome) {
+        entry.outcome = graderResult.outcome
+      }
     }
 
     trialEntries.push(entry)
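The per-trial `pass` flags recorded here feed the `passAtK` and `passExpK` metrics consumed by the new comparison graders (tested below). For orientation, a sketch of the standard estimators for `n` trials with `c` passes; this mirrors the widely used unbiased pass@k estimator, and the package's own implementation may differ in detail:

```typescript
// pass@k: chance that at least one of k sampled trials passes,
// i.e. 1 - C(n-c, k)/C(n, k), computed multiplicatively for stability.
const passAtK = (n: number, c: number, k: number): number => {
  if (n - c < k) return 1 // too few failures to fill a k-sample without a pass
  let allFail = 1
  for (let i = n - c + 1; i <= n; i++) allFail *= 1 - k / i
  return 1 - allFail
}

// pass^k ("passExpK"): chance that k independent samples all pass.
const passExpK = (n: number, c: number, k: number): number => (c / n) ** k

console.log(passAtK(5, 2, 3)) // ≈ 0.9
console.log(passExpK(5, 2, 3)) // ≈ 0.064
```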
package/src/graders/tests/trials-compare-graders.spec.ts
ADDED
@@ -0,0 +1,358 @@
+/**
+ * Unit tests for built-in trials comparison graders.
+ *
+ * @remarks
+ * Tests for:
+ * - trials-compare-weighted: Configurable weight grader for trials
+ * - trials-compare-statistical: Bootstrap confidence interval grader for trials
+ *
+ * @packageDocumentation
+ */
+
+import { describe, expect, test } from 'bun:test'
+import type { TrialsComparisonGraderInput, TrialsComparisonRunData } from '../../pipeline/pipeline.types.ts'
+import { createTrialsStatisticalGrader, grade as statisticalGrade } from '../trials-compare-statistical.ts'
+import { createTrialsWeightedGrader, DEFAULT_TRIALS_WEIGHTS, type TrialsWeights } from '../trials-compare-weighted.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const createMockTrialRuns = (
+  overrides: Partial<Record<string, Partial<TrialsComparisonRunData>>> = {},
+): Record<string, TrialsComparisonRunData> => ({
+  baseline: {
+    passRate: 0.67,
+    passAtK: 0.9,
+    passExpK: 0.3,
+    k: 3,
+    trials: [
+      { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true, score: 1.0 },
+      { trialNum: 2, output: 'B', trajectory: [], duration: 110, pass: true, score: 0.9 },
+      { trialNum: 3, output: 'C', trajectory: [], duration: 120, pass: false, score: 0.2 },
+    ],
+    ...overrides.baseline,
+  },
+  variant: {
+    passRate: 1.0,
+    passAtK: 1.0,
+    passExpK: 1.0,
+    k: 3,
+    trials: [
+      { trialNum: 1, output: 'X', trajectory: [], duration: 150, pass: true, score: 1.0 },
+      { trialNum: 2, output: 'Y', trajectory: [], duration: 160, pass: true, score: 1.0 },
+      { trialNum: 3, output: 'Z', trajectory: [], duration: 170, pass: true, score: 1.0 },
+    ],
+    ...overrides.variant,
+  },
+})
+
+const createMockTrialInput = (runs: Record<string, TrialsComparisonRunData>): TrialsComparisonGraderInput => ({
+  id: 'test-001',
+  input: 'Test prompt',
+  hint: 'Expected output',
+  runs,
+})
+
+// ============================================================================
+// Weighted Grader Tests
+// ============================================================================
+
+describe('trials-compare-weighted grader', () => {
+  describe('DEFAULT_TRIALS_WEIGHTS', () => {
+    test('has expected default values', () => {
+      expect(DEFAULT_TRIALS_WEIGHTS.capability).toBe(0.4)
+      expect(DEFAULT_TRIALS_WEIGHTS.reliability).toBe(0.4)
+      expect(DEFAULT_TRIALS_WEIGHTS.consistency).toBe(0.2)
+    })
+
+    test('weights sum to 1.0', () => {
+      const sum =
+        DEFAULT_TRIALS_WEIGHTS.capability + DEFAULT_TRIALS_WEIGHTS.reliability + DEFAULT_TRIALS_WEIGHTS.consistency
+      expect(sum).toBe(1.0)
+    })
+  })
+
+  describe('createTrialsWeightedGrader', () => {
+    test('returns higher rank for better passAtK when capability weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 1.0, reliability: 0.0, consistency: 0.0 })
+      const runs = createMockTrialRuns({
+        baseline: { passAtK: 0.7 },
+        variant: { passAtK: 0.95 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+      expect(result.rankings[0]?.rank).toBe(1)
+    })
+
+    test('returns higher rank for better passExpK when reliability weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 1.0, consistency: 0.0 })
+      const runs = createMockTrialRuns({
+        baseline: { passExpK: 0.9 },
+        variant: { passExpK: 0.3 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings[0]?.run).toBe('baseline')
+    })
+
+    test('penalizes flaky runs when consistency weight is high', async () => {
+      const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
+      const runs = createMockTrialRuns({
+        // baseline: passAtK=0.9, passExpK=0.3, flakiness=0.6
+        baseline: { passAtK: 0.9, passExpK: 0.3 },
+        // variant: passAtK=0.8, passExpK=0.8, flakiness=0.0
+        variant: { passAtK: 0.8, passExpK: 0.8 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Variant should win due to lower flakiness (higher consistency)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('includes weights in reasoning', async () => {
+      const weights: TrialsWeights = { capability: 0.5, reliability: 0.3, consistency: 0.2 }
+      const grader = createTrialsWeightedGrader(weights)
+      const input = createMockTrialInput(createMockTrialRuns())
+
+      const result = await grader(input)
+
+      expect(result.reasoning).toContain('capability=0.5')
+      expect(result.reasoning).toContain('reliability=0.3')
+      expect(result.reasoning).toContain('consistency=0.2')
+    })
+
+    test('handles missing passAtK gracefully (treats as 0)', async () => {
+      const grader = createTrialsWeightedGrader()
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          k: 3,
+          trials: [],
+        },
+        variant: {
+          passAtK: 0.8,
+          passExpK: 0.5,
+          k: 3,
+          trials: [],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Should not throw, variant should rank higher
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('handles three or more runs', async () => {
+      const grader = createTrialsWeightedGrader()
+      const runs: Record<string, TrialsComparisonRunData> = {
+        a: { passAtK: 0.9, passExpK: 0.8, k: 3, trials: [] },
+        b: { passAtK: 0.7, passExpK: 0.7, k: 3, trials: [] },
+        c: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(3)
+      // Ranks should be 1, 2, 3
+      expect(result.rankings.map((r) => r.rank)).toEqual([1, 2, 3])
+    })
+  })
+})
+
+// ============================================================================
+// Statistical Grader Tests
+// ============================================================================
+
+describe('trials-compare-statistical grader', () => {
+  describe('createTrialsStatisticalGrader', () => {
+    test('returns rankings based on bootstrapped passAtK', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      const runs = createMockTrialRuns({
+        baseline: { passAtK: 0.6 },
+        variant: { passAtK: 0.95 },
+      })
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.rankings.length).toBe(2)
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('uses trial outcomes for bootstrap variance estimation', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      // All trials pass for variant, mixed for baseline
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          passAtK: 0.9,
+          passExpK: 0.3,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
+            { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
+          ],
+        },
+        variant: {
+          passAtK: 1.0,
+          passExpK: 1.0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
+            { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
+          ],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Variant with 100% pass rate should rank higher
+      expect(result.rankings[0]?.run).toBe('variant')
+    })
+
+    test('indicates significance when passAtK differs substantially', async () => {
+      const grader = createTrialsStatisticalGrader(500)
+      // Strong difference: all pass vs all fail
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: {
+          passAtK: 0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: false },
+            { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: false },
+            { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
+            { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: false },
+            { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
+          ],
+        },
+        variant: {
+          passAtK: 1.0,
+          k: 5,
+          trials: [
+            { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
+            { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
+            { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
+            { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
+            { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
+          ],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      expect(result.reasoning).toContain('clear separation')
+    })
+
+    test('handles empty trials array', async () => {
+      const grader = createTrialsStatisticalGrader(100)
+      const runs: Record<string, TrialsComparisonRunData> = {
+        baseline: { k: 3, trials: [] },
+        variant: {
+          k: 3,
+          trials: [{ trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true }],
+        },
+      }
+      const input = createMockTrialInput(runs)
+
+      const result = await grader(input)
+
+      // Should not throw
+      expect(result.rankings.length).toBe(2)
+    })
+  })
+
+  describe('grade function', () => {
+    test('works with default iterations', async () => {
+      const runs = createMockTrialRuns()
+      const input = createMockTrialInput(runs)
+
+      const result = await statisticalGrade(input)
+
+      expect(result.rankings).toBeDefined()
+      expect(result.rankings.length).toBe(2)
+    })
+  })
+})
+
+// ============================================================================
+// Edge Case Tests
+// ============================================================================
+
+describe('trials comparison grader edge cases', () => {
+  test('handles single run gracefully', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs: Record<string, TrialsComparisonRunData> = {
+      only: { passAtK: 1.0, passExpK: 0.8, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    expect(result.rankings.length).toBe(1)
+    expect(result.rankings[0]?.rank).toBe(1)
+  })
+
+  test('handles zero passAtK and passExpK', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs: Record<string, TrialsComparisonRunData> = {
+      baseline: { passAtK: 0, passExpK: 0, k: 3, trials: [] },
+      variant: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    expect(result.rankings[0]?.run).toBe('variant')
+  })
+
+  test('deterministic ordering for equal scores', async () => {
+    const grader = createTrialsWeightedGrader()
+    const runs = createMockTrialRuns({
+      baseline: { passAtK: 0.8, passExpK: 0.6 },
+      variant: { passAtK: 0.8, passExpK: 0.6 },
+    })
+    const input = createMockTrialInput(runs)
+
+    // Run multiple times to check stability
+    const results = await Promise.all([grader(input), grader(input), grader(input)])
+
+    // All should have same ordering
+    const orders = results.map((r) => r.rankings.map((rank) => rank.run).join(','))
+    expect(new Set(orders).size).toBe(1)
+  })
+
+  test('flakiness is clamped to non-negative', async () => {
+    // Edge case: passExpK > passAtK shouldn't happen but handle gracefully
+    const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
+    const runs: Record<string, TrialsComparisonRunData> = {
+      baseline: { passAtK: 0.5, passExpK: 0.7, k: 3, trials: [] }, // Invalid but should work
+      variant: { passAtK: 0.8, passExpK: 0.8, k: 3, trials: [] },
+    }
+    const input = createMockTrialInput(runs)
+
+    const result = await grader(input)
+
+    // Both should have flakiness 0, so consistency score should be 1.0 for both
+    // Variant has higher capability/reliability so it wins on tiebreaker
+    expect(result.rankings).toBeDefined()
+  })
+})