@plaited/agent-eval-harness 0.6.2 → 0.8.0

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
package/README.md CHANGED
@@ -78,6 +78,9 @@ cat prompts.jsonl | \
 
  # Compare runs (built-in strategies: weighted, statistical, custom)
  bunx @plaited/agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
+
+ # Compare trials for pass@k reliability analysis (auto-detects format)
+ bunx @plaited/agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
  ```
 
  ## Skills for AI Agents
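
The trials comparison ranks runs on pass@k-style reliability metrics (the `passAtK` and `passExpK` fields that appear later in this diff). The exact estimators are not shown here; for intuition, a minimal sketch assuming k independent trials with empirical per-trial pass rate p:

```typescript
// Sketch only: the harness's real estimators are not part of this diff.
// pass@k: probability that at least one of k attempts passes ("capability")
// pass^k: probability that all k attempts pass ("reliability")
const passMetrics = (passes: number, trials: number, k: number) => {
  const p = passes / trials // empirical per-trial pass rate
  return {
    passAtK: 1 - (1 - p) ** k,
    passExpK: p ** k,
  }
}

// 2 passes out of 3 trials at k = 3 gives passAtK ≈ 0.96 but passExpK ≈ 0.30:
// capable but flaky, which is exactly the gap a trials comparison surfaces
console.log(passMetrics(2, 3, 3))
```
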
@@ -184,11 +187,68 @@ Key fields:
 
  ## Graders
 
- Graders score agent outputs. The harness supports two types:
+ Graders score agent outputs. The harness supports two grader types (TypeScript/JavaScript modules and executable scripts) and two grading approaches:
+
+ ### Git-Based Outcome Grading (Recommended for Coding Agents)
+
+ **Grade outcomes, not paths.** Use git to detect actual changes to the environment:
+
+ ```typescript
+ import type { Grader } from '@plaited/agent-eval-harness/schemas'
+ import { resolve } from 'node:path'
+
+ export const grade: Grader = async ({ output, hint, cwd }) => {
+   // Validate cwd to prevent command injection
+   const isValidPath = (path: string): boolean => {
+     const dangerousChars = /[;&|`$(){}[\]<>'"\\]/
+     if (dangerousChars.test(path)) return false
+     if (path.includes('..') || path.startsWith('-')) return false
+     return true
+   }
+
+   if (!cwd || !isValidPath(cwd)) {
+     return {
+       pass: false,
+       score: 0,
+       reasoning: 'Invalid working directory path'
+     }
+   }
+
+   const safeCwd = resolve(cwd)
+
+   // Detect file changes using git
+   const status = await Bun.$`git -C ${safeCwd} status --porcelain`.text()
+   const filesCreated = status
+     .split('\n')
+     .filter(line => line.startsWith('??'))
+     .map(line => line.slice(3).trim())
+
+   // Run tests to verify outcome
+   const testResult = await Bun.$`cd ${safeCwd} && bun test`.nothrow()
+   const testsPassed = testResult.exitCode === 0
+
+   return {
+     pass: filesCreated.length > 0 && testsPassed,
+     score: testsPassed ? 1.0 : 0.0,
+     reasoning: `Files created: ${filesCreated.join(', ')}. Tests: ${testsPassed ? 'pass' : 'fail'}`,
+     outcome: { // Optional: structured data for analysis
+       filesCreated,
+       testsPassed,
+       type: 'file_creation_with_tests'
+     }
+   }
+ }
+ ```
+
+ **Benefits:**
+ - Detects actual file changes, test results, and build success
+ - Works in any git repo, in any language
+ - Returns structured `outcome` data for downstream analysis
+ - Zero configuration required
 
- ### TypeScript/JavaScript Graders
+ ### Output-Based Grading (General Purpose)
 
- Export a `grade` function:
+ For non-coding tasks or when git is unavailable:
 
  ```typescript
  import type { Grader } from '@plaited/agent-eval-harness/schemas'
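
Given the `Grader` signature above, a grader module can be smoke-tested outside the harness by importing and calling it directly. A minimal sketch, assuming a hypothetical `./grader.ts` path and sample values:

```typescript
import { grade } from './grader.ts' // hypothetical path to the git-based grader above

// cwd must point at a git checkout for the `git status --porcelain` check to work
const result = await grade({
  input: 'Create a math module with tests',
  output: 'Created math.ts and math.spec.ts',
  hint: undefined,
  trajectory: [],
  metadata: undefined,
  cwd: process.cwd(),
})

console.log(result) // { pass, score, reasoning, outcome? }
```
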
@@ -215,11 +275,62 @@ Any executable script using stdin/stdout JSON protocol:
  #!/usr/bin/env python3
  import json
  import sys
+ import subprocess
+ import re
+ import os
 
  data = json.load(sys.stdin)
  output = data["output"].lower()
  hint = (data.get("hint") or "").lower()
-
+ cwd = data.get("cwd")
+
+ # Validate cwd to prevent command injection
+ def is_valid_path(path):
+     if not path:
+         return False
+     # Reject shell metacharacters
+     if re.search(r'[;&|`$(){}\[\]<>\'"\\]', path):
+         return False
+     # Reject directory traversal and option injection
+     if '..' in path or path.startswith('-'):
+         return False
+     return True
+
+ # Git-based grading if cwd is provided
+ if cwd:
+     if not is_valid_path(cwd):
+         print(json.dumps({
+             "pass": False,
+             "score": 0.0,
+             "reasoning": "Invalid working directory path"
+         }))
+         sys.exit(0)
+
+     safe_cwd = os.path.abspath(cwd)
+
+     try:
+         result = subprocess.run(
+             ["git", "-C", safe_cwd, "status", "--porcelain"],
+             capture_output=True, text=True, check=True
+         )
+         files_created = [
+             line[3:].strip()
+             for line in result.stdout.split('\n')
+             if line.startswith('??')
+         ]
+         has_changes = len(files_created) > 0
+         print(json.dumps({
+             "pass": has_changes,
+             "score": 1.0 if has_changes else 0.0,
+             "reasoning": f"Files created: {', '.join(files_created)}",
+             "outcome": {"filesCreated": files_created, "type": "git_check"}
+         }))
+         sys.exit(0)
+     except subprocess.CalledProcessError:
+         # Fall back to output-based grading
+         pass
+
+ # Output-based grading fallback
  pass_result = hint in output if hint else True
  print(json.dumps({
      "pass": pass_result,
@@ -234,11 +345,14 @@ agent-eval-harness capture prompts.jsonl --schema ./claude.json --grader ./grade
  ```
 
  **Protocol:**
- - Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...]}`
- - Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "..."}`
+ - Input (stdin): `{"input": "...", "output": "...", "hint": "...", "trajectory": [...], "cwd": "/path/to/dir"}`
+ - Output (stdout): `{"pass": true, "score": 1.0, "reasoning": "...", "outcome": {...}}`
+ - `cwd` and `outcome` are optional fields
 
  ## Downstream Integration
 
+ The harness outputs standard JSONL. When a grader returns the optional `outcome` field, it is merged onto each result for downstream analysis:
+
  ```bash
  # Filter failures
  cat results.jsonl | jq 'select(.score.pass == false)'
@@ -246,10 +360,36 @@ cat results.jsonl | jq 'select(.score.pass == false)'
  # Extract tool usage patterns
  cat results.jsonl | jq '.trajectory[] | select(.type == "tool_call") | .name'
 
+ # Analyze outcomes from git-based graders
+ cat results.jsonl | jq 'select(.outcome.type == "test_execution")'
+ cat results.jsonl | jq -s 'map(select(.outcome.testsPassed)) | length'
+ cat results.jsonl | jq 'select(.outcome.touchedCriticalFiles == true)'
+
  # Use with your scoring pipeline
  cat results.jsonl | your-scoring-script.ts
  ```
 
+ ### Outcome Field
+
+ Git-based graders can return structured `outcome` data. Each result is a single JSONL line, pretty-printed here for readability:
+
+ ```json
+ {
+   "id": "fix-tests",
+   "input": "Fix the failing authentication tests",
+   "output": "I fixed the auth tests by...",
+   "score": {"pass": true, "score": 1.0, "reasoning": "Tests pass"},
+   "outcome": {
+     "testsPassed": true,
+     "filesModified": ["src/auth.ts", "src/auth.spec.ts"],
+     "exitCode": 0,
+     "type": "test_execution"
+   }
+ }
+ ```
+
+ This enables rich analysis across evaluations without re-parsing trajectories.
+
  ## Development
 
  ```bash
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@plaited/agent-eval-harness",
-   "version": "0.6.2",
+   "version": "0.8.0",
    "description": "CLI tool for capturing agent trajectories from headless CLI agents",
    "license": "ISC",
    "engines": {
@@ -225,13 +225,21 @@ export const runCapture = async (config: CaptureConfig): Promise<CaptureResult[]
 
    // Apply grader if provided
    if (grader) {
-     result.score = await grader({
+     const graderResult = await grader({
        input: promptCase.input,
        output,
        hint: promptCase.hint,
        trajectory,
        metadata: promptCase.metadata,
+       cwd: session.cwd,
      })
+
+     result.score = graderResult
+
+     // Merge outcome from grader if present
+     if (graderResult.outcome) {
+       result.outcome = graderResult.outcome
+     }
    }
 
    // Clean up session
@@ -217,10 +217,16 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
        hint: promptCase.hint,
        trajectory,
        metadata: promptCase.metadata,
+       cwd: session.cwd,
      })
      entry.pass = graderResult.pass
      entry.score = graderResult.score
      entry.reasoning = graderResult.reasoning
+
+     // Merge outcome from grader if present
+     if (graderResult.outcome) {
+       entry.outcome = graderResult.outcome
+     }
    }
 
    trialEntries.push(entry)
@@ -0,0 +1,358 @@
+ /**
+  * Unit tests for built-in trials comparison graders.
+  *
+  * @remarks
+  * Tests for:
+  * - trials-compare-weighted: Configurable weight grader for trials
+  * - trials-compare-statistical: Bootstrap confidence interval grader for trials
+  *
+  * @packageDocumentation
+  */
+
+ import { describe, expect, test } from 'bun:test'
+ import type { TrialsComparisonGraderInput, TrialsComparisonRunData } from '../../pipeline/pipeline.types.ts'
+ import { createTrialsStatisticalGrader, grade as statisticalGrade } from '../trials-compare-statistical.ts'
+ import { createTrialsWeightedGrader, DEFAULT_TRIALS_WEIGHTS, type TrialsWeights } from '../trials-compare-weighted.ts'
+
+ // ============================================================================
+ // Test Fixtures
+ // ============================================================================
+
+ const createMockTrialRuns = (
+   overrides: Partial<Record<string, Partial<TrialsComparisonRunData>>> = {},
+ ): Record<string, TrialsComparisonRunData> => ({
+   baseline: {
+     passRate: 0.67,
+     passAtK: 0.9,
+     passExpK: 0.3,
+     k: 3,
+     trials: [
+       { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true, score: 1.0 },
+       { trialNum: 2, output: 'B', trajectory: [], duration: 110, pass: true, score: 0.9 },
+       { trialNum: 3, output: 'C', trajectory: [], duration: 120, pass: false, score: 0.2 },
+     ],
+     ...overrides.baseline,
+   },
+   variant: {
+     passRate: 1.0,
+     passAtK: 1.0,
+     passExpK: 1.0,
+     k: 3,
+     trials: [
+       { trialNum: 1, output: 'X', trajectory: [], duration: 150, pass: true, score: 1.0 },
+       { trialNum: 2, output: 'Y', trajectory: [], duration: 160, pass: true, score: 1.0 },
+       { trialNum: 3, output: 'Z', trajectory: [], duration: 170, pass: true, score: 1.0 },
+     ],
+     ...overrides.variant,
+   },
+ })
+
+ const createMockTrialInput = (runs: Record<string, TrialsComparisonRunData>): TrialsComparisonGraderInput => ({
+   id: 'test-001',
+   input: 'Test prompt',
+   hint: 'Expected output',
+   runs,
+ })
+
+ // ============================================================================
+ // Weighted Grader Tests
+ // ============================================================================
+
+ describe('trials-compare-weighted grader', () => {
+   describe('DEFAULT_TRIALS_WEIGHTS', () => {
+     test('has expected default values', () => {
+       expect(DEFAULT_TRIALS_WEIGHTS.capability).toBe(0.4)
+       expect(DEFAULT_TRIALS_WEIGHTS.reliability).toBe(0.4)
+       expect(DEFAULT_TRIALS_WEIGHTS.consistency).toBe(0.2)
+     })
+
+     test('weights sum to 1.0', () => {
+       const sum =
+         DEFAULT_TRIALS_WEIGHTS.capability + DEFAULT_TRIALS_WEIGHTS.reliability + DEFAULT_TRIALS_WEIGHTS.consistency
+       expect(sum).toBe(1.0)
+     })
+   })
+
+   describe('createTrialsWeightedGrader', () => {
+     test('returns higher rank for better passAtK when capability weight is high', async () => {
+       const grader = createTrialsWeightedGrader({ capability: 1.0, reliability: 0.0, consistency: 0.0 })
+       const runs = createMockTrialRuns({
+         baseline: { passAtK: 0.7 },
+         variant: { passAtK: 0.95 },
+       })
+       const input = createMockTrialInput(runs)
+
+       const result = await grader(input)
+
+       expect(result.rankings.length).toBe(2)
+       expect(result.rankings[0]?.run).toBe('variant')
+       expect(result.rankings[0]?.rank).toBe(1)
+     })
+
+     test('returns higher rank for better passExpK when reliability weight is high', async () => {
+       const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 1.0, consistency: 0.0 })
+       const runs = createMockTrialRuns({
+         baseline: { passExpK: 0.9 },
+         variant: { passExpK: 0.3 },
+       })
+       const input = createMockTrialInput(runs)
+
+       const result = await grader(input)
+
+       expect(result.rankings[0]?.run).toBe('baseline')
+     })
+
+     test('penalizes flaky runs when consistency weight is high', async () => {
+       const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
+       const runs = createMockTrialRuns({
+         // baseline: passAtK=0.9, passExpK=0.3, flakiness=0.6
+         baseline: { passAtK: 0.9, passExpK: 0.3 },
+         // variant: passAtK=0.8, passExpK=0.8, flakiness=0.0
+         variant: { passAtK: 0.8, passExpK: 0.8 },
+       })
+       const input = createMockTrialInput(runs)
+
+       const result = await grader(input)
+
+       // Variant should win due to lower flakiness (higher consistency)
+       expect(result.rankings[0]?.run).toBe('variant')
+     })
+
+     test('includes weights in reasoning', async () => {
+       const weights: TrialsWeights = { capability: 0.5, reliability: 0.3, consistency: 0.2 }
+       const grader = createTrialsWeightedGrader(weights)
+       const input = createMockTrialInput(createMockTrialRuns())
+
+       const result = await grader(input)
+
+       expect(result.reasoning).toContain('capability=0.5')
+       expect(result.reasoning).toContain('reliability=0.3')
+       expect(result.reasoning).toContain('consistency=0.2')
+     })
+
+     test('handles missing passAtK gracefully (treats as 0)', async () => {
+       const grader = createTrialsWeightedGrader()
+       const runs: Record<string, TrialsComparisonRunData> = {
+         baseline: {
+           k: 3,
+           trials: [],
+         },
+         variant: {
+           passAtK: 0.8,
+           passExpK: 0.5,
+           k: 3,
+           trials: [],
+         },
+       }
+       const input = createMockTrialInput(runs)
+
+       const result = await grader(input)
+
+       // Should not throw, variant should rank higher
+       expect(result.rankings.length).toBe(2)
+       expect(result.rankings[0]?.run).toBe('variant')
+     })
+
+     test('handles three or more runs', async () => {
+       const grader = createTrialsWeightedGrader()
+       const runs: Record<string, TrialsComparisonRunData> = {
+         a: { passAtK: 0.9, passExpK: 0.8, k: 3, trials: [] },
+         b: { passAtK: 0.7, passExpK: 0.7, k: 3, trials: [] },
+         c: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
+       }
+       const input = createMockTrialInput(runs)
+
+       const result = await grader(input)
+
+       expect(result.rankings.length).toBe(3)
+       // Ranks should be 1, 2, 3
+       expect(result.rankings.map((r) => r.rank)).toEqual([1, 2, 3])
+     })
+   })
+ })
+
+ // ============================================================================
+ // Statistical Grader Tests
+ // ============================================================================
+
+ describe('trials-compare-statistical grader', () => {
+   describe('createTrialsStatisticalGrader', () => {
+     test('returns rankings based on bootstrapped passAtK', async () => {
+       const grader = createTrialsStatisticalGrader(100)
+       const runs = createMockTrialRuns({
+         baseline: { passAtK: 0.6 },
+         variant: { passAtK: 0.95 },
+       })
+       const input = createMockTrialInput(runs)
+
+       const result = await grader(input)
+
+       expect(result.rankings.length).toBe(2)
+       expect(result.rankings[0]?.run).toBe('variant')
+     })
+
+     test('uses trial outcomes for bootstrap variance estimation', async () => {
+       const grader = createTrialsStatisticalGrader(100)
+       // All trials pass for variant, mixed for baseline
+       const runs: Record<string, TrialsComparisonRunData> = {
+         baseline: {
+           passAtK: 0.9,
+           passExpK: 0.3,
+           k: 5,
+           trials: [
+             { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: true },
+             { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: true },
+             { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
+             { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: true },
+             { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
+           ],
+         },
+         variant: {
+           passAtK: 1.0,
+           passExpK: 1.0,
+           k: 5,
+           trials: [
+             { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
+             { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
+             { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
+             { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
+             { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
+           ],
+         },
+       }
+       const input = createMockTrialInput(runs)
+
+       const result = await grader(input)
+
+       // Variant with 100% pass rate should rank higher
+       expect(result.rankings[0]?.run).toBe('variant')
+     })
+
+     test('indicates significance when passAtK differs substantially', async () => {
+       const grader = createTrialsStatisticalGrader(500)
+       // Strong difference: all pass vs all fail
+       const runs: Record<string, TrialsComparisonRunData> = {
+         baseline: {
+           passAtK: 0,
+           k: 5,
+           trials: [
+             { trialNum: 1, output: 'A', trajectory: [], duration: 100, pass: false },
+             { trialNum: 2, output: 'B', trajectory: [], duration: 100, pass: false },
+             { trialNum: 3, output: 'C', trajectory: [], duration: 100, pass: false },
+             { trialNum: 4, output: 'D', trajectory: [], duration: 100, pass: false },
+             { trialNum: 5, output: 'E', trajectory: [], duration: 100, pass: false },
+           ],
+         },
+         variant: {
+           passAtK: 1.0,
+           k: 5,
+           trials: [
+             { trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true },
+             { trialNum: 2, output: 'Y', trajectory: [], duration: 100, pass: true },
+             { trialNum: 3, output: 'Z', trajectory: [], duration: 100, pass: true },
+             { trialNum: 4, output: 'W', trajectory: [], duration: 100, pass: true },
+             { trialNum: 5, output: 'V', trajectory: [], duration: 100, pass: true },
+           ],
+         },
+       }
+       const input = createMockTrialInput(runs)
+
+       const result = await grader(input)
+
+       expect(result.reasoning).toContain('clear separation')
+     })
+
+     test('handles empty trials array', async () => {
+       const grader = createTrialsStatisticalGrader(100)
+       const runs: Record<string, TrialsComparisonRunData> = {
+         baseline: { k: 3, trials: [] },
+         variant: {
+           k: 3,
+           trials: [{ trialNum: 1, output: 'X', trajectory: [], duration: 100, pass: true }],
+         },
+       }
+       const input = createMockTrialInput(runs)
+
+       const result = await grader(input)
+
+       // Should not throw
+       expect(result.rankings.length).toBe(2)
+     })
+   })
+
+   describe('grade function', () => {
+     test('works with default iterations', async () => {
+       const runs = createMockTrialRuns()
+       const input = createMockTrialInput(runs)
+
+       const result = await statisticalGrade(input)
+
+       expect(result.rankings).toBeDefined()
+       expect(result.rankings.length).toBe(2)
+     })
+   })
+ })
+
+ // ============================================================================
+ // Edge Case Tests
+ // ============================================================================
+
+ describe('trials comparison grader edge cases', () => {
+   test('handles single run gracefully', async () => {
+     const grader = createTrialsWeightedGrader()
+     const runs: Record<string, TrialsComparisonRunData> = {
+       only: { passAtK: 1.0, passExpK: 0.8, k: 3, trials: [] },
+     }
+     const input = createMockTrialInput(runs)
+
+     const result = await grader(input)
+
+     expect(result.rankings.length).toBe(1)
+     expect(result.rankings[0]?.rank).toBe(1)
+   })
+
+   test('handles zero passAtK and passExpK', async () => {
+     const grader = createTrialsWeightedGrader()
+     const runs: Record<string, TrialsComparisonRunData> = {
+       baseline: { passAtK: 0, passExpK: 0, k: 3, trials: [] },
+       variant: { passAtK: 0.5, passExpK: 0.2, k: 3, trials: [] },
+     }
+     const input = createMockTrialInput(runs)
+
+     const result = await grader(input)
+
+     expect(result.rankings[0]?.run).toBe('variant')
+   })
+
+   test('deterministic ordering for equal scores', async () => {
+     const grader = createTrialsWeightedGrader()
+     const runs = createMockTrialRuns({
+       baseline: { passAtK: 0.8, passExpK: 0.6 },
+       variant: { passAtK: 0.8, passExpK: 0.6 },
+     })
+     const input = createMockTrialInput(runs)
+
+     // Run multiple times to check stability
+     const results = await Promise.all([grader(input), grader(input), grader(input)])
+
+     // All should have same ordering
+     const orders = results.map((r) => r.rankings.map((rank) => rank.run).join(','))
+     expect(new Set(orders).size).toBe(1)
+   })
+
+   test('flakiness is clamped to non-negative', async () => {
+     // Edge case: passExpK > passAtK shouldn't happen but handle gracefully
+     const grader = createTrialsWeightedGrader({ capability: 0.0, reliability: 0.0, consistency: 1.0 })
+     const runs: Record<string, TrialsComparisonRunData> = {
+       baseline: { passAtK: 0.5, passExpK: 0.7, k: 3, trials: [] }, // Invalid but should work
+       variant: { passAtK: 0.8, passExpK: 0.8, k: 3, trials: [] },
+     }
+     const input = createMockTrialInput(runs)
+
+     const result = await grader(input)
+
+     // Both should have flakiness 0, so consistency score should be 1.0 for both
+     // Variant has higher capability/reliability so it wins on tiebreaker
+     expect(result.rankings).toBeDefined()
+   })
+ })
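
For readers skimming these tests: the fixture comments imply that flakiness is derived as pass@k minus pass^k, clamped at zero, and that the weighted grader blends capability, reliability, and consistency. A minimal sketch of that scoring under this reading (the actual trials-compare-weighted.ts implementation is not included in this diff):

```typescript
type Weights = { capability: number; reliability: number; consistency: number }

// Hedged reconstruction from the fixtures above: capability tracks passAtK,
// reliability tracks passExpK, and consistency rewards low flakiness,
// where flakiness = max(0, passAtK - passExpK).
const weightedScore = (passAtK: number, passExpK: number, w: Weights): number => {
  const flakiness = Math.max(0, passAtK - passExpK)
  return w.capability * passAtK + w.reliability * passExpK + w.consistency * (1 - flakiness)
}

// Baseline fixture (passAtK 0.9, passExpK 0.3, so flakiness 0.6) with the
// default weights {0.4, 0.4, 0.2}: 0.36 + 0.12 + 0.08 = 0.56
console.log(weightedScore(0.9, 0.3, { capability: 0.4, reliability: 0.4, consistency: 0.2 }))
```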