@plaited/agent-eval-harness 0.6.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -42,6 +42,8 @@ import type {
   TrajectoryInfo,
   TrajectoryRichness,
 } from '../schemas.ts'
+import { type CompareInputFormat, detectAndValidateFormat } from './compare-format-detection.ts'
+import { runTrialsCompare } from './compare-trials.ts'
 import type {
   CompareConfig,
   ComparisonGrader,
@@ -647,6 +649,7 @@ export const compare = async (args: string[]): Promise<void> => {
       strategy: { type: 'string', short: 's' },
       output: { type: 'string', short: 'o' },
       format: { type: 'string', short: 'f' },
+      'input-format': { type: 'string' },
       progress: { type: 'boolean', default: false },
       help: { type: 'boolean', short: 'h' },
     },
@@ -658,6 +661,7 @@ export const compare = async (args: string[]): Promise<void> => {
 Usage: agent-eval-harness compare [files...] [options]
 
 Compare multiple runs of the same prompts and generate aggregate report.
+Supports both CaptureResult (single-run) and TrialResult (multi-run reliability) formats.
 
 Arguments:
   files...  Result files to compare (positional, unlimited)
@@ -668,30 +672,47 @@ Options:
   -g, --grader      Path to custom grader (required if strategy=custom)
   -o, --output      Output file (default: stdout)
   -f, --format      Output format: json (default) or markdown
+  --input-format    Input format: auto (default), capture, or trials
   --progress        Show progress to stderr
   -h, --help        Show this help message
 
+Input Formats:
+  auto     Auto-detect from file content (default)
+  capture  CaptureResult format (trajectory/timing fields)
+  trials   TrialResult format (trials/k fields) for pass@k analysis
+
 Built-in Strategies:
-  weighted      Configurable weights for quality, latency, reliability
-                Customize via: COMPARE_QUALITY, COMPARE_LATENCY, COMPARE_RELIABILITY
-  statistical   Bootstrap sampling for confidence intervals
-                Customize via: COMPARE_BOOTSTRAP_ITERATIONS
+  For CaptureResult (capture format):
+    weighted      Configurable weights for quality, latency, reliability
+                  Env vars: COMPARE_QUALITY, COMPARE_LATENCY, COMPARE_RELIABILITY
+    statistical   Bootstrap sampling for confidence intervals
+                  Env var: COMPARE_BOOTSTRAP_ITERATIONS
+
+  For TrialResult (trials format):
+    weighted      Configurable weights for capability, reliability, consistency
+                  Env vars: COMPARE_CAPABILITY, COMPARE_RELIABILITY, COMPARE_CONSISTENCY
+    statistical   Bootstrap sampling for passAtK confidence intervals
+                  Env var: COMPARE_BOOTSTRAP_ITERATIONS
 
 Custom Grader:
   Must export 'grade' or 'compare' function with signature:
-  (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>
+  CaptureResult: (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>
+  TrialResult:   (params: TrialsComparisonGraderInput) => Promise<ComparisonGraderResult>
 
 Examples:
-  # Default: weighted strategy with JSON output
+  # Default: auto-detect format, weighted strategy, JSON output
   agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
 
+  # Explicit trials format for pass@k comparison
+  agent-eval-harness compare trials1.jsonl trials2.jsonl --input-format trials -o comparison.json
+
+  # Trials comparison with custom weights
+  COMPARE_CAPABILITY=0.5 COMPARE_RELIABILITY=0.3 COMPARE_CONSISTENCY=0.2 \\
+  agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
+
   # Statistical significance strategy
   agent-eval-harness compare run1.jsonl run2.jsonl --strategy statistical -o comparison.json
 
-  # Custom weights
-  COMPARE_QUALITY=0.7 COMPARE_LATENCY=0.2 COMPARE_RELIABILITY=0.1 \\
-  agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
-
   # Markdown report
   agent-eval-harness compare run1.jsonl run2.jsonl --format markdown -o report.md
@@ -749,19 +770,54 @@ Examples:
     process.exit(1)
   }
 
-  // Validate format (explicit format takes precedence, otherwise infer from extension)
+  // Validate output format (explicit format takes precedence, otherwise infer from extension)
   const format = inferFormat(values.output, values.format)
   if (values.format && !['json', 'markdown'].includes(values.format)) {
     console.error(`Error: Invalid format '${values.format}'. Use: json or markdown`)
     process.exit(1)
   }
 
-  await runCompare({
-    runs,
-    strategy,
-    graderPath: values.grader,
-    outputPath: values.output,
-    progress: values.progress,
-    format,
-  })
+  // Validate input format
+  const inputFormatArg = values['input-format']
+  if (inputFormatArg && !['auto', 'capture', 'trials'].includes(inputFormatArg)) {
+    console.error(`Error: Invalid input-format '${inputFormatArg}'. Use: auto, capture, or trials`)
+    process.exit(1)
+  }
+
+  // Detect or use specified input format
+  let inputFormat: CompareInputFormat
+  try {
+    if (inputFormatArg === 'capture') {
+      inputFormat = 'capture'
+    } else if (inputFormatArg === 'trials') {
+      inputFormat = 'trials'
+    } else {
+      // Auto-detect from file content
+      inputFormat = await detectAndValidateFormat(runs.map((r) => r.path))
+    }
+  } catch (error) {
+    console.error(`Error: ${error instanceof Error ? error.message : error}`)
+    process.exit(1)
+  }
+
+  // Route to appropriate comparison function based on input format
+  if (inputFormat === 'trials') {
+    await runTrialsCompare({
+      runs,
+      strategy,
+      graderPath: values.grader,
+      outputPath: values.output,
+      progress: values.progress,
+      format,
+    })
+  } else {
+    await runCompare({
+      runs,
+      strategy,
+      graderPath: values.grader,
+      outputPath: values.output,
+      progress: values.progress,
+      format,
+    })
+  }
 }
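
The `compare-format-detection.ts` module the CLI now imports is not included in this diff. Judging from the error messages exercised in the new tests further down ('Empty file', 'Invalid JSON', 'Unable to detect format'), the heuristic likely inspects the first non-empty JSONL line. A minimal sketch, assuming detection keys off the `trials`/`k` versus `trajectory`/`timing` fields seen in the test fixtures (the actual implementation may differ):

```ts
// Hypothetical sketch; the real compare-format-detection.ts is not in this diff.
export type CompareInputFormat = 'capture' | 'trials'

export const detectInputFormat = async (path: string): Promise<CompareInputFormat> => {
  const text = await Bun.file(path).text()
  // Use the first non-empty line of the JSONL file
  const line = text.split('\n').find((l) => l.trim().length > 0)
  if (!line) throw new Error(`Empty file: ${path}`)
  let parsed: Record<string, unknown>
  try {
    parsed = JSON.parse(line)
  } catch {
    throw new Error(`Invalid JSON in ${path}`)
  }
  // TrialResult rows carry trials/k; CaptureResult rows carry trajectory/timing
  if (Array.isArray(parsed.trials) && typeof parsed.k === 'number') return 'trials'
  if ('trajectory' in parsed && 'timing' in parsed) return 'capture'
  throw new Error(`Unable to detect format for ${path}`)
}
```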
@@ -52,6 +52,7 @@ export const runGrade = async (
     hint: extracted.hint,
     trajectory: extracted.trajectory,
     metadata: extracted.metadata,
+    cwd: extracted.cwd,
   })
 
   const graded: GradedResult = {
@@ -59,6 +60,11 @@ export const runGrade = async (
     score,
   }
 
+  // Merge outcome from grader if present
+  if (score.outcome) {
+    graded.outcome = score.outcome
+  }
+
   const icon = score.pass ? '✓' : '✗'
   logProgress(` ${icon} score=${score.score.toFixed(2)}`, progress)
 
@@ -10,7 +10,7 @@
  * @packageDocumentation
  */
 
-import type { GraderResult, TrajectoryStep } from '../schemas.ts'
+import type { GraderResult, TrajectoryStep, TrialEntry } from '../schemas.ts'
 
 /**
  * Raw output from the `run` command.
@@ -62,6 +62,8 @@ export type ExtractedResult = {
   toolErrors: boolean
   /** Optional metadata from original prompt */
   metadata?: Record<string, unknown>
+  /** Working directory path (optional, for git-based grading) */
+  cwd?: string
   /** Timing metadata */
   timing: {
     start: number
@@ -77,10 +79,13 @@ export type ExtractedResult = {
  *
  * @remarks
  * Adds grader score to extracted result.
+ * Outcome field is merged from grader result if present.
  */
 export type GradedResult = ExtractedResult & {
   /** Grader score */
   score: GraderResult
+  /** Outcome data from grader (if grader returned outcome) */
+  outcome?: Record<string, unknown>
 }
 
 /**
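
Together with the `runGrade` change above, this means a grader can attach an opaque `outcome` object that gets copied onto the graded record. An illustrative fragment (the keys inside `outcome` are hypothetical; the harness treats it as an opaque `Record<string, unknown>`, and the import path is a placeholder):

```ts
import type { ExtractedResult, GradedResult, GraderResult } from './types.ts' // path illustrative

declare const extracted: ExtractedResult // produced by the extract step

const score: GraderResult = {
  pass: true,
  score: 0.92,
  outcome: { filesChanged: 3, testsAdded: true }, // hypothetical git-derived signals
}

const graded: GradedResult = { ...extracted, score }
// Mirrors the merge in runGrade:
if (score.outcome) {
  graded.outcome = score.outcome
}
```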
@@ -267,3 +272,54 @@ export type ComparisonResult = {
   /** Optional reasoning */
   reasoning?: string
 }
+
+// ============================================================================
+// Trials Comparison Types
+// ============================================================================
+
+/**
+ * Run data for trials comparison.
+ *
+ * @remarks
+ * Contains the trials-specific metrics (passAtK, passExpK) plus
+ * the individual trial entries for deeper analysis.
+ */
+export type TrialsComparisonRunData = {
+  /** Simple pass rate: passes / k */
+  passRate?: number
+  /** pass@k: probability of at least one pass in k samples */
+  passAtK?: number
+  /** pass^k: probability of all k samples passing */
+  passExpK?: number
+  /** Number of trials (k) */
+  k: number
+  /** Individual trial results */
+  trials: TrialEntry[]
+}
+
+/**
+ * Input to trials comparison grader function.
+ *
+ * @remarks
+ * Provides all runs' trial results for a single prompt ID
+ * so the grader can compare capability and reliability.
+ */
+export type TrialsComparisonGraderInput = {
+  /** Test case identifier */
+  id: string
+  /** Original prompt input */
+  input: string | string[]
+  /** Grader context hint */
+  hint?: string
+  /** Results keyed by run label */
+  runs: Record<string, TrialsComparisonRunData>
+}
+
+/**
+ * Trials comparison grader function type.
+ *
+ * @remarks
+ * User-provided graders implement this interface to compare
+ * multiple runs of the same prompt using trials data.
+ */
+export type TrialsComparisonGrader = (params: TrialsComparisonGraderInput) => Promise<ComparisonGraderResult>
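
A custom grader targeting the trials format receives one `TrialsComparisonGraderInput` per prompt ID. A minimal sketch of one, assuming `ComparisonGraderResult` accepts per-run scores keyed by label plus optional reasoning (check `schemas.ts` for the exact shape, and adjust the import path to wherever the package exports these types):

```ts
import type { TrialsComparisonGrader } from '@plaited/agent-eval-harness'

// Rank runs by blending capability (pass@k) with consistency (pass^k).
// The 0.7/0.3 weights are illustrative, not the package's defaults.
export const compare: TrialsComparisonGrader = async ({ id, runs }) => {
  const scores: Record<string, number> = {}
  for (const [label, data] of Object.entries(runs)) {
    scores[label] = 0.7 * (data.passAtK ?? 0) + 0.3 * (data.passExpK ?? 0)
  }
  return {
    scores,
    reasoning: `Blended pass@k capability and pass^k consistency for ${id}`,
  }
}
```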
@@ -0,0 +1,142 @@
+/**
+ * Unit tests for compare format detection.
+ *
+ * @remarks
+ * Tests for auto-detecting CaptureResult vs TrialResult format.
+ *
+ * @packageDocumentation
+ */
+
+import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
+import { detectAndValidateFormat, detectInputFormat } from '../compare-format-detection.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const CAPTURE_RESULT = JSON.stringify({
+  id: 'test-001',
+  input: 'Hello',
+  output: 'Hi there',
+  trajectory: [{ type: 'message', content: 'Hi', timestamp: 1234567890 }],
+  timing: { start: 1234567890, end: 1234567891, total: 1, sessionCreation: 0 },
+  metadata: {},
+  toolErrors: false,
+})
+
+const TRIAL_RESULT = JSON.stringify({
+  id: 'test-001',
+  input: 'Hello',
+  k: 3,
+  passRate: 0.67,
+  passAtK: 0.9,
+  passExpK: 0.3,
+  trials: [
+    { trialNum: 1, output: 'Hi', trajectory: [], duration: 100, pass: true, score: 1.0 },
+    { trialNum: 2, output: 'Hello', trajectory: [], duration: 120, pass: true, score: 0.8 },
+    { trialNum: 3, output: 'Error', trajectory: [], duration: 150, pass: false, score: 0.2 },
+  ],
+})
+
+const tempDir = `${import.meta.dir}/.test-tmp/format-detection`
+
+beforeAll(async () => {
+  await Bun.$`mkdir -p ${tempDir}`
+})
+
+afterAll(async () => {
+  await Bun.$`rm -rf ${tempDir}`
+})
+
+// ============================================================================
+// detectInputFormat Tests
+// ============================================================================
+
+describe('detectInputFormat', () => {
+  test('detects CaptureResult format', async () => {
+    const path = `${tempDir}/capture.jsonl`
+    await Bun.write(path, `${CAPTURE_RESULT}\n`)
+
+    const format = await detectInputFormat(path)
+
+    expect(format).toBe('capture')
+  })
+
+  test('detects TrialResult format', async () => {
+    const path = `${tempDir}/trial.jsonl`
+    await Bun.write(path, `${TRIAL_RESULT}\n`)
+
+    const format = await detectInputFormat(path)
+
+    expect(format).toBe('trials')
+  })
+
+  test('throws on empty file', async () => {
+    const path = `${tempDir}/empty.jsonl`
+    await Bun.write(path, '')
+
+    await expect(detectInputFormat(path)).rejects.toThrow('Empty file')
+  })
+
+  test('throws on invalid JSON', async () => {
+    const path = `${tempDir}/invalid.jsonl`
+    await Bun.write(path, 'not json\n')
+
+    await expect(detectInputFormat(path)).rejects.toThrow('Invalid JSON')
+  })
+
+  test('throws on unrecognized format', async () => {
+    const path = `${tempDir}/unknown.jsonl`
+    await Bun.write(path, `${JSON.stringify({ id: 'test', foo: 'bar' })}\n`)
+
+    await expect(detectInputFormat(path)).rejects.toThrow('Unable to detect format')
+  })
+
+  test('ignores empty lines and uses first non-empty line', async () => {
+    const path = `${tempDir}/with-empty.jsonl`
+    await Bun.write(path, `\n\n${CAPTURE_RESULT}\n`)
+
+    const format = await detectInputFormat(path)
+
+    expect(format).toBe('capture')
+  })
+})
+
+// ============================================================================
+// detectAndValidateFormat Tests
+// ============================================================================
+
+describe('detectAndValidateFormat', () => {
+  test('validates all files have same format', async () => {
+    const path1 = `${tempDir}/capture1.jsonl`
+    const path2 = `${tempDir}/capture2.jsonl`
+    await Bun.write(path1, `${CAPTURE_RESULT}\n`)
+    await Bun.write(path2, `${CAPTURE_RESULT}\n`)
+
+    const format = await detectAndValidateFormat([path1, path2])
+
+    expect(format).toBe('capture')
+  })
+
+  test('throws on format mismatch', async () => {
+    const capturePath = `${tempDir}/capture-mixed.jsonl`
+    const trialPath = `${tempDir}/trial-mixed.jsonl`
+    await Bun.write(capturePath, `${CAPTURE_RESULT}\n`)
+    await Bun.write(trialPath, `${TRIAL_RESULT}\n`)
+
+    await expect(detectAndValidateFormat([capturePath, trialPath])).rejects.toThrow('Format mismatch')
+  })
+
+  test('throws on empty file list', async () => {
+    await expect(detectAndValidateFormat([])).rejects.toThrow('No files provided')
+  })
+
+  test('works with single file', async () => {
+    const path = `${tempDir}/single-trial.jsonl`
+    await Bun.write(path, `${TRIAL_RESULT}\n`)
+
+    const format = await detectAndValidateFormat([path])
+
+    expect(format).toBe('trials')
+  })
+})
@@ -0,0 +1,277 @@
+/**
+ * Unit tests for trials comparison module.
+ *
+ * @remarks
+ * Tests for runTrialsCompare and supporting functions.
+ *
+ * @packageDocumentation
+ */
+
+import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
+import { buildTrialsIndex, runTrialsCompare } from '../compare-trials.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const createTrialResult = (id: string, passAtK: number, passExpK: number, k: number = 3) => ({
+  id,
+  input: `Prompt for ${id}`,
+  k,
+  passRate: passAtK,
+  passAtK,
+  passExpK,
+  trials: Array.from({ length: k }, (_, i) => ({
+    trialNum: i + 1,
+    output: `Output ${i + 1}`,
+    trajectory: [],
+    duration: 100 + i * 10,
+    pass: Math.random() < passAtK,
+    score: passAtK,
+  })),
+})
+
+const tempDir = `${import.meta.dir}/.test-tmp/compare-trials`
+
+beforeAll(async () => {
+  await Bun.$`mkdir -p ${tempDir}`
+})
+
+afterAll(async () => {
+  await Bun.$`rm -rf ${tempDir}`
+})
+
+// ============================================================================
+// buildTrialsIndex Tests
+// ============================================================================
+
+describe('buildTrialsIndex', () => {
+  test('builds index from JSONL file', async () => {
+    const path = `${tempDir}/trials-index.jsonl`
+    const trial1 = createTrialResult('test-001', 0.9, 0.3)
+    const trial2 = createTrialResult('test-002', 0.8, 0.6)
+    await Bun.write(path, [JSON.stringify(trial1), JSON.stringify(trial2)].join('\n'))
+
+    const index = await buildTrialsIndex(path)
+
+    expect(index.size).toBe(2)
+    expect(index.get('test-001')?.passAtK).toBe(0.9)
+    expect(index.get('test-002')?.passExpK).toBe(0.6)
+  })
+
+  test('handles empty file', async () => {
+    const path = `${tempDir}/empty-trials.jsonl`
+    await Bun.write(path, '')
+
+    const index = await buildTrialsIndex(path)
+
+    expect(index.size).toBe(0)
+  })
+
+  test('throws on invalid JSON', async () => {
+    const path = `${tempDir}/invalid-trials.jsonl`
+    await Bun.write(path, 'not json\n')
+
+    await expect(buildTrialsIndex(path)).rejects.toThrow()
+  })
+})
+
+// ============================================================================
+// runTrialsCompare Tests
+// ============================================================================
+
+describe('runTrialsCompare', () => {
+  test('compares two trial runs and produces report', async () => {
+    const run1Path = `${tempDir}/run1.jsonl`
+    const run2Path = `${tempDir}/run2.jsonl`
+
+    const trial1a = createTrialResult('test-001', 0.9, 0.7)
+    const trial1b = createTrialResult('test-002', 0.8, 0.5)
+    const trial2a = createTrialResult('test-001', 0.95, 0.9)
+    const trial2b = createTrialResult('test-002', 0.6, 0.4)
+
+    await Bun.write(run1Path, [JSON.stringify(trial1a), JSON.stringify(trial1b)].join('\n'))
+    await Bun.write(run2Path, [JSON.stringify(trial2a), JSON.stringify(trial2b)].join('\n'))
+
+    const outputPath = `${tempDir}/comparison.json`
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'baseline', path: run1Path },
+        { label: 'variant', path: run2Path },
+      ],
+      outputPath,
+      progress: false,
+    })
+
+    expect(report.meta.inputFormat).toBe('trials')
+    expect(report.meta.runs).toEqual(['baseline', 'variant'])
+    expect(report.meta.promptCount).toBe(2)
+    expect(report.capability).toBeDefined()
+    expect(report.reliability).toBeDefined()
+    expect(report.flakiness).toBeDefined()
+    expect(report.headToHead.capability.length).toBeGreaterThan(0)
+
+    // Verify output file was written
+    const outputExists = await Bun.file(outputPath).exists()
+    expect(outputExists).toBe(true)
+  })
+
+  test('throws with fewer than 2 runs', async () => {
+    const run1Path = `${tempDir}/single-run.jsonl`
+    await Bun.write(run1Path, JSON.stringify(createTrialResult('test-001', 0.9, 0.7)))
+
+    await expect(
+      runTrialsCompare({
+        runs: [{ label: 'only', path: run1Path }],
+        progress: false,
+      }),
+    ).rejects.toThrow('At least 2 runs required')
+  })
+
+  test('skips prompts only in one run', async () => {
+    const run1Path = `${tempDir}/partial1.jsonl`
+    const run2Path = `${tempDir}/partial2.jsonl`
+
+    // Only run1 has test-001
+    const trial1a = createTrialResult('test-001', 0.9, 0.7)
+    // Both have test-002
+    const trial1b = createTrialResult('test-002', 0.8, 0.5)
+    const trial2b = createTrialResult('test-002', 0.6, 0.4)
+
+    await Bun.write(run1Path, [JSON.stringify(trial1a), JSON.stringify(trial1b)].join('\n'))
+    await Bun.write(run2Path, JSON.stringify(trial2b))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'run1', path: run1Path },
+        { label: 'run2', path: run2Path },
+      ],
+      progress: false,
+    })
+
+    // Only test-002 should be compared (both runs have it)
+    expect(report.headToHead.overall.length).toBeGreaterThan(0)
+    // Per-prompt should only have test-002
+    const perPromptIds = report.perPrompt?.map((p) => p.id) ?? []
+    expect(perPromptIds).toContain('test-002')
+    expect(perPromptIds).not.toContain('test-001')
+  })
+
+  test('generates markdown output when format is markdown', async () => {
+    const run1Path = `${tempDir}/md-run1.jsonl`
+    const run2Path = `${tempDir}/md-run2.jsonl`
+    const outputPath = `${tempDir}/report.md`
+
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.8, 0.6)
+
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+
+    await runTrialsCompare({
+      runs: [
+        { label: 'agent1', path: run1Path },
+        { label: 'agent2', path: run2Path },
+      ],
+      outputPath,
+      format: 'markdown',
+      progress: false,
+    })
+
+    const content = await Bun.file(outputPath).text()
+    expect(content).toContain('# Trials Comparison Report')
+    expect(content).toContain('## Capability')
+    expect(content).toContain('## Reliability')
+    expect(content).toContain('## Flakiness')
+    expect(content).toContain('agent1')
+    expect(content).toContain('agent2')
+  })
+
+  test('uses statistical strategy when specified', async () => {
+    const run1Path = `${tempDir}/stat-run1.jsonl`
+    const run2Path = `${tempDir}/stat-run2.jsonl`
+
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.5, 0.3)
+
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'better', path: run1Path },
+        { label: 'worse', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Report should be generated without error
+    expect(report.meta.runs).toEqual(['better', 'worse'])
+  })
+
+  test('computes correct capability metrics', async () => {
+    const run1Path = `${tempDir}/cap-run1.jsonl`
+
+    // Create 3 prompts with known passAtK values
+    const trials = [
+      createTrialResult('p1', 1.0, 0.8), // passAtK = 1.0
+      createTrialResult('p2', 0.5, 0.3), // passAtK = 0.5
+      createTrialResult('p3', 0.8, 0.6), // passAtK = 0.8
+    ]
+    // Average passAtK = (1.0 + 0.5 + 0.8) / 3 = 0.767
+    // Sorted: 0.5, 0.8, 1.0 -> median = 0.8
+
+    await Bun.write(run1Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const run2Path = `${tempDir}/cap-run2.jsonl`
+    await Bun.write(run2Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'test', path: run1Path },
+        { label: 'test2', path: run2Path },
+      ],
+      progress: false,
+    })
+
+    const cap = report.capability.test
+    expect(cap).toBeDefined()
+    // Average should be approximately 0.767
+    expect(cap?.avgPassAtK).toBeCloseTo(0.767, 2)
+    // Median of [0.5, 0.8, 1.0] = 0.8
+    expect(cap?.medianPassAtK).toBeCloseTo(0.8, 2)
+  })
+
+  test('identifies flaky prompts correctly', async () => {
+    const run1Path = `${tempDir}/flaky-run1.jsonl`
+
+    // Create prompts with varying flakiness
+    const trials = [
+      createTrialResult('consistent', 0.9, 0.9), // flakiness = 0
+      createTrialResult('flaky', 0.9, 0.1), // flakiness = 0.8
+      createTrialResult('moderate', 0.7, 0.5), // flakiness = 0.2
+    ]
+
+    await Bun.write(run1Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const run2Path = `${tempDir}/flaky-run2.jsonl`
+    await Bun.write(run2Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'test', path: run1Path },
+        { label: 'test2', path: run2Path },
+      ],
+      progress: false,
+    })
+
+    const flak = report.flakiness.test
+    expect(flak).toBeDefined()
+    // 2 prompts have non-zero flakiness
+    expect(flak?.flakyPromptCount).toBe(2)
+    // Top flaky should include 'flaky' prompt
+    const topFlakyIds = flak?.topFlakyPrompts.map((p) => p.id) ?? []
+    expect(topFlakyIds).toContain('flaky')
+  })
+})
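
The fixture comments above imply the flakiness metric is `passAtK - passExpK` (0.9/0.9 → 0, 0.9/0.1 → 0.8, 0.7/0.5 → 0.2). Under an independence assumption with per-trial pass probability p, the three metrics relate as in this worked sketch (the package's own estimator may differ):

```ts
// Worked illustration of the trials metrics for one prompt.
// For k independent trials with per-trial pass probability p:
//   passRate ≈ p, passAtK = 1 - (1 - p)^k, passExpK = p^k
const trialOutcomes = [true, true, false] // k = 3 observed trials
const k = trialOutcomes.length
const passes = trialOutcomes.filter(Boolean).length

const passRate = passes / k // 0.667
const p = passRate
const passAtK = 1 - (1 - p) ** k // ≈ 0.963: at least one of k passes
const passExpK = p ** k // ≈ 0.296: all k pass
const flakiness = passAtK - passExpK // ≈ 0.667: gap between best and worst case

console.log({ passRate, passAtK, passExpK, flakiness })
```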
@@ -47,6 +47,7 @@ const resolvePath = (path: string): string => {
  * The metadata field contains arbitrary key-value pairs from the original
  * prompt JSONL (e.g., category, difficulty, tags). Use this to implement
  * category-specific grading logic or filter calibration samples.
+ * The cwd field provides the working directory path for git-based outcome detection.
  */
 type ExecGraderInput = {
   input: string | string[]
@@ -54,6 +55,7 @@ type ExecGraderInput = {
   hint?: string
   trajectory?: TrajectoryStep[]
   metadata?: Record<string, unknown>
+  cwd?: string
 }
 
 /**
@@ -73,6 +75,8 @@ const createExecGrader = (execPath: string): Grader => {
       output: params.output,
       hint: params.hint,
       trajectory: params.trajectory,
+      metadata: params.metadata,
+      cwd: params.cwd,
     }
 
     const inputJson = JSON.stringify(input)
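
The exec grader now forwards `metadata` and `cwd` in the serialized payload (`JSON.stringify(input)`); how that payload reaches the external process (stdin versus an argument) is not visible in this diff. Assuming a stdin-in/stdout-out contract, a git-based outcome grader could look like this sketch:

```ts
#!/usr/bin/env bun
// Hypothetical exec grader: reads an ExecGraderInput JSON payload on stdin
// and prints a GraderResult JSON to stdout. The stdin/stdout protocol is an
// assumption; consult the harness docs for the actual contract.
const input = JSON.parse(await Bun.stdin.text()) as {
  input: string | string[]
  output: string
  hint?: string
  metadata?: Record<string, unknown>
  cwd?: string
}

let outcome: Record<string, unknown> | undefined
if (input.cwd) {
  // Use the working directory for git-based outcome detection
  const diff = await Bun.$`git -C ${input.cwd} diff --stat`.text()
  outcome = { changedFiles: diff.trim().split('\n').length }
}

const pass = input.output.length > 0 // placeholder pass criterion
console.log(JSON.stringify({ pass, score: pass ? 1 : 0, outcome }))
```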