@plaited/agent-eval-harness 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/package.json +1 -1
- package/src/graders/tests/trials-compare-graders.spec.ts +358 -0
- package/src/graders/trials-compare-statistical.ts +188 -0
- package/src/graders/trials-compare-weighted.ts +128 -0
- package/src/graders.ts +21 -1
- package/src/pipeline/compare-format-detection.ts +100 -0
- package/src/pipeline/compare-trials.ts +596 -0
- package/src/pipeline/compare.ts +75 -19
- package/src/pipeline/pipeline.types.ts +52 -1
- package/src/pipeline/tests/compare-format-detection.spec.ts +142 -0
- package/src/pipeline/tests/compare-trials.spec.ts +277 -0
- package/src/schemas/schemas.ts +151 -0
- package/src/schemas.ts +13 -0
package/src/pipeline/compare.ts
CHANGED
@@ -42,6 +42,8 @@ import type {
   TrajectoryInfo,
   TrajectoryRichness,
 } from '../schemas.ts'
+import { type CompareInputFormat, detectAndValidateFormat } from './compare-format-detection.ts'
+import { runTrialsCompare } from './compare-trials.ts'
 import type {
   CompareConfig,
   ComparisonGrader,
@@ -647,6 +649,7 @@ export const compare = async (args: string[]): Promise<void> => {
       strategy: { type: 'string', short: 's' },
       output: { type: 'string', short: 'o' },
       format: { type: 'string', short: 'f' },
+      'input-format': { type: 'string' },
       progress: { type: 'boolean', default: false },
       help: { type: 'boolean', short: 'h' },
     },
@@ -658,6 +661,7 @@ export const compare = async (args: string[]): Promise<void> => {
 Usage: agent-eval-harness compare [files...] [options]
 
 Compare multiple runs of the same prompts and generate aggregate report.
+Supports both CaptureResult (single-run) and TrialResult (multi-run reliability) formats.
 
 Arguments:
   files...    Result files to compare (positional, unlimited)
@@ -668,30 +672,47 @@ Options:
   -g, --grader      Path to custom grader (required if strategy=custom)
   -o, --output      Output file (default: stdout)
   -f, --format      Output format: json (default) or markdown
+  --input-format    Input format: auto (default), capture, or trials
   --progress        Show progress to stderr
   -h, --help        Show this help message
 
+Input Formats:
+  auto      Auto-detect from file content (default)
+  capture   CaptureResult format (trajectory/timing fields)
+  trials    TrialResult format (trials/k fields) for pass@k analysis
+
 Built-in Strategies:
-  weighted     Configurable weights for quality, latency, reliability
-               Env vars: COMPARE_QUALITY, COMPARE_LATENCY, COMPARE_RELIABILITY
-  statistical  Bootstrap sampling for confidence intervals
-               Env var: COMPARE_BOOTSTRAP_ITERATIONS
+  For CaptureResult (capture format):
+    weighted     Configurable weights for quality, latency, reliability
+                 Env vars: COMPARE_QUALITY, COMPARE_LATENCY, COMPARE_RELIABILITY
+    statistical  Bootstrap sampling for confidence intervals
+                 Env var: COMPARE_BOOTSTRAP_ITERATIONS
+
+  For TrialResult (trials format):
+    weighted     Configurable weights for capability, reliability, consistency
+                 Env vars: COMPARE_CAPABILITY, COMPARE_RELIABILITY, COMPARE_CONSISTENCY
+    statistical  Bootstrap sampling for passAtK confidence intervals
+                 Env var: COMPARE_BOOTSTRAP_ITERATIONS
 
 Custom Grader:
   Must export 'grade' or 'compare' function with signature:
-    (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>
+    CaptureResult: (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>
+    TrialResult: (params: TrialsComparisonGraderInput) => Promise<ComparisonGraderResult>
 
 Examples:
-  # Default: weighted strategy
+  # Default: auto-detect format, weighted strategy, JSON output
   agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
 
+  # Explicit trials format for pass@k comparison
+  agent-eval-harness compare trials1.jsonl trials2.jsonl --input-format trials -o comparison.json
+
+  # Trials comparison with custom weights
+  COMPARE_CAPABILITY=0.5 COMPARE_RELIABILITY=0.3 COMPARE_CONSISTENCY=0.2 \\
+    agent-eval-harness compare trials1.jsonl trials2.jsonl -o comparison.json
+
   # Statistical significance strategy
   agent-eval-harness compare run1.jsonl run2.jsonl --strategy statistical -o comparison.json
 
-  # Custom weights
-  COMPARE_QUALITY=0.7 COMPARE_LATENCY=0.2 COMPARE_RELIABILITY=0.1 \\
-    agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
-
   # Markdown report
   agent-eval-harness compare run1.jsonl run2.jsonl --format markdown -o report.md
@@ -749,19 +770,54 @@ Examples:
     process.exit(1)
   }
 
-  // Validate format (explicit format takes precedence, otherwise infer from extension)
+  // Validate output format (explicit format takes precedence, otherwise infer from extension)
   const format = inferFormat(values.output, values.format)
   if (values.format && !['json', 'markdown'].includes(values.format)) {
     console.error(`Error: Invalid format '${values.format}'. Use: json or markdown`)
     process.exit(1)
   }
 
-  await runCompare({
-    runs,
-    strategy,
-    graderPath: values.grader,
-    outputPath: values.output,
-    progress: values.progress,
-    format,
-  })
+  // Validate input format
+  const inputFormatArg = values['input-format']
+  if (inputFormatArg && !['auto', 'capture', 'trials'].includes(inputFormatArg)) {
+    console.error(`Error: Invalid input-format '${inputFormatArg}'. Use: auto, capture, or trials`)
+    process.exit(1)
+  }
+
+  // Detect or use specified input format
+  let inputFormat: CompareInputFormat
+  try {
+    if (inputFormatArg === 'capture') {
+      inputFormat = 'capture'
+    } else if (inputFormatArg === 'trials') {
+      inputFormat = 'trials'
+    } else {
+      // Auto-detect from file content
+      inputFormat = await detectAndValidateFormat(runs.map((r) => r.path))
+    }
+  } catch (error) {
+    console.error(`Error: ${error instanceof Error ? error.message : error}`)
+    process.exit(1)
+  }
+
+  // Route to appropriate comparison function based on input format
+  if (inputFormat === 'trials') {
+    await runTrialsCompare({
+      runs,
+      strategy,
+      graderPath: values.grader,
+      outputPath: values.output,
+      progress: values.progress,
+      format,
+    })
+  } else {
+    await runCompare({
+      runs,
+      strategy,
+      graderPath: values.grader,
+      outputPath: values.output,
+      progress: values.progress,
+      format,
+    })
+  }
 }
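Two of the new help strings above reference pass@k analysis. For intuition (my gloss, not code from this package): if each trial passes independently with probability p, then over k trials pass@k = 1 - (1 - p)^k and pass^k = p^k, matching the field docs in pipeline.types.ts below. A minimal sketch of those closed forms, under that independence assumption:

```ts
// Illustrative only: closed-form pass@k / pass^k under an independence
// assumption. The harness stores these per prompt; it need not derive them this way.
const passAtK = (p: number, k: number): number => 1 - (1 - p) ** k // at least one pass in k
const passExpK = (p: number, k: number): number => p ** k // all k trials pass

console.log(passAtK(0.5, 3)) // 0.875
console.log(passExpK(0.5, 3)) // 0.125
```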
package/src/pipeline/pipeline.types.ts
CHANGED
@@ -10,7 +10,7 @@
  * @packageDocumentation
  */
 
-import type { GraderResult, TrajectoryStep } from '../schemas.ts'
+import type { GraderResult, TrajectoryStep, TrialEntry } from '../schemas.ts'
 
 /**
  * Raw output from the `run` command.
@@ -272,3 +272,54 @@ export type ComparisonResult = {
   /** Optional reasoning */
   reasoning?: string
 }
+
+// ============================================================================
+// Trials Comparison Types
+// ============================================================================
+
+/**
+ * Run data for trials comparison.
+ *
+ * @remarks
+ * Contains the trials-specific metrics (passAtK, passExpK) plus
+ * the individual trial entries for deeper analysis.
+ */
+export type TrialsComparisonRunData = {
+  /** Simple pass rate: passes / k */
+  passRate?: number
+  /** pass@k: probability of at least one pass in k samples */
+  passAtK?: number
+  /** pass^k: probability of all k samples passing */
+  passExpK?: number
+  /** Number of trials (k) */
+  k: number
+  /** Individual trial results */
+  trials: TrialEntry[]
+}
+
+/**
+ * Input to trials comparison grader function.
+ *
+ * @remarks
+ * Provides all runs' trial results for a single prompt ID
+ * so the grader can compare capability and reliability.
+ */
+export type TrialsComparisonGraderInput = {
+  /** Test case identifier */
+  id: string
+  /** Original prompt input */
+  input: string | string[]
+  /** Grader context hint */
+  hint?: string
+  /** Results keyed by run label */
+  runs: Record<string, TrialsComparisonRunData>
+}
+
+/**
+ * Trials comparison grader function type.
+ *
+ * @remarks
+ * User-provided graders implement this interface to compare
+ * multiple runs of the same prompt using trials data.
+ */
+export type TrialsComparisonGrader = (params: TrialsComparisonGraderInput) => Promise<ComparisonGraderResult>
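Given these types, a custom trials grader (loaded via -g with --strategy custom, per the help text above) might look like the sketch below. The input shape comes from TrialsComparisonGraderInput; the result fields (winner, scores, reasoning) and the import path are assumptions, since ComparisonGraderResult itself is not shown in this diff.

```ts
// my-trials-grader.ts: a minimal sketch, not shipped code.
// Assumed: ComparisonGraderResult accepts winner/scores/reasoning fields,
// and the input type is exported from the package root.
import type { TrialsComparisonGraderInput } from '@plaited/agent-eval-harness'

export const compare = async ({ id, runs }: TrialsComparisonGraderInput) => {
  // Rank runs by pass@k, falling back to plain pass rate when absent.
  const scores: Record<string, number> = {}
  for (const [label, data] of Object.entries(runs)) {
    scores[label] = data.passAtK ?? data.passRate ?? 0
  }
  const winner = Object.entries(scores).sort(([, a], [, b]) => b - a)[0]?.[0]
  return { winner, scores, reasoning: `Ranked runs for ${id} by pass@k` }
}
```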
package/src/pipeline/tests/compare-format-detection.spec.ts
ADDED
@@ -0,0 +1,142 @@
+/**
+ * Unit tests for compare format detection.
+ *
+ * @remarks
+ * Tests for auto-detecting CaptureResult vs TrialResult format.
+ *
+ * @packageDocumentation
+ */
+
+import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
+import { detectAndValidateFormat, detectInputFormat } from '../compare-format-detection.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const CAPTURE_RESULT = JSON.stringify({
+  id: 'test-001',
+  input: 'Hello',
+  output: 'Hi there',
+  trajectory: [{ type: 'message', content: 'Hi', timestamp: 1234567890 }],
+  timing: { start: 1234567890, end: 1234567891, total: 1, sessionCreation: 0 },
+  metadata: {},
+  toolErrors: false,
+})
+
+const TRIAL_RESULT = JSON.stringify({
+  id: 'test-001',
+  input: 'Hello',
+  k: 3,
+  passRate: 0.67,
+  passAtK: 0.9,
+  passExpK: 0.3,
+  trials: [
+    { trialNum: 1, output: 'Hi', trajectory: [], duration: 100, pass: true, score: 1.0 },
+    { trialNum: 2, output: 'Hello', trajectory: [], duration: 120, pass: true, score: 0.8 },
+    { trialNum: 3, output: 'Error', trajectory: [], duration: 150, pass: false, score: 0.2 },
+  ],
+})
+
+const tempDir = `${import.meta.dir}/.test-tmp/format-detection`
+
+beforeAll(async () => {
+  await Bun.$`mkdir -p ${tempDir}`
+})
+
+afterAll(async () => {
+  await Bun.$`rm -rf ${tempDir}`
+})
+
+// ============================================================================
+// detectInputFormat Tests
+// ============================================================================
+
+describe('detectInputFormat', () => {
+  test('detects CaptureResult format', async () => {
+    const path = `${tempDir}/capture.jsonl`
+    await Bun.write(path, `${CAPTURE_RESULT}\n`)
+
+    const format = await detectInputFormat(path)
+
+    expect(format).toBe('capture')
+  })
+
+  test('detects TrialResult format', async () => {
+    const path = `${tempDir}/trial.jsonl`
+    await Bun.write(path, `${TRIAL_RESULT}\n`)
+
+    const format = await detectInputFormat(path)
+
+    expect(format).toBe('trials')
+  })
+
+  test('throws on empty file', async () => {
+    const path = `${tempDir}/empty.jsonl`
+    await Bun.write(path, '')
+
+    await expect(detectInputFormat(path)).rejects.toThrow('Empty file')
+  })
+
+  test('throws on invalid JSON', async () => {
+    const path = `${tempDir}/invalid.jsonl`
+    await Bun.write(path, 'not json\n')
+
+    await expect(detectInputFormat(path)).rejects.toThrow('Invalid JSON')
+  })
+
+  test('throws on unrecognized format', async () => {
+    const path = `${tempDir}/unknown.jsonl`
+    await Bun.write(path, `${JSON.stringify({ id: 'test', foo: 'bar' })}\n`)
+
+    await expect(detectInputFormat(path)).rejects.toThrow('Unable to detect format')
+  })
+
+  test('ignores empty lines and uses first non-empty line', async () => {
+    const path = `${tempDir}/with-empty.jsonl`
+    await Bun.write(path, `\n\n${CAPTURE_RESULT}\n`)
+
+    const format = await detectInputFormat(path)
+
+    expect(format).toBe('capture')
+  })
+})
+
+// ============================================================================
+// detectAndValidateFormat Tests
+// ============================================================================
+
+describe('detectAndValidateFormat', () => {
+  test('validates all files have same format', async () => {
+    const path1 = `${tempDir}/capture1.jsonl`
+    const path2 = `${tempDir}/capture2.jsonl`
+    await Bun.write(path1, `${CAPTURE_RESULT}\n`)
+    await Bun.write(path2, `${CAPTURE_RESULT}\n`)
+
+    const format = await detectAndValidateFormat([path1, path2])
+
+    expect(format).toBe('capture')
+  })
+
+  test('throws on format mismatch', async () => {
+    const capturePath = `${tempDir}/capture-mixed.jsonl`
+    const trialPath = `${tempDir}/trial-mixed.jsonl`
+    await Bun.write(capturePath, `${CAPTURE_RESULT}\n`)
+    await Bun.write(trialPath, `${TRIAL_RESULT}\n`)
+
+    await expect(detectAndValidateFormat([capturePath, trialPath])).rejects.toThrow('Format mismatch')
+  })
+
+  test('throws on empty file list', async () => {
+    await expect(detectAndValidateFormat([])).rejects.toThrow('No files provided')
+  })
+
+  test('works with single file', async () => {
+    const path = `${tempDir}/single-trial.jsonl`
+    await Bun.write(path, `${TRIAL_RESULT}\n`)
+
+    const format = await detectAndValidateFormat([path])
+
+    expect(format).toBe('trials')
+  })
+})
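compare-format-detection.ts itself is not included in this diff, but the spec above pins down its contract. A minimal sketch consistent with those tests (field heuristics and error messages taken from the test expectations; everything else is an assumption):

```ts
// Sketch satisfying the spec above; not the shipped implementation.
export type CompareInputFormat = 'capture' | 'trials'

export const detectInputFormat = async (path: string): Promise<CompareInputFormat> => {
  const text = await Bun.file(path).text()
  // Use the first non-empty line of the JSONL file.
  const line = text.split('\n').find((l) => l.trim().length > 0)
  if (!line) throw new Error(`Empty file: ${path}`)
  let record: Record<string, unknown>
  try {
    record = JSON.parse(line)
  } catch {
    throw new Error(`Invalid JSON in ${path}`)
  }
  // TrialResult rows carry trials/k; CaptureResult rows carry trajectory/timing.
  if ('trials' in record && 'k' in record) return 'trials'
  if ('trajectory' in record && 'timing' in record) return 'capture'
  throw new Error(`Unable to detect format: ${path}`)
}

export const detectAndValidateFormat = async (paths: string[]): Promise<CompareInputFormat> => {
  if (paths.length === 0) throw new Error('No files provided')
  const formats = await Promise.all(paths.map(detectInputFormat))
  if (formats.some((f) => f !== formats[0])) throw new Error('Format mismatch: all inputs must share one format')
  return formats[0]
}
```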
package/src/pipeline/tests/compare-trials.spec.ts
ADDED
@@ -0,0 +1,277 @@
+/**
+ * Unit tests for trials comparison module.
+ *
+ * @remarks
+ * Tests for runTrialsCompare and supporting functions.
+ *
+ * @packageDocumentation
+ */
+
+import { afterAll, beforeAll, describe, expect, test } from 'bun:test'
+import { buildTrialsIndex, runTrialsCompare } from '../compare-trials.ts'
+
+// ============================================================================
+// Test Fixtures
+// ============================================================================
+
+const createTrialResult = (id: string, passAtK: number, passExpK: number, k: number = 3) => ({
+  id,
+  input: `Prompt for ${id}`,
+  k,
+  passRate: passAtK,
+  passAtK,
+  passExpK,
+  trials: Array.from({ length: k }, (_, i) => ({
+    trialNum: i + 1,
+    output: `Output ${i + 1}`,
+    trajectory: [],
+    duration: 100 + i * 10,
+    pass: Math.random() < passAtK,
+    score: passAtK,
+  })),
+})
+
+const tempDir = `${import.meta.dir}/.test-tmp/compare-trials`
+
+beforeAll(async () => {
+  await Bun.$`mkdir -p ${tempDir}`
+})
+
+afterAll(async () => {
+  await Bun.$`rm -rf ${tempDir}`
+})
+
+// ============================================================================
+// buildTrialsIndex Tests
+// ============================================================================
+
+describe('buildTrialsIndex', () => {
+  test('builds index from JSONL file', async () => {
+    const path = `${tempDir}/trials-index.jsonl`
+    const trial1 = createTrialResult('test-001', 0.9, 0.3)
+    const trial2 = createTrialResult('test-002', 0.8, 0.6)
+    await Bun.write(path, [JSON.stringify(trial1), JSON.stringify(trial2)].join('\n'))
+
+    const index = await buildTrialsIndex(path)
+
+    expect(index.size).toBe(2)
+    expect(index.get('test-001')?.passAtK).toBe(0.9)
+    expect(index.get('test-002')?.passExpK).toBe(0.6)
+  })
+
+  test('handles empty file', async () => {
+    const path = `${tempDir}/empty-trials.jsonl`
+    await Bun.write(path, '')
+
+    const index = await buildTrialsIndex(path)
+
+    expect(index.size).toBe(0)
+  })
+
+  test('throws on invalid JSON', async () => {
+    const path = `${tempDir}/invalid-trials.jsonl`
+    await Bun.write(path, 'not json\n')
+
+    await expect(buildTrialsIndex(path)).rejects.toThrow()
+  })
+})
+
+// ============================================================================
+// runTrialsCompare Tests
+// ============================================================================
+
+describe('runTrialsCompare', () => {
+  test('compares two trial runs and produces report', async () => {
+    const run1Path = `${tempDir}/run1.jsonl`
+    const run2Path = `${tempDir}/run2.jsonl`
+
+    const trial1a = createTrialResult('test-001', 0.9, 0.7)
+    const trial1b = createTrialResult('test-002', 0.8, 0.5)
+    const trial2a = createTrialResult('test-001', 0.95, 0.9)
+    const trial2b = createTrialResult('test-002', 0.6, 0.4)
+
+    await Bun.write(run1Path, [JSON.stringify(trial1a), JSON.stringify(trial1b)].join('\n'))
+    await Bun.write(run2Path, [JSON.stringify(trial2a), JSON.stringify(trial2b)].join('\n'))
+
+    const outputPath = `${tempDir}/comparison.json`
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'baseline', path: run1Path },
+        { label: 'variant', path: run2Path },
+      ],
+      outputPath,
+      progress: false,
+    })
+
+    expect(report.meta.inputFormat).toBe('trials')
+    expect(report.meta.runs).toEqual(['baseline', 'variant'])
+    expect(report.meta.promptCount).toBe(2)
+    expect(report.capability).toBeDefined()
+    expect(report.reliability).toBeDefined()
+    expect(report.flakiness).toBeDefined()
+    expect(report.headToHead.capability.length).toBeGreaterThan(0)
+
+    // Verify output file was written
+    const outputExists = await Bun.file(outputPath).exists()
+    expect(outputExists).toBe(true)
+  })
+
+  test('throws with fewer than 2 runs', async () => {
+    const run1Path = `${tempDir}/single-run.jsonl`
+    await Bun.write(run1Path, JSON.stringify(createTrialResult('test-001', 0.9, 0.7)))
+
+    await expect(
+      runTrialsCompare({
+        runs: [{ label: 'only', path: run1Path }],
+        progress: false,
+      }),
+    ).rejects.toThrow('At least 2 runs required')
+  })
+
+  test('skips prompts only in one run', async () => {
+    const run1Path = `${tempDir}/partial1.jsonl`
+    const run2Path = `${tempDir}/partial2.jsonl`
+
+    // Only run1 has test-001
+    const trial1a = createTrialResult('test-001', 0.9, 0.7)
+    // Both have test-002
+    const trial1b = createTrialResult('test-002', 0.8, 0.5)
+    const trial2b = createTrialResult('test-002', 0.6, 0.4)
+
+    await Bun.write(run1Path, [JSON.stringify(trial1a), JSON.stringify(trial1b)].join('\n'))
+    await Bun.write(run2Path, JSON.stringify(trial2b))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'run1', path: run1Path },
+        { label: 'run2', path: run2Path },
+      ],
+      progress: false,
+    })
+
+    // Only test-002 should be compared (both runs have it)
+    expect(report.headToHead.overall.length).toBeGreaterThan(0)
+    // Per-prompt should only have test-002
+    const perPromptIds = report.perPrompt?.map((p) => p.id) ?? []
+    expect(perPromptIds).toContain('test-002')
+    expect(perPromptIds).not.toContain('test-001')
+  })
+
+  test('generates markdown output when format is markdown', async () => {
+    const run1Path = `${tempDir}/md-run1.jsonl`
+    const run2Path = `${tempDir}/md-run2.jsonl`
+    const outputPath = `${tempDir}/report.md`
+
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.8, 0.6)
+
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+
+    await runTrialsCompare({
+      runs: [
+        { label: 'agent1', path: run1Path },
+        { label: 'agent2', path: run2Path },
+      ],
+      outputPath,
+      format: 'markdown',
+      progress: false,
+    })
+
+    const content = await Bun.file(outputPath).text()
+    expect(content).toContain('# Trials Comparison Report')
+    expect(content).toContain('## Capability')
+    expect(content).toContain('## Reliability')
+    expect(content).toContain('## Flakiness')
+    expect(content).toContain('agent1')
+    expect(content).toContain('agent2')
+  })
+
+  test('uses statistical strategy when specified', async () => {
+    const run1Path = `${tempDir}/stat-run1.jsonl`
+    const run2Path = `${tempDir}/stat-run2.jsonl`
+
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.5, 0.3)
+
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'better', path: run1Path },
+        { label: 'worse', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+
+    // Report should be generated without error
+    expect(report.meta.runs).toEqual(['better', 'worse'])
+  })
+
+  test('computes correct capability metrics', async () => {
+    const run1Path = `${tempDir}/cap-run1.jsonl`
+
+    // Create 3 prompts with known passAtK values
+    const trials = [
+      createTrialResult('p1', 1.0, 0.8), // passAtK = 1.0
+      createTrialResult('p2', 0.5, 0.3), // passAtK = 0.5
+      createTrialResult('p3', 0.8, 0.6), // passAtK = 0.8
+    ]
+    // Average passAtK = (1.0 + 0.5 + 0.8) / 3 = 0.767
+    // Sorted: 0.5, 0.8, 1.0 -> median = 0.8
+
+    await Bun.write(run1Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const run2Path = `${tempDir}/cap-run2.jsonl`
+    await Bun.write(run2Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'test', path: run1Path },
+        { label: 'test2', path: run2Path },
+      ],
+      progress: false,
+    })
+
+    const cap = report.capability.test
+    expect(cap).toBeDefined()
+    // Average should be approximately 0.767
+    expect(cap?.avgPassAtK).toBeCloseTo(0.767, 2)
+    // Median of [0.5, 0.8, 1.0] = 0.8
+    expect(cap?.medianPassAtK).toBeCloseTo(0.8, 2)
+  })
+
+  test('identifies flaky prompts correctly', async () => {
+    const run1Path = `${tempDir}/flaky-run1.jsonl`
+
+    // Create prompts with varying flakiness
+    const trials = [
+      createTrialResult('consistent', 0.9, 0.9), // flakiness = 0
+      createTrialResult('flaky', 0.9, 0.1), // flakiness = 0.8
+      createTrialResult('moderate', 0.7, 0.5), // flakiness = 0.2
+    ]
+
+    await Bun.write(run1Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const run2Path = `${tempDir}/flaky-run2.jsonl`
+    await Bun.write(run2Path, trials.map((t) => JSON.stringify(t)).join('\n'))
+
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'test', path: run1Path },
+        { label: 'test2', path: run2Path },
+      ],
+      progress: false,
+    })
+
+    const flak = report.flakiness.test
+    expect(flak).toBeDefined()
+    // 2 prompts have non-zero flakiness
+    expect(flak?.flakyPromptCount).toBe(2)
+    // Top flaky should include 'flaky' prompt
+    const topFlakyIds = flak?.topFlakyPrompts.map((p) => p.id) ?? []
+    expect(topFlakyIds).toContain('flaky')
+  })
+})
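The fixture comments in the flakiness test suggest the metric is the gap between pass@k and pass^k (0.9 - 0.1 = 0.8 for the 'flaky' prompt). A quick sanity check of that reading, inferred from the test comments rather than from compare-trials.ts itself:

```ts
// Flakiness as implied by the fixtures: passAtK - passExpK.
// A prompt that sometimes passes but rarely passes every trial scores high.
const flakiness = (passAtK: number, passExpK: number): number => passAtK - passExpK

flakiness(0.9, 0.9) // 0     -> 'consistent'
flakiness(0.9, 0.1) // ~0.8  -> 'flaky'
flakiness(0.7, 0.5) // ~0.2  -> 'moderate'
```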