@plaited/agent-eval-harness 0.7.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,650 @@
1
+ /**
2
+ * Pipeline compare command for trials data.
3
+ *
4
+ * @remarks
5
+ * Compares multiple runs of TrialResult data, analyzing capability (passAtK),
6
+ * reliability (passExpK), and flakiness metrics.
7
+ *
8
+ * Outputs a TrialsComparisonReport JSON (not JSONL) containing aggregate
9
+ * statistics across all dimensions plus head-to-head comparisons.
10
+ *
11
+ * Built-in strategies:
12
+ * - `weighted`: Configurable weights for capability, reliability, consistency (default)
13
+ * - `statistical`: Bootstrap sampling for confidence intervals on passAtK
14
+ *
15
+ * @packageDocumentation
16
+ */
17
+
18
+ import { logProgress, writeOutput } from '../core.ts'
19
+ import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
20
+ import { grade as statisticalGrade } from '../graders/trials-compare-statistical.ts'
21
+ import { grade as weightedGrade } from '../graders/trials-compare-weighted.ts'
22
+ import type {
23
+ PairwiseComparison,
24
+ TrialResult,
25
+ TrialsCapabilityMetrics,
26
+ TrialsComparisonMeta,
27
+ TrialsComparisonReport,
28
+ TrialsFlakinessMetrics,
29
+ TrialsPromptComparison,
30
+ TrialsReliabilityMetrics,
31
+ } from '../schemas.ts'
32
+ import { TrialResultSchema } from '../schemas.ts'
33
+ import type {
34
+ ComparisonGraderResult,
35
+ LabeledRun,
36
+ TrialsComparisonGrader,
37
+ TrialsComparisonGraderInput,
38
+ TrialsComparisonRunData,
39
+ } from './pipeline.types.ts'
40
+
41
/** Comparison strategy for trials: built-in 'weighted' or 'statistical', or a user-supplied 'custom' grader */
export type TrialsCompareStrategy = 'weighted' | 'statistical' | 'custom'

/** Extended compare config for trials (consumed by runTrialsCompare) */
export type TrialsCompareConfig = {
  /** Labeled runs to compare (at least 2 required) */
  runs: LabeledRun[]
  /** Comparison strategy (default: weighted) */
  strategy?: TrialsCompareStrategy
  /** Path to custom grader module (required if strategy is 'custom') */
  graderPath?: string
  /** Output file path (optional; destination handling is delegated to writeOutput) */
  outputPath?: string
  /** Show progress to stderr (default: false) */
  progress?: boolean
  /** Output format (default: json) */
  format?: 'json' | 'markdown'
}
59
+
60
+ /**
61
+ * Stream trial results from a JSONL file.
62
+ *
63
+ * @param path - Path to the trials.jsonl file
64
+ * @yields Parsed and validated trial results
65
+ */
66
+ async function* streamTrialResults(path: string): AsyncGenerator<TrialResult, void, unknown> {
67
+ const file = Bun.file(path)
68
+ const text = await file.text()
69
+ const lines = text.split('\n')
70
+
71
+ for (let i = 0; i < lines.length; i++) {
72
+ const line = lines[i]?.trim()
73
+ if (!line) continue
74
+
75
+ try {
76
+ yield TrialResultSchema.parse(JSON.parse(line))
77
+ } catch (error) {
78
+ throw new Error(`Invalid trial result at line ${i + 1}: ${error instanceof Error ? error.message : error}`)
79
+ }
80
+ }
81
+ }
82
+
83
+ /**
84
+ * Build an indexed map of trial results by ID.
85
+ *
86
+ * @param path - Path to the trials.jsonl file
87
+ * @returns Map of result ID to TrialResult
88
+ */
89
+ export const buildTrialsIndex = async (path: string): Promise<Map<string, TrialResult>> => {
90
+ const index = new Map<string, TrialResult>()
91
+
92
+ for await (const result of streamTrialResults(path)) {
93
+ index.set(result.id, result)
94
+ }
95
+
96
+ return index
97
+ }
98
+
99
+ /**
100
+ * Load trials comparison grader from file.
101
+ *
102
+ * @param path - Path to grader module
103
+ * @returns Loaded trials comparison grader function
104
+ * @throws Error if module cannot be loaded or doesn't export a grader function
105
+ */
106
+ const loadTrialsComparisonGrader = async (path: string): Promise<TrialsComparisonGrader> => {
107
+ let module: Record<string, unknown>
108
+ try {
109
+ module = (await import(path)) as Record<string, unknown>
110
+ } catch (error) {
111
+ throw new Error(`Failed to load grader from '${path}': ${error instanceof Error ? error.message : error}`)
112
+ }
113
+
114
+ if (typeof module.grade === 'function') {
115
+ return module.grade as TrialsComparisonGrader
116
+ }
117
+ if (typeof module.default === 'function') {
118
+ return module.default as TrialsComparisonGrader
119
+ }
120
+ if (typeof module.compare === 'function') {
121
+ return module.compare as TrialsComparisonGrader
122
+ }
123
+
124
+ throw new Error(`Trials comparison grader must export 'grade', 'compare', or 'default' function`)
125
+ }
126
+
127
+ /**
128
+ * Get grader function based on strategy.
129
+ *
130
+ * @param strategy - Comparison strategy
131
+ * @param graderPath - Path to custom grader (for 'custom' strategy)
132
+ * @returns Trials comparison grader function
133
+ */
134
+ const getTrialsGrader = async (
135
+ strategy: TrialsCompareStrategy,
136
+ graderPath?: string,
137
+ ): Promise<TrialsComparisonGrader> => {
138
+ switch (strategy) {
139
+ case 'weighted':
140
+ return weightedGrade
141
+ case 'statistical':
142
+ return statisticalGrade
143
+ case 'custom':
144
+ if (!graderPath) {
145
+ throw new Error('Custom strategy requires --grader path')
146
+ }
147
+ return loadTrialsComparisonGrader(graderPath)
148
+ }
149
+ }
150
+
151
+ /**
152
+ * Compute percentile from sorted array.
153
+ *
154
+ * @param sorted - Sorted array of numbers
155
+ * @param p - Percentile (0-1)
156
+ * @returns Value at percentile
157
+ */
158
+ const percentile = (sorted: number[], p: number): number => {
159
+ if (sorted.length === 0) return 0
160
+ const idx = Math.floor(sorted.length * p)
161
+ return sorted[Math.min(idx, sorted.length - 1)] ?? 0
162
+ }
163
+
164
+ /**
165
+ * Compute capability metrics from trial results.
166
+ *
167
+ * @param results - Array of trial results
168
+ * @returns Capability metrics (passAtK statistics)
169
+ */
170
+ const computeCapabilityMetrics = (results: TrialResult[]): TrialsCapabilityMetrics => {
171
+ const passAtKValues = results.map((r) => r.passAtK ?? 0)
172
+
173
+ if (passAtKValues.length === 0) {
174
+ return { avgPassAtK: 0, medianPassAtK: 0, p25PassAtK: 0, p75PassAtK: 0 }
175
+ }
176
+
177
+ const sorted = [...passAtKValues].sort((a, b) => a - b)
178
+ const sum = passAtKValues.reduce((a, b) => a + b, 0)
179
+
180
+ return {
181
+ avgPassAtK: sum / passAtKValues.length,
182
+ medianPassAtK: percentile(sorted, 0.5),
183
+ p25PassAtK: percentile(sorted, 0.25),
184
+ p75PassAtK: percentile(sorted, 0.75),
185
+ }
186
+ }
187
+
188
+ /**
189
+ * Compute reliability metrics from trial results.
190
+ *
191
+ * @param results - Array of trial results
192
+ * @returns Reliability metrics (passExpK statistics)
193
+ */
194
+ const computeReliabilityMetrics = (results: TrialResult[]): TrialsReliabilityMetrics => {
195
+ const passExpKValues = results.map((r) => r.passExpK ?? 0)
196
+
197
+ if (passExpKValues.length === 0) {
198
+ return { avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
199
+ }
200
+
201
+ const sorted = [...passExpKValues].sort((a, b) => a - b)
202
+ const sum = passExpKValues.reduce((a, b) => a + b, 0)
203
+
204
+ return {
205
+ avgPassExpK: sum / passExpKValues.length,
206
+ medianPassExpK: percentile(sorted, 0.5),
207
+ p25PassExpK: percentile(sorted, 0.25),
208
+ p75PassExpK: percentile(sorted, 0.75),
209
+ }
210
+ }
211
+
212
+ /**
213
+ * Compute flakiness metrics from trial results.
214
+ *
215
+ * @param results - Array of trial results
216
+ * @param maxTopFlaky - Maximum number of top flaky prompts to include
217
+ * @returns Flakiness metrics
218
+ */
219
+ const computeFlakinessMetrics = (results: TrialResult[], maxTopFlaky: number = 10): TrialsFlakinessMetrics => {
220
+ const flakinessData = results.map((r) => ({
221
+ id: r.id,
222
+ flakiness: Math.max(0, (r.passAtK ?? 0) - (r.passExpK ?? 0)),
223
+ }))
224
+
225
+ if (flakinessData.length === 0) {
226
+ return { avgFlakiness: 0, medianFlakiness: 0, flakyPromptCount: 0, topFlakyPrompts: [] }
227
+ }
228
+
229
+ const flakinessValues = flakinessData.map((d) => d.flakiness)
230
+ const sorted = [...flakinessValues].sort((a, b) => a - b)
231
+ const sum = flakinessValues.reduce((a, b) => a + b, 0)
232
+
233
+ // Sort by flakiness descending to get top flaky prompts
234
+ const topFlaky = [...flakinessData]
235
+ .filter((d) => d.flakiness > 0)
236
+ .sort((a, b) => b.flakiness - a.flakiness)
237
+ .slice(0, maxTopFlaky)
238
+
239
+ return {
240
+ avgFlakiness: sum / flakinessValues.length,
241
+ medianFlakiness: percentile(sorted, 0.5),
242
+ flakyPromptCount: flakinessData.filter((d) => d.flakiness > 0).length,
243
+ topFlakyPrompts: topFlaky,
244
+ }
245
+ }
246
+
247
/**
 * Execute trials comparison and generate aggregate report.
 *
 * @remarks
 * Loads every run fully into memory, compares each prompt that appears in
 * at least two runs using the strategy-selected grader, then aggregates
 * capability (passAtK), reliability (passExpK), and flakiness metrics per
 * run, pairwise head-to-head counts, and — for the 'statistical' strategy —
 * bootstrap confidence intervals. The report is written via writeOutput
 * (JSON or markdown) before being returned.
 *
 * @param config - Trials compare configuration
 * @returns Trials comparison report
 * @throws Error when fewer than 2 runs are supplied, or when strategy is
 * 'custom' without a grader path (via getTrialsGrader)
 */
export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<TrialsComparisonReport> => {
  const { runs, strategy = 'weighted', graderPath, outputPath, progress = false, format = 'json' } = config

  // A comparison needs at least two sides.
  if (runs.length < 2) {
    throw new Error('At least 2 runs required for comparison')
  }

  // Get grader based on strategy
  const grader = await getTrialsGrader(strategy, graderPath)

  const strategyLabel = strategy === 'custom' ? `custom: ${graderPath}` : strategy
  logProgress(`Comparing ${runs.length} trials runs with strategy: ${strategyLabel}`, progress)
  for (const run of runs) {
    logProgress(` - ${run.label}: ${run.path}`, progress)
  }

  // Load all runs using indexed streaming: label -> (prompt id -> result).
  const runResults: Record<string, Map<string, TrialResult>> = {}
  for (const run of runs) {
    logProgress(`Loading ${run.label}...`, progress)
    runResults[run.label] = await buildTrialsIndex(run.path)
  }

  // Build set of all prompt IDs across runs (union, not intersection).
  const promptIds = new Set<string>()
  for (const resultsMap of Object.values(runResults)) {
    for (const id of resultsMap.keys()) {
      promptIds.add(id)
    }
  }

  logProgress(`Comparing ${promptIds.size} prompts...`, progress)

  // Per-prompt comparison results
  const promptComparisons: TrialsPromptComparison[] = []
  const perPromptGraderResults: { id: string; result: ComparisonGraderResult }[] = []

  // Track k value (should be consistent across all results).
  // NOTE(review): only the first non-zero k seen is recorded — assumes k is
  // uniform across prompts and runs; confirm against the producer.
  let trialsPerPrompt = 0

  for (const promptId of promptIds) {
    logProgress(` ${promptId}`, progress)

    // Build comparison input: one TrialsComparisonRunData per run that
    // contains this prompt.
    const runsData: TrialsComparisonGraderInput['runs'] = {}
    let input: string | string[] = ''
    let hint: string | undefined

    for (const [label, resultsMap] of Object.entries(runResults)) {
      const result = resultsMap.get(promptId)
      if (result) {
        const runData: TrialsComparisonRunData = {
          passRate: result.passRate,
          passAtK: result.passAtK,
          passExpK: result.passExpK,
          k: result.k,
          trials: result.trials,
        }
        runsData[label] = runData

        // Track k value
        if (trialsPerPrompt === 0) {
          trialsPerPrompt = result.k
        }

        // Use first found input/hint as the reference
        if (!input) {
          input = result.input
          hint = result.hint
        }
      }
    }

    // Skip if not present in at least 2 runs
    if (Object.keys(runsData).length < 2) {
      logProgress(` Skipped (only in ${Object.keys(runsData).length} run)`, progress)
      continue
    }

    // Apply comparison grader
    const graderInput: TrialsComparisonGraderInput = {
      id: promptId,
      input,
      hint,
      runs: runsData,
    }

    const graderResult = await grader(graderInput)
    perPromptGraderResults.push({ id: promptId, result: graderResult })

    // Build prompt comparison for head-to-head (missing metrics count as 0)
    const passAtK: Record<string, number> = {}
    const passExpK: Record<string, number> = {}
    const flakiness: Record<string, number> = {}

    for (const [label, data] of Object.entries(runsData)) {
      passAtK[label] = data.passAtK ?? 0
      passExpK[label] = data.passExpK ?? 0
      flakiness[label] = Math.max(0, (data.passAtK ?? 0) - (data.passExpK ?? 0))
    }

    // Determine winners: a run wins a dimension only when it strictly
    // beats the runner-up; exact ties produce no winner (null).
    const labels = Object.keys(runsData)
    let capabilityWinner: string | null = null
    let reliabilityWinner: string | null = null

    // Capability winner: highest passAtK
    const sortedByCapability = [...labels].sort((a, b) => (passAtK[b] ?? 0) - (passAtK[a] ?? 0))
    if (sortedByCapability.length >= 2) {
      const first = sortedByCapability[0]
      const second = sortedByCapability[1]
      if (first && second && (passAtK[first] ?? 0) > (passAtK[second] ?? 0)) {
        capabilityWinner = first
      }
    }

    // Reliability winner: highest passExpK
    const sortedByReliability = [...labels].sort((a, b) => (passExpK[b] ?? 0) - (passExpK[a] ?? 0))
    if (sortedByReliability.length >= 2) {
      const first = sortedByReliability[0]
      const second = sortedByReliability[1]
      if (first && second && (passExpK[first] ?? 0) > (passExpK[second] ?? 0)) {
        reliabilityWinner = first
      }
    }

    promptComparisons.push({
      id: promptId,
      capabilityWinner,
      reliabilityWinner,
      passAtK,
      passExpK,
      flakiness,
    })

    // Log winner
    const winner = graderResult.rankings.find((r) => r.rank === 1)
    if (winner) {
      logProgress(` Overall winner: ${winner.run} (${winner.score.toFixed(3)})`, progress)
    }
  }

  // Compute aggregate metrics per run
  const runLabels = runs.map((r) => r.label)

  const capability: Record<string, TrialsCapabilityMetrics> = {}
  const reliability: Record<string, TrialsReliabilityMetrics> = {}
  const flakiness: Record<string, TrialsFlakinessMetrics> = {}

  for (const label of runLabels) {
    const resultsMap = runResults[label] ?? new Map()
    const results = [...resultsMap.values()]

    capability[label] = computeCapabilityMetrics(results)
    reliability[label] = computeReliabilityMetrics(results)
    flakiness[label] = computeFlakinessMetrics(results)
  }

  // Compute confidence intervals when using statistical strategy
  if (strategy === 'statistical') {
    const bootstrapConfig = getBootstrapConfigFromEnv()

    for (const label of runLabels) {
      const resultsMap = runResults[label] ?? new Map()
      const results = [...resultsMap.values()]
      const passAtKValues = results.map((r) => r.passAtK ?? 0)
      const passExpKValues = results.map((r) => r.passExpK ?? 0)

      // Capability CIs
      const capabilityMetrics = capability[label]
      if (capabilityMetrics) {
        capabilityMetrics.confidenceIntervals = {
          avgPassAtK: bootstrap(passAtKValues, bootstrapConfig).ci,
        }
      }

      // Reliability CIs
      const reliabilityMetrics = reliability[label]
      if (reliabilityMetrics) {
        reliabilityMetrics.confidenceIntervals = {
          avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
        }
      }
    }
  }

  // Compute pairwise comparisons over every unordered pair of runs.
  const capabilityPairwise: PairwiseComparison[] = []
  const reliabilityPairwise: PairwiseComparison[] = []
  const overallPairwise: PairwiseComparison[] = []

  for (let i = 0; i < runLabels.length; i++) {
    for (let j = i + 1; j < runLabels.length; j++) {
      const runA = runLabels[i]
      const runB = runLabels[j]

      if (!runA || !runB) continue

      // Capability pairwise
      let capAWins = 0
      let capBWins = 0
      let capTies = 0

      // Reliability pairwise
      let relAWins = 0
      let relBWins = 0
      let relTies = 0

      // Overall pairwise (from grader results)
      let overallAWins = 0
      let overallBWins = 0
      let overallTies = 0

      // NOTE(review): with more than 2 runs, a prompt whose winner is some
      // third run (or that has no winner) is counted as a tie for this
      // pair — confirm that is the intended semantics.
      for (const pc of promptComparisons) {
        // Capability
        if (pc.capabilityWinner === runA) capAWins++
        else if (pc.capabilityWinner === runB) capBWins++
        else capTies++

        // Reliability
        if (pc.reliabilityWinner === runA) relAWins++
        else if (pc.reliabilityWinner === runB) relBWins++
        else relTies++
      }

      // Overall from grader results
      for (const { result } of perPromptGraderResults) {
        const winner = result.rankings.find((r) => r.rank === 1)
        if (winner?.run === runA) overallAWins++
        else if (winner?.run === runB) overallBWins++
        else overallTies++
      }

      capabilityPairwise.push({ runA, runB, aWins: capAWins, bWins: capBWins, ties: capTies })
      reliabilityPairwise.push({ runA, runB, aWins: relAWins, bWins: relBWins, ties: relTies })
      overallPairwise.push({ runA, runB, aWins: overallAWins, bWins: overallBWins, ties: overallTies })
    }
  }

  // Build meta
  const meta: TrialsComparisonMeta = {
    generatedAt: new Date().toISOString(),
    runs: runLabels,
    promptCount: promptIds.size,
    trialsPerPrompt,
    inputFormat: 'trials',
  }

  // Assemble report
  const report: TrialsComparisonReport = {
    meta,
    capability,
    reliability,
    flakiness,
    headToHead: {
      capability: capabilityPairwise,
      reliability: reliabilityPairwise,
      overall: overallPairwise,
    },
    perPrompt: promptComparisons,
  }

  // Output (single JSON document or markdown — not JSONL)
  if (format === 'markdown') {
    const markdown = formatTrialsReportAsMarkdown(report)
    await writeOutput(markdown, outputPath, false)
  } else {
    await writeOutput(JSON.stringify(report, null, 2), outputPath, false)
  }

  // Summary statistics
  logProgress('', progress)
  logProgress('=== Summary ===', progress)

  for (const [label, cap] of Object.entries(capability)) {
    const rel = reliability[label]
    const flak = flakiness[label]
    logProgress(
      ` ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}`,
      progress,
    )
  }

  logProgress('', progress)
  logProgress('Overall wins:', progress)
  for (const pw of overallPairwise) {
    logProgress(` ${pw.runA} vs ${pw.runB}: ${pw.aWins}-${pw.bWins}-${pw.ties}`, progress)
  }

  logProgress('Done!', progress)

  return report
}
546
+
547
+ /**
548
+ * Format trials comparison report as markdown.
549
+ *
550
+ * @param report - Trials comparison report
551
+ * @returns Markdown string
552
+ */
553
+ const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string => {
554
+ const lines: string[] = []
555
+
556
+ lines.push('# Trials Comparison Report')
557
+ lines.push('')
558
+ lines.push(`Generated: ${report.meta.generatedAt}`)
559
+ lines.push(`Runs: ${report.meta.runs.join(', ')}`)
560
+ lines.push(`Prompts: ${report.meta.promptCount} | Trials per prompt: ${report.meta.trialsPerPrompt}`)
561
+ lines.push('')
562
+
563
+ // Check if any run has confidence intervals (statistical strategy was used)
564
+ const hasCIs = Object.values(report.capability).some((c) => c.confidenceIntervals)
565
+
566
+ // Capability table
567
+ lines.push('## Capability (passAtK)')
568
+ lines.push('')
569
+ if (hasCIs) {
570
+ lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
571
+ lines.push('|-----|-----|--------|--------|-----|-----|')
572
+ for (const [label, c] of Object.entries(report.capability)) {
573
+ const avgCI = formatCI(c.confidenceIntervals?.avgPassAtK)
574
+ lines.push(
575
+ `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${avgCI} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
576
+ )
577
+ }
578
+ } else {
579
+ lines.push('| Run | Avg | Median | P25 | P75 |')
580
+ lines.push('|-----|-----|--------|-----|-----|')
581
+ for (const [label, c] of Object.entries(report.capability)) {
582
+ lines.push(
583
+ `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
584
+ )
585
+ }
586
+ }
587
+ lines.push('')
588
+
589
+ // Reliability table
590
+ lines.push('## Reliability (passExpK)')
591
+ lines.push('')
592
+ if (hasCIs) {
593
+ lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
594
+ lines.push('|-----|-----|--------|--------|-----|-----|')
595
+ for (const [label, r] of Object.entries(report.reliability)) {
596
+ const avgCI = formatCI(r.confidenceIntervals?.avgPassExpK)
597
+ lines.push(
598
+ `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${avgCI} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
599
+ )
600
+ }
601
+ } else {
602
+ lines.push('| Run | Avg | Median | P25 | P75 |')
603
+ lines.push('|-----|-----|--------|-----|-----|')
604
+ for (const [label, r] of Object.entries(report.reliability)) {
605
+ lines.push(
606
+ `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
607
+ )
608
+ }
609
+ }
610
+ lines.push('')
611
+
612
+ // Flakiness table
613
+ lines.push('## Flakiness')
614
+ lines.push('')
615
+ lines.push('| Run | Avg | Median | Flaky Prompts |')
616
+ lines.push('|-----|-----|--------|---------------|')
617
+ for (const [label, f] of Object.entries(report.flakiness)) {
618
+ lines.push(`| ${label} | ${f.avgFlakiness.toFixed(3)} | ${f.medianFlakiness.toFixed(3)} | ${f.flakyPromptCount} |`)
619
+ }
620
+ lines.push('')
621
+
622
+ // Head-to-head
623
+ lines.push('## Head-to-Head')
624
+ lines.push('')
625
+ lines.push('### By Capability')
626
+ lines.push('| Matchup | A Wins | B Wins | Ties |')
627
+ lines.push('|---------|--------|--------|------|')
628
+ for (const p of report.headToHead.capability) {
629
+ lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
630
+ }
631
+ lines.push('')
632
+
633
+ lines.push('### By Reliability')
634
+ lines.push('| Matchup | A Wins | B Wins | Ties |')
635
+ lines.push('|---------|--------|--------|------|')
636
+ for (const p of report.headToHead.reliability) {
637
+ lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
638
+ }
639
+ lines.push('')
640
+
641
+ lines.push('### Overall (Weighted)')
642
+ lines.push('| Matchup | A Wins | B Wins | Ties |')
643
+ lines.push('|---------|--------|--------|------|')
644
+ for (const p of report.headToHead.overall) {
645
+ lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
646
+ }
647
+ lines.push('')
648
+
649
+ return lines.join('\n')
650
+ }