@plaited/agent-eval-harness 0.7.0 → 0.8.1
- package/README.md +3 -0
- package/package.json +5 -3
- package/src/graders/bootstrap.ts +135 -0
- package/src/graders/compare-statistical.ts +14 -86
- package/src/graders/tests/bootstrap.spec.ts +169 -0
- package/src/graders/tests/trials-compare-graders.spec.ts +358 -0
- package/src/graders/trials-compare-statistical.ts +183 -0
- package/src/graders/trials-compare-weighted.ts +128 -0
- package/src/graders.ts +21 -1
- package/src/pipeline/compare-format-detection.ts +100 -0
- package/src/pipeline/compare-trials.ts +650 -0
- package/src/pipeline/compare.ts +144 -31
- package/src/pipeline/pipeline.types.ts +52 -1
- package/src/pipeline/tests/compare-format-detection.spec.ts +142 -0
- package/src/pipeline/tests/compare-statistical.spec.ts +281 -0
- package/src/pipeline/tests/compare-trials.spec.ts +417 -0
- package/src/schemas/schemas.ts +216 -0
- package/src/schemas.ts +13 -0
package/src/pipeline/compare-trials.ts
@@ -0,0 +1,650 @@
+/**
+ * Pipeline compare command for trials data.
+ *
+ * @remarks
+ * Compares multiple runs of TrialResult data, analyzing capability (passAtK),
+ * reliability (passExpK), and flakiness metrics.
+ *
+ * Outputs a TrialsComparisonReport JSON (not JSONL) containing aggregate
+ * statistics across all dimensions plus head-to-head comparisons.
+ *
+ * Built-in strategies:
+ * - `weighted`: Configurable weights for capability, reliability, consistency (default)
+ * - `statistical`: Bootstrap sampling for confidence intervals on passAtK
+ *
+ * @packageDocumentation
+ */
+
+import { logProgress, writeOutput } from '../core.ts'
+import { bootstrap, formatCI, getBootstrapConfigFromEnv } from '../graders/bootstrap.ts'
+import { grade as statisticalGrade } from '../graders/trials-compare-statistical.ts'
+import { grade as weightedGrade } from '../graders/trials-compare-weighted.ts'
+import type {
+  PairwiseComparison,
+  TrialResult,
+  TrialsCapabilityMetrics,
+  TrialsComparisonMeta,
+  TrialsComparisonReport,
+  TrialsFlakinessMetrics,
+  TrialsPromptComparison,
+  TrialsReliabilityMetrics,
+} from '../schemas.ts'
+import { TrialResultSchema } from '../schemas.ts'
+import type {
+  ComparisonGraderResult,
+  LabeledRun,
+  TrialsComparisonGrader,
+  TrialsComparisonGraderInput,
+  TrialsComparisonRunData,
+} from './pipeline.types.ts'
+
+/** Comparison strategy type for trials */
+export type TrialsCompareStrategy = 'weighted' | 'statistical' | 'custom'
+
+/** Extended compare config for trials */
+export type TrialsCompareConfig = {
+  /** Labeled runs to compare */
+  runs: LabeledRun[]
+  /** Comparison strategy (default: weighted) */
+  strategy?: TrialsCompareStrategy
+  /** Path to custom grader (required if strategy is 'custom') */
+  graderPath?: string
+  /** Output file path */
+  outputPath?: string
+  /** Show progress to stderr */
+  progress?: boolean
+  /** Output format (default: json) */
+  format?: 'json' | 'markdown'
+}
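
For orientation, a minimal usage sketch against the config type above, assuming the `{ label, path }` shape of `LabeledRun` that this file reads; the import specifier, labels, and paths are hypothetical:

```ts
// Hypothetical paths and labels; import specifier assumed from the file layout.
import { runTrialsCompare } from './src/pipeline/compare-trials.ts'

const report = await runTrialsCompare({
  runs: [
    { label: 'baseline', path: './runs/baseline/trials.jsonl' },
    { label: 'candidate', path: './runs/candidate/trials.jsonl' },
  ],
  strategy: 'weighted', // default; 'statistical' adds bootstrap CIs, 'custom' needs graderPath
  outputPath: './comparison.json',
  progress: true,
})
```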
+
+/**
+ * Stream trial results from a JSONL file.
+ *
+ * @param path - Path to the trials.jsonl file
+ * @yields Parsed and validated trial results
+ */
+async function* streamTrialResults(path: string): AsyncGenerator<TrialResult, void, unknown> {
+  const file = Bun.file(path)
+  const text = await file.text()
+  const lines = text.split('\n')
+
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i]?.trim()
+    if (!line) continue
+
+    try {
+      yield TrialResultSchema.parse(JSON.parse(line))
+    } catch (error) {
+      throw new Error(`Invalid trial result at line ${i + 1}: ${error instanceof Error ? error.message : error}`)
+    }
+  }
+}
+
+/**
+ * Build an indexed map of trial results by ID.
+ *
+ * @param path - Path to the trials.jsonl file
+ * @returns Map of result ID to TrialResult
+ */
+export const buildTrialsIndex = async (path: string): Promise<Map<string, TrialResult>> => {
+  const index = new Map<string, TrialResult>()
+
+  for await (const result of streamTrialResults(path)) {
+    index.set(result.id, result)
+  }
+
+  return index
+}
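
Each input line must parse against `TrialResultSchema`, whose full definition lives in `src/schemas/schemas.ts` and is not shown in this hunk. A hypothetical record limited to the fields this module actually reads, plus a `buildTrialsIndex` call, might look like:

```ts
// One hypothetical trials.jsonl line (the real schema may require more fields;
// the shape of `trials` is not shown in this diff):
// {"id":"prompt-001","input":"...","passRate":0.6,"passAtK":1,"passExpK":0.6,"k":5,"trials":[]}

const index = await buildTrialsIndex('./runs/baseline/trials.jsonl') // path hypothetical
index.get('prompt-001') // → TrialResult | undefined
```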
+
+/**
+ * Load trials comparison grader from file.
+ *
+ * @param path - Path to grader module
+ * @returns Loaded trials comparison grader function
+ * @throws Error if module cannot be loaded or doesn't export a grader function
+ */
+const loadTrialsComparisonGrader = async (path: string): Promise<TrialsComparisonGrader> => {
+  let module: Record<string, unknown>
+  try {
+    module = (await import(path)) as Record<string, unknown>
+  } catch (error) {
+    throw new Error(`Failed to load grader from '${path}': ${error instanceof Error ? error.message : error}`)
+  }
+
+  if (typeof module.grade === 'function') {
+    return module.grade as TrialsComparisonGrader
+  }
+  if (typeof module.default === 'function') {
+    return module.default as TrialsComparisonGrader
+  }
+  if (typeof module.compare === 'function') {
+    return module.compare as TrialsComparisonGrader
+  }
+
+  throw new Error(`Trials comparison grader must export 'grade', 'compare', or 'default' function`)
+}
+
+/**
+ * Get grader function based on strategy.
+ *
+ * @param strategy - Comparison strategy
+ * @param graderPath - Path to custom grader (for 'custom' strategy)
+ * @returns Trials comparison grader function
+ */
+const getTrialsGrader = async (
+  strategy: TrialsCompareStrategy,
+  graderPath?: string,
+): Promise<TrialsComparisonGrader> => {
+  switch (strategy) {
+    case 'weighted':
+      return weightedGrade
+    case 'statistical':
+      return statisticalGrade
+    case 'custom':
+      if (!graderPath) {
+        throw new Error('Custom strategy requires --grader path')
+      }
+      return loadTrialsComparisonGrader(graderPath)
+  }
+}
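
Since a custom grader only has to export `grade`, `compare`, or `default`, a minimal sketch follows. It ranks runs by raw passAtK; the `{ run, rank, score }` rankings shape is inferred from how this file consumes `ComparisonGraderResult`, so any additional required fields are an open question:

```ts
// my-grader.ts: hypothetical custom grader, loaded via strategy 'custom' + graderPath.
import type { ComparisonGraderResult, TrialsComparisonGraderInput } from './src/pipeline/pipeline.types.ts'

export const grade = async (input: TrialsComparisonGraderInput): Promise<ComparisonGraderResult> => {
  // Rank runs by passAtK alone; ties keep insertion order.
  const rankings = Object.entries(input.runs)
    .sort(([, a], [, b]) => (b.passAtK ?? 0) - (a.passAtK ?? 0))
    .map(([run, data], i) => ({ run, rank: i + 1, score: data.passAtK ?? 0 }))
  return { rankings }
}
```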
+
+/**
+ * Compute percentile from sorted array.
+ *
+ * @param sorted - Sorted array of numbers
+ * @param p - Percentile (0-1)
+ * @returns Value at percentile
+ */
+const percentile = (sorted: number[], p: number): number => {
+  if (sorted.length === 0) return 0
+  const idx = Math.floor(sorted.length * p)
+  return sorted[Math.min(idx, sorted.length - 1)] ?? 0
+}
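
Worth noting: this is a floor-indexed (nearest-rank style) pick with no interpolation, clamped to the last element so `p = 1` is safe. For example:

```ts
percentile([0.2, 0.5, 0.9], 0.5)  // floor(3 * 0.5)  = 1 → 0.5
percentile([0.2, 0.5, 0.9], 0.75) // floor(3 * 0.75) = 2 → 0.9
percentile([0.2, 0.5, 0.9], 1.0)  // idx 3, clamped to 2 → 0.9
```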
+
+/**
+ * Compute capability metrics from trial results.
+ *
+ * @param results - Array of trial results
+ * @returns Capability metrics (passAtK statistics)
+ */
+const computeCapabilityMetrics = (results: TrialResult[]): TrialsCapabilityMetrics => {
+  const passAtKValues = results.map((r) => r.passAtK ?? 0)
+
+  if (passAtKValues.length === 0) {
+    return { avgPassAtK: 0, medianPassAtK: 0, p25PassAtK: 0, p75PassAtK: 0 }
+  }
+
+  const sorted = [...passAtKValues].sort((a, b) => a - b)
+  const sum = passAtKValues.reduce((a, b) => a + b, 0)
+
+  return {
+    avgPassAtK: sum / passAtKValues.length,
+    medianPassAtK: percentile(sorted, 0.5),
+    p25PassAtK: percentile(sorted, 0.25),
+    p75PassAtK: percentile(sorted, 0.75),
+  }
+}
+
+/**
+ * Compute reliability metrics from trial results.
+ *
+ * @param results - Array of trial results
+ * @returns Reliability metrics (passExpK statistics)
+ */
+const computeReliabilityMetrics = (results: TrialResult[]): TrialsReliabilityMetrics => {
+  const passExpKValues = results.map((r) => r.passExpK ?? 0)
+
+  if (passExpKValues.length === 0) {
+    return { avgPassExpK: 0, medianPassExpK: 0, p25PassExpK: 0, p75PassExpK: 0 }
+  }
+
+  const sorted = [...passExpKValues].sort((a, b) => a - b)
+  const sum = passExpKValues.reduce((a, b) => a + b, 0)
+
+  return {
+    avgPassExpK: sum / passExpKValues.length,
+    medianPassExpK: percentile(sorted, 0.5),
+    p25PassExpK: percentile(sorted, 0.25),
+    p75PassExpK: percentile(sorted, 0.75),
+  }
+}
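
A worked example of the aggregation; note that with the floor-indexed percentile, the median of an even-length array is the upper of the two middle values:

```ts
const values = [0.0, 0.5, 1.0, 1.0] // passAtK across four prompts (hypothetical)
const sorted = [...values].sort((a, b) => a - b)
sorted.reduce((a, b) => a + b, 0) / sorted.length // avgPassAtK    = 0.625
sorted[Math.floor(sorted.length * 0.5)]           // medianPassAtK = 1.0 (upper middle)
sorted[Math.floor(sorted.length * 0.25)]          // p25PassAtK    = 0.5
sorted[Math.floor(sorted.length * 0.75)]          // p75PassAtK    = 1.0
```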
+
+/**
+ * Compute flakiness metrics from trial results.
+ *
+ * @param results - Array of trial results
+ * @param maxTopFlaky - Maximum number of top flaky prompts to include
+ * @returns Flakiness metrics
+ */
+const computeFlakinessMetrics = (results: TrialResult[], maxTopFlaky: number = 10): TrialsFlakinessMetrics => {
+  const flakinessData = results.map((r) => ({
+    id: r.id,
+    flakiness: Math.max(0, (r.passAtK ?? 0) - (r.passExpK ?? 0)),
+  }))
+
+  if (flakinessData.length === 0) {
+    return { avgFlakiness: 0, medianFlakiness: 0, flakyPromptCount: 0, topFlakyPrompts: [] }
+  }
+
+  const flakinessValues = flakinessData.map((d) => d.flakiness)
+  const sorted = [...flakinessValues].sort((a, b) => a - b)
+  const sum = flakinessValues.reduce((a, b) => a + b, 0)
+
+  // Sort by flakiness descending to get top flaky prompts
+  const topFlaky = [...flakinessData]
+    .filter((d) => d.flakiness > 0)
+    .sort((a, b) => b.flakiness - a.flakiness)
+    .slice(0, maxTopFlaky)
+
+  return {
+    avgFlakiness: sum / flakinessValues.length,
+    medianFlakiness: percentile(sorted, 0.5),
+    flakyPromptCount: flakinessData.filter((d) => d.flakiness > 0).length,
+    topFlakyPrompts: topFlaky,
+  }
+}
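
Flakiness here is the capability/reliability gap per prompt: a task that is solvable but not dependable scores high, while a task that always or never passes scores 0. For example:

```ts
Math.max(0, 1.0 - 0.4) // passAtK 1.0, passExpK 0.4 → flakiness 0.6 (solvable, not dependable)
Math.max(0, 1.0 - 1.0) // always passes → 0
Math.max(0, 0.0 - 0.0) // never passes  → 0
```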
+
+/**
+ * Execute trials comparison and generate aggregate report.
+ *
+ * @param config - Trials compare configuration
+ * @returns Trials comparison report
+ */
+export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<TrialsComparisonReport> => {
+  const { runs, strategy = 'weighted', graderPath, outputPath, progress = false, format = 'json' } = config
+
+  if (runs.length < 2) {
+    throw new Error('At least 2 runs required for comparison')
+  }
+
+  // Get grader based on strategy
+  const grader = await getTrialsGrader(strategy, graderPath)
+
+  const strategyLabel = strategy === 'custom' ? `custom: ${graderPath}` : strategy
+  logProgress(`Comparing ${runs.length} trials runs with strategy: ${strategyLabel}`, progress)
+  for (const run of runs) {
+    logProgress(` - ${run.label}: ${run.path}`, progress)
+  }
+
+  // Load all runs using indexed streaming
+  const runResults: Record<string, Map<string, TrialResult>> = {}
+  for (const run of runs) {
+    logProgress(`Loading ${run.label}...`, progress)
+    runResults[run.label] = await buildTrialsIndex(run.path)
+  }
+
+  // Build set of all prompt IDs across runs
+  const promptIds = new Set<string>()
+  for (const resultsMap of Object.values(runResults)) {
+    for (const id of resultsMap.keys()) {
+      promptIds.add(id)
+    }
+  }
+
+  logProgress(`Comparing ${promptIds.size} prompts...`, progress)
+
+  // Per-prompt comparison results
+  const promptComparisons: TrialsPromptComparison[] = []
+  const perPromptGraderResults: { id: string; result: ComparisonGraderResult }[] = []
+
+  // Track k value (should be consistent across all results)
+  let trialsPerPrompt = 0
+
+  for (const promptId of promptIds) {
+    logProgress(` ${promptId}`, progress)
+
+    // Build comparison input
+    const runsData: TrialsComparisonGraderInput['runs'] = {}
+    let input: string | string[] = ''
+    let hint: string | undefined
+
+    for (const [label, resultsMap] of Object.entries(runResults)) {
+      const result = resultsMap.get(promptId)
+      if (result) {
+        const runData: TrialsComparisonRunData = {
+          passRate: result.passRate,
+          passAtK: result.passAtK,
+          passExpK: result.passExpK,
+          k: result.k,
+          trials: result.trials,
+        }
+        runsData[label] = runData
+
+        // Track k value
+        if (trialsPerPrompt === 0) {
+          trialsPerPrompt = result.k
+        }
+
+        // Use first found input/hint as the reference
+        if (!input) {
+          input = result.input
+          hint = result.hint
+        }
+      }
+    }
+
+    // Skip if not present in at least 2 runs
+    if (Object.keys(runsData).length < 2) {
+      logProgress(` Skipped (only in ${Object.keys(runsData).length} run)`, progress)
+      continue
+    }
+
+    // Apply comparison grader
+    const graderInput: TrialsComparisonGraderInput = {
+      id: promptId,
+      input,
+      hint,
+      runs: runsData,
+    }
+
+    const graderResult = await grader(graderInput)
+    perPromptGraderResults.push({ id: promptId, result: graderResult })
+
+    // Build prompt comparison for head-to-head
+    const passAtK: Record<string, number> = {}
+    const passExpK: Record<string, number> = {}
+    const flakiness: Record<string, number> = {}
+
+    for (const [label, data] of Object.entries(runsData)) {
+      passAtK[label] = data.passAtK ?? 0
+      passExpK[label] = data.passExpK ?? 0
+      flakiness[label] = Math.max(0, (data.passAtK ?? 0) - (data.passExpK ?? 0))
+    }
+
+    // Determine winners
+    const labels = Object.keys(runsData)
+    let capabilityWinner: string | null = null
+    let reliabilityWinner: string | null = null
+
+    // Capability winner: highest passAtK
+    const sortedByCapability = [...labels].sort((a, b) => (passAtK[b] ?? 0) - (passAtK[a] ?? 0))
+    if (sortedByCapability.length >= 2) {
+      const first = sortedByCapability[0]
+      const second = sortedByCapability[1]
+      if (first && second && (passAtK[first] ?? 0) > (passAtK[second] ?? 0)) {
+        capabilityWinner = first
+      }
+    }
+
+    // Reliability winner: highest passExpK
+    const sortedByReliability = [...labels].sort((a, b) => (passExpK[b] ?? 0) - (passExpK[a] ?? 0))
+    if (sortedByReliability.length >= 2) {
+      const first = sortedByReliability[0]
+      const second = sortedByReliability[1]
+      if (first && second && (passExpK[first] ?? 0) > (passExpK[second] ?? 0)) {
+        reliabilityWinner = first
+      }
+    }
+
+    promptComparisons.push({
+      id: promptId,
+      capabilityWinner,
+      reliabilityWinner,
+      passAtK,
+      passExpK,
+      flakiness,
+    })
+
+    // Log winner
+    const winner = graderResult.rankings.find((r) => r.rank === 1)
+    if (winner) {
+      logProgress(` Overall winner: ${winner.run} (${winner.score.toFixed(3)})`, progress)
+    }
+  }
+
+  // Compute aggregate metrics per run
+  const runLabels = runs.map((r) => r.label)
+
+  const capability: Record<string, TrialsCapabilityMetrics> = {}
+  const reliability: Record<string, TrialsReliabilityMetrics> = {}
+  const flakiness: Record<string, TrialsFlakinessMetrics> = {}
+
+  for (const label of runLabels) {
+    const resultsMap = runResults[label] ?? new Map()
+    const results = [...resultsMap.values()]
+
+    capability[label] = computeCapabilityMetrics(results)
+    reliability[label] = computeReliabilityMetrics(results)
+    flakiness[label] = computeFlakinessMetrics(results)
+  }
+
+  // Compute confidence intervals when using statistical strategy
+  if (strategy === 'statistical') {
+    const bootstrapConfig = getBootstrapConfigFromEnv()
+
+    for (const label of runLabels) {
+      const resultsMap = runResults[label] ?? new Map()
+      const results = [...resultsMap.values()]
+      const passAtKValues = results.map((r) => r.passAtK ?? 0)
+      const passExpKValues = results.map((r) => r.passExpK ?? 0)
+
+      // Capability CIs
+      const capabilityMetrics = capability[label]
+      if (capabilityMetrics) {
+        capabilityMetrics.confidenceIntervals = {
+          avgPassAtK: bootstrap(passAtKValues, bootstrapConfig).ci,
+        }
+      }
+
+      // Reliability CIs
+      const reliabilityMetrics = reliability[label]
+      if (reliabilityMetrics) {
+        reliabilityMetrics.confidenceIntervals = {
+          avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
+        }
+      }
+    }
+  }
+
+  // Compute pairwise comparisons
+  const capabilityPairwise: PairwiseComparison[] = []
+  const reliabilityPairwise: PairwiseComparison[] = []
+  const overallPairwise: PairwiseComparison[] = []
+
+  for (let i = 0; i < runLabels.length; i++) {
+    for (let j = i + 1; j < runLabels.length; j++) {
+      const runA = runLabels[i]
+      const runB = runLabels[j]
+
+      if (!runA || !runB) continue
+
+      // Capability pairwise
+      let capAWins = 0
+      let capBWins = 0
+      let capTies = 0
+
+      // Reliability pairwise
+      let relAWins = 0
+      let relBWins = 0
+      let relTies = 0
+
+      // Overall pairwise (from grader results)
+      let overallAWins = 0
+      let overallBWins = 0
+      let overallTies = 0
+
+      for (const pc of promptComparisons) {
+        // Capability
+        if (pc.capabilityWinner === runA) capAWins++
+        else if (pc.capabilityWinner === runB) capBWins++
+        else capTies++
+
+        // Reliability
+        if (pc.reliabilityWinner === runA) relAWins++
+        else if (pc.reliabilityWinner === runB) relBWins++
+        else relTies++
+      }
+
+      // Overall from grader results
+      for (const { result } of perPromptGraderResults) {
+        const winner = result.rankings.find((r) => r.rank === 1)
+        if (winner?.run === runA) overallAWins++
+        else if (winner?.run === runB) overallBWins++
+        else overallTies++
+      }
+
+      capabilityPairwise.push({ runA, runB, aWins: capAWins, bWins: capBWins, ties: capTies })
+      reliabilityPairwise.push({ runA, runB, aWins: relAWins, bWins: relBWins, ties: relTies })
+      overallPairwise.push({ runA, runB, aWins: overallAWins, bWins: overallBWins, ties: overallTies })
+    }
+  }
+
+  // Build meta
+  const meta: TrialsComparisonMeta = {
+    generatedAt: new Date().toISOString(),
+    runs: runLabels,
+    promptCount: promptIds.size,
+    trialsPerPrompt,
+    inputFormat: 'trials',
+  }
+
+  // Assemble report
+  const report: TrialsComparisonReport = {
+    meta,
+    capability,
+    reliability,
+    flakiness,
+    headToHead: {
+      capability: capabilityPairwise,
+      reliability: reliabilityPairwise,
+      overall: overallPairwise,
+    },
+    perPrompt: promptComparisons,
+  }
+
+  // Output
+  if (format === 'markdown') {
+    const markdown = formatTrialsReportAsMarkdown(report)
+    await writeOutput(markdown, outputPath, false)
+  } else {
+    await writeOutput(JSON.stringify(report, null, 2), outputPath, false)
+  }
+
+  // Summary statistics
+  logProgress('', progress)
+  logProgress('=== Summary ===', progress)
+
+  for (const [label, cap] of Object.entries(capability)) {
+    const rel = reliability[label]
+    const flak = flakiness[label]
+    logProgress(
+      ` ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}`,
+      progress,
+    )
+  }
+
+  logProgress('', progress)
+  logProgress('Overall wins:', progress)
+  for (const pw of overallPairwise) {
+    logProgress(` ${pw.runA} vs ${pw.runB}: ${pw.aWins}-${pw.bWins}-${pw.ties}`, progress)
+  }
+
+  logProgress('Done!', progress)
+
+  return report
+}
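
A consumption sketch for the returned report, continuing the hypothetical labels from the config example above; with `strategy: 'statistical'`, the capability and reliability entries additionally carry `confidenceIntervals`:

```ts
const report = await runTrialsCompare({
  runs: [
    { label: 'baseline', path: './runs/baseline/trials.jsonl' },
    { label: 'candidate', path: './runs/candidate/trials.jsonl' },
  ],
  strategy: 'statistical',
})

report.capability['candidate']?.confidenceIntervals?.avgPassAtK // bootstrap CI on mean passAtK
for (const { runA, runB, aWins, bWins, ties } of report.headToHead.overall) {
  console.error(`${runA} vs ${runB}: ${aWins}-${bWins}-${ties}`)
}
```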
+
+/**
+ * Format trials comparison report as markdown.
+ *
+ * @param report - Trials comparison report
+ * @returns Markdown string
+ */
+const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string => {
+  const lines: string[] = []
+
+  lines.push('# Trials Comparison Report')
+  lines.push('')
+  lines.push(`Generated: ${report.meta.generatedAt}`)
+  lines.push(`Runs: ${report.meta.runs.join(', ')}`)
+  lines.push(`Prompts: ${report.meta.promptCount} | Trials per prompt: ${report.meta.trialsPerPrompt}`)
+  lines.push('')
+
+  // Check if any run has confidence intervals (statistical strategy was used)
+  const hasCIs = Object.values(report.capability).some((c) => c.confidenceIntervals)
+
+  // Capability table
+  lines.push('## Capability (passAtK)')
+  lines.push('')
+  if (hasCIs) {
+    lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|--------|-----|-----|')
+    for (const [label, c] of Object.entries(report.capability)) {
+      const avgCI = formatCI(c.confidenceIntervals?.avgPassAtK)
+      lines.push(
+        `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${avgCI} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | Avg | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|-----|-----|')
+    for (const [label, c] of Object.entries(report.capability)) {
+      lines.push(
+        `| ${label} | ${c.avgPassAtK.toFixed(3)} | ${c.medianPassAtK.toFixed(3)} | ${c.p25PassAtK.toFixed(3)} | ${c.p75PassAtK.toFixed(3)} |`,
+      )
+    }
+  }
+  lines.push('')
+
+  // Reliability table
+  lines.push('## Reliability (passExpK)')
+  lines.push('')
+  if (hasCIs) {
+    lines.push('| Run | Avg | 95% CI | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|--------|-----|-----|')
+    for (const [label, r] of Object.entries(report.reliability)) {
+      const avgCI = formatCI(r.confidenceIntervals?.avgPassExpK)
+      lines.push(
+        `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${avgCI} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | Avg | Median | P25 | P75 |')
+    lines.push('|-----|-----|--------|-----|-----|')
+    for (const [label, r] of Object.entries(report.reliability)) {
+      lines.push(
+        `| ${label} | ${r.avgPassExpK.toFixed(3)} | ${r.medianPassExpK.toFixed(3)} | ${r.p25PassExpK.toFixed(3)} | ${r.p75PassExpK.toFixed(3)} |`,
+      )
+    }
+  }
+  lines.push('')
+
+  // Flakiness table
+  lines.push('## Flakiness')
+  lines.push('')
+  lines.push('| Run | Avg | Median | Flaky Prompts |')
+  lines.push('|-----|-----|--------|---------------|')
+  for (const [label, f] of Object.entries(report.flakiness)) {
+    lines.push(`| ${label} | ${f.avgFlakiness.toFixed(3)} | ${f.medianFlakiness.toFixed(3)} | ${f.flakyPromptCount} |`)
+  }
+  lines.push('')
+
+  // Head-to-head
+  lines.push('## Head-to-Head')
+  lines.push('')
+  lines.push('### By Capability')
+  lines.push('| Matchup | A Wins | B Wins | Ties |')
+  lines.push('|---------|--------|--------|------|')
+  for (const p of report.headToHead.capability) {
+    lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
+  }
+  lines.push('')
+
+  lines.push('### By Reliability')
+  lines.push('| Matchup | A Wins | B Wins | Ties |')
+  lines.push('|---------|--------|--------|------|')
+  for (const p of report.headToHead.reliability) {
+    lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
+  }
+  lines.push('')
+
+  lines.push('### Overall (Weighted)')
+  lines.push('| Matchup | A Wins | B Wins | Ties |')
+  lines.push('|---------|--------|--------|------|')
+  for (const p of report.headToHead.overall) {
+    lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
+  }
+  lines.push('')
+
+  return lines.join('\n')
+}
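
For reference, the markdown formatter above emits output of this shape (labels and numbers hypothetical; non-CI variant shown):

```
# Trials Comparison Report

Generated: 2024-01-01T00:00:00.000Z
Runs: baseline, candidate
Prompts: 40 | Trials per prompt: 5

## Capability (passAtK)

| Run | Avg | Median | P25 | P75 |
|-----|-----|--------|-----|-----|
| baseline | 0.700 | 0.750 | 0.500 | 1.000 |
| candidate | 0.825 | 1.000 | 0.750 | 1.000 |
```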