@plaited/agent-eval-harness 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,10 @@
  *
  * @remarks
  * Compares results from different configurations (agents, MCP servers, models)
- * using a user-provided comparison grader that ranks the runs.
+ * using either built-in strategies or a user-provided comparison grader.
+ *
+ * Outputs a holistic ComparisonReport JSON (not JSONL) containing aggregate
+ * statistics across quality, performance, reliability, and head-to-head metrics.
  *
  * Terminology: "runs" (not "agents") because comparisons can be:
  * - Same agent, different MCP servers
@@ -12,13 +15,33 @@
  * - Same agent, different model versions
  * - Different agents entirely
  *
+ * Built-in strategies:
+ * - `weighted`: Configurable weights for quality, latency, reliability (default)
+ * - `statistical`: Bootstrap sampling for confidence intervals
+ *
  * @packageDocumentation
  */
 
 import { basename, extname } from 'node:path'
 import { parseArgs } from 'node:util'
-import { loadResults, logProgress, writeOutput } from '../core.ts'
-import type { CaptureResult } from '../schemas.ts'
+import { buildResultsIndex, logProgress, writeOutput } from '../core.ts'
+import { grade as statisticalGrade } from '../graders/compare-statistical.ts'
+import { grade as weightedGrade } from '../graders/compare-weighted.ts'
+import type {
+  CaptureResult,
+  ComparisonMeta,
+  ComparisonReport,
+  HeadToHead,
+  LatencyStats,
+  PairwiseComparison,
+  PerformanceMetrics,
+  PromptComparison,
+  QualityMetrics,
+  ReliabilityMetrics,
+  ScoreDistribution,
+  TrajectoryInfo,
+  TrajectoryRichness,
+} from '../schemas.ts'
 import type {
   CompareConfig,
   ComparisonGrader,
@@ -27,6 +50,19 @@ import type {
   LabeledRun,
 } from './pipeline.types.ts'
 
+/** Comparison strategy type */
+export type CompareStrategy = 'weighted' | 'statistical' | 'custom'
+
+/** Extended compare config with strategy support */
+export type ExtendedCompareConfig = Omit<CompareConfig, 'graderPath'> & {
+  /** Comparison strategy (default: weighted) */
+  strategy?: CompareStrategy
+  /** Path to custom grader (required if strategy is 'custom') */
+  graderPath?: string
+  /** Output format (default: json) */
+  format?: 'json' | 'markdown'
+}
+
 /**
  * Load comparison grader from file.
  *
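The new `ExtendedCompareConfig` is consumed by `runCompare` further down in this diff. A minimal programmatic sketch, assuming the function is re-exported from the package root (the exact export path is not shown in this diff):

```ts
// Sketch only: the import specifier below is an assumption.
import { runCompare } from '@plaited/agent-eval-harness'

const report = await runCompare({
  runs: [
    { label: 'vanilla', path: 'vanilla.jsonl' },
    { label: 'with-mcp', path: 'with-mcp.jsonl' },
  ],
  strategy: 'weighted', // 'statistical' | 'custom' also valid; 'custom' needs graderPath
  format: 'json',
  outputPath: 'comparison.json',
})
```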
@@ -93,50 +129,204 @@ const parseLabeledRun = (arg: string): LabeledRun => {
 }
 
 /**
- * Execute pipeline compare with configuration.
+ * Validate that all run files exist.
  *
- * @param config - Compare configuration
+ * @param runs - Labeled runs to validate
+ * @throws Error if any file doesn't exist
  */
-export const runCompare = async (config: CompareConfig): Promise<void> => {
-  const { runs, graderPath, outputPath, progress = false } = config
+const validateRunFiles = async (runs: LabeledRun[]): Promise<void> => {
+  const missing: string[] = []
+
+  for (const run of runs) {
+    const exists = await Bun.file(run.path).exists()
+    if (!exists) {
+      missing.push(`${run.label}: ${run.path}`)
+    }
+  }
+
+  if (missing.length > 0) {
+    throw new Error(`Result file(s) not found:\n  ${missing.join('\n  ')}`)
+  }
+}
+
+/**
+ * Infer output format from file extension.
+ *
+ * @param outputPath - Output file path
+ * @param explicitFormat - Explicitly provided format (takes precedence)
+ * @returns Inferred format
+ */
+const inferFormat = (outputPath: string | undefined, explicitFormat: string | undefined): 'json' | 'markdown' => {
+  // Explicit format takes precedence
+  if (explicitFormat === 'json' || explicitFormat === 'markdown') {
+    return explicitFormat
+  }
+
+  // Infer from file extension
+  if (outputPath) {
+    const ext = extname(outputPath).toLowerCase()
+    if (ext === '.md' || ext === '.markdown') {
+      return 'markdown'
+    }
+  }
+
+  return 'json'
+}
+
+/**
+ * Get grader function based on strategy.
+ *
+ * @param strategy - Comparison strategy
+ * @param graderPath - Path to custom grader (for 'custom' strategy)
+ * @returns Comparison grader function
+ */
+const getGrader = async (strategy: CompareStrategy, graderPath?: string): Promise<ComparisonGrader> => {
+  switch (strategy) {
+    case 'weighted':
+      return weightedGrade
+    case 'statistical':
+      return statisticalGrade
+    case 'custom':
+      if (!graderPath) {
+        throw new Error('Custom strategy requires --grader path')
+      }
+      return loadComparisonGrader(graderPath)
+  }
+}
+
+/**
+ * Compute percentile from sorted array.
+ *
+ * @param sorted - Sorted array of numbers
+ * @param p - Percentile (0-1)
+ * @returns Value at percentile
+ */
+const percentile = (sorted: number[], p: number): number => {
+  if (sorted.length === 0) return 0
+  const idx = Math.floor(sorted.length * p)
+  return sorted[Math.min(idx, sorted.length - 1)] ?? 0
+}
+
+/**
+ * Compute latency statistics from array of durations.
+ *
+ * @param durations - Array of durations in milliseconds
+ * @returns Latency statistics
+ */
+const computeLatencyStats = (durations: number[]): LatencyStats => {
+  if (durations.length === 0) {
+    return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
+  }
+
+  const sorted = [...durations].sort((a, b) => a - b)
+  const sum = sorted.reduce((a, b) => a + b, 0)
+
+  return {
+    p50: percentile(sorted, 0.5),
+    p90: percentile(sorted, 0.9),
+    p99: percentile(sorted, 0.99),
+    mean: sum / sorted.length,
+    min: sorted[0] ?? 0,
+    max: sorted[sorted.length - 1] ?? 0,
+  }
+}
+
+/**
+ * Compute score distribution histogram.
+ *
+ * @param scores - Array of scores (0-1)
+ * @returns Score distribution histogram
+ */
+const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
+  const dist: ScoreDistribution = {
+    '0.0-0.2': 0,
+    '0.2-0.4': 0,
+    '0.4-0.6': 0,
+    '0.6-0.8': 0,
+    '0.8-1.0': 0,
+  }
+
+  for (const score of scores) {
+    if (score < 0.2) dist['0.0-0.2']++
+    else if (score < 0.4) dist['0.2-0.4']++
+    else if (score < 0.6) dist['0.4-0.6']++
+    else if (score < 0.8) dist['0.6-0.8']++
+    else dist['0.8-1.0']++
+  }
+
+  return dist
+}
+
+/**
+ * Detect trajectory richness from capture results.
+ *
+ * @param results - Array of capture results
+ * @returns Most common trajectory richness level
+ */
+const detectTrajectoryRichness = (results: CaptureResult[]): TrajectoryRichness => {
+  // Check metadata first
+  for (const r of results) {
+    const richness = r.metadata?.trajectoryRichness
+    if (richness === 'full' || richness === 'minimal' || richness === 'messages-only') {
+      return richness as TrajectoryRichness
+    }
+  }
+
+  // Infer from trajectory content
+  for (const r of results) {
+    const hasThought = r.trajectory.some((s) => s.type === 'thought')
+    const hasToolCall = r.trajectory.some((s) => s.type === 'tool_call')
+    if (hasThought || hasToolCall) return 'full'
+  }
+
+  // Check if we have any trajectory at all
+  const hasTrajectory = results.some((r) => r.trajectory.length > 0)
+  return hasTrajectory ? 'messages-only' : 'minimal'
+}
+
+/**
+ * Execute pipeline compare and generate aggregate report.
+ *
+ * @param config - Extended compare configuration
+ * @returns Comparison report
+ */
+export const runCompare = async (config: ExtendedCompareConfig): Promise<ComparisonReport> => {
+  const { runs, strategy = 'weighted', graderPath, outputPath, progress = false, format = 'json' } = config
 
   if (runs.length < 2) {
     throw new Error('At least 2 runs required for comparison')
   }
 
-  // Load comparison grader
-  const grader = await loadComparisonGrader(graderPath)
+  // Get grader based on strategy
+  const grader = await getGrader(strategy, graderPath)
 
-  logProgress(`Comparing ${runs.length} runs with: ${graderPath}`, progress)
+  const strategyLabel = strategy === 'custom' ? `custom: ${graderPath}` : strategy
+  logProgress(`Comparing ${runs.length} runs with strategy: ${strategyLabel}`, progress)
   for (const run of runs) {
     logProgress(`  - ${run.label}: ${run.path}`, progress)
   }
 
-  // Load all runs
-  const runResults: Record<string, CaptureResult[]> = {}
+  // Load all runs using indexed streaming (memory-efficient for large files)
+  // Uses Map<id, result> instead of arrays for O(1) lookups
+  const runResults: Record<string, Map<string, CaptureResult>> = {}
   for (const run of runs) {
     logProgress(`Loading ${run.label}...`, progress)
-    runResults[run.label] = await loadResults(run.path)
+    runResults[run.label] = await buildResultsIndex(run.path)
   }
 
-  // Build map of prompt IDs to runs
+  // Build set of all prompt IDs across runs
   const promptIds = new Set<string>()
-  for (const results of Object.values(runResults)) {
-    for (const result of results) {
-      promptIds.add(result.id)
+  for (const resultsMap of Object.values(runResults)) {
+    for (const id of resultsMap.keys()) {
+      promptIds.add(id)
     }
   }
 
   logProgress(`Comparing ${promptIds.size} prompts...`, progress)
 
-  let isFirstOutput = true
-
-  // Clear output file if specified
-  if (outputPath) {
-    await Bun.write(outputPath, '')
-  }
-
-  const results: ComparisonResult[] = []
+  // Per-prompt comparison results
+  const perPromptResults: ComparisonResult[] = []
+  const promptComparisons: PromptComparison[] = []
 
   for (const promptId of promptIds) {
     logProgress(`  ${promptId}`, progress)
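The new `percentile` helper is nearest-rank style: `floor(length * p)`, clamped to the last index. A self-contained worked example of the helpers added above:

```ts
// Copy of the percentile helper from this diff, checked against sample data.
const percentile = (sorted: number[], p: number): number => {
  if (sorted.length === 0) return 0
  const idx = Math.floor(sorted.length * p)
  return sorted[Math.min(idx, sorted.length - 1)] ?? 0
}

const durations = [100, 200, 300, 400, 500] // ms, already sorted
console.log(percentile(durations, 0.5))  // 300 (floor(5 * 0.5)  = index 2)
console.log(percentile(durations, 0.9))  // 500 (floor(5 * 0.9)  = index 4)
console.log(percentile(durations, 0.99)) // 500 (floor(5 * 0.99) = index 4)
// computeLatencyStats(durations) would therefore yield:
// { p50: 300, p90: 500, p99: 500, mean: 300, min: 100, max: 500 }
```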
@@ -145,18 +335,24 @@ export const runCompare = async (config: CompareConfig): Promise<void> => {
     const runsData: ComparisonGraderInput['runs'] = {}
     let input: string | string[] = ''
     let hint: string | undefined
+    let metadata: Record<string, unknown> | undefined
 
-    for (const [label, labelResults] of Object.entries(runResults)) {
-      const result = labelResults.find((r) => r.id === promptId)
+    for (const [label, resultsMap] of Object.entries(runResults)) {
+      const result = resultsMap.get(promptId)
       if (result) {
         runsData[label] = {
           output: result.output,
           trajectory: result.trajectory,
+          // Include additional fields for graders that need them
+          ...(result.score && { score: result.score }),
+          ...(result.timing && { duration: result.timing.total }),
+          ...(result.toolErrors !== undefined && { toolErrors: result.toolErrors }),
         }
-        // Use first found input/hint as the reference
+        // Use first found input/hint/metadata as the reference
        if (!input) {
           input = result.input
           hint = result.hint
+          metadata = result.metadata
         }
       }
     }
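With the spreads above, a comparison grader now receives per-run entries shaped roughly as follows (field names taken from this diff; the values are illustrative, not real data):

```ts
// Illustrative shape of a runsData[label] entry handed to a grader.
const exampleRunEntry = {
  output: '…final agent answer…',
  trajectory: [{ type: 'tool_call' }, { type: 'thought' }],
  score: { score: 0.9, pass: true }, // only when the capture result was scored
  duration: 1234,                    // result.timing.total, in milliseconds
  toolErrors: false,                 // only when defined on the result
}
```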
@@ -172,6 +368,7 @@ export const runCompare = async (config: CompareConfig): Promise<void> => {
       id: promptId,
       input,
       hint,
+      metadata,
       runs: runsData,
     }
 
@@ -185,16 +382,164 @@ export const runCompare = async (config: CompareConfig): Promise<void> => {
       reasoning: graderResult.reasoning,
     }
 
-    results.push(comparisonResult)
+    perPromptResults.push(comparisonResult)
 
-    // Log winner
+    // Build prompt comparison for head-to-head
     const winner = graderResult.rankings.find((r) => r.rank === 1)
+    const scores: Record<string, number> = {}
+    const latencies: Record<string, number> = {}
+    const hadErrors: Record<string, boolean> = {}
+
+    for (const ranking of graderResult.rankings) {
+      scores[ranking.run] = ranking.score
+    }
+
+    for (const [label, data] of Object.entries(runsData)) {
+      latencies[label] = data.duration ?? 0
+      hadErrors[label] = data.toolErrors ?? false
+    }
+
+    promptComparisons.push({
+      id: promptId,
+      winner: winner?.run ?? null,
+      scores,
+      latencies,
+      hadErrors,
+    })
+
+    // Log winner
     if (winner) {
       logProgress(`  Winner: ${winner.run} (${winner.score.toFixed(2)})`, progress)
     }
+  }
+
+  // Compute aggregate metrics
+  const runLabels = runs.map((r) => r.label)
+
+  // Quality metrics (iterate over Map values)
+  const quality: Record<string, QualityMetrics> = {}
+  for (const label of runLabels) {
+    const resultsMap = runResults[label] ?? new Map()
+    const results = [...resultsMap.values()]
+    const scores = results.map((r) => r.score?.score ?? 0)
+    const passes = results.filter((r) => r.score?.pass === true).length
+    const fails = results.length - passes
+
+    quality[label] = {
+      avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
+      passRate: results.length > 0 ? passes / results.length : 0,
+      passCount: passes,
+      failCount: fails,
+      scoreDistribution: computeScoreDistribution(scores),
+    }
+  }
+
+  // Performance metrics
+  const performance: Record<string, PerformanceMetrics> = {}
+  for (const label of runLabels) {
+    const resultsMap = runResults[label] ?? new Map()
+    const results = [...resultsMap.values()]
+    const durations = results.map((r) => r.timing?.total ?? 0)
+    const firstResponses = results.map((r) => r.timing?.firstResponse).filter((v): v is number => v !== undefined)
+
+    performance[label] = {
+      latency: computeLatencyStats(durations),
+      firstResponse: firstResponses.length > 0 ? computeLatencyStats(firstResponses) : undefined,
+      totalDuration: durations.reduce((a, b) => a + b, 0),
+    }
+  }
+
+  // Reliability metrics
+  const reliability: Record<string, ReliabilityMetrics> = {}
+  for (const label of runLabels) {
+    const resultsMap = runResults[label] ?? new Map()
+    const results = [...resultsMap.values()]
+    const toolErrorCount = results.filter((r) => r.toolErrors === true).length
+    const timeoutCount = results.filter((r) =>
+      r.errors?.some((e: string) => e.toLowerCase().includes('timeout')),
+    ).length
+    const completedCount = results.filter((r) => r.output && !r.errors?.length).length
+
+    reliability[label] = {
+      toolErrors: toolErrorCount,
+      toolErrorRate: results.length > 0 ? toolErrorCount / results.length : 0,
+      timeouts: timeoutCount,
+      timeoutRate: results.length > 0 ? timeoutCount / results.length : 0,
+      completionRate: results.length > 0 ? completedCount / results.length : 1,
+    }
+  }
+
+  // Trajectory info
+  const trajectoryInfo: Record<string, TrajectoryInfo> = {}
+  for (const label of runLabels) {
+    const resultsMap = runResults[label] ?? new Map()
+    const results = [...resultsMap.values()]
+    const stepCounts = results.map((r) => r.trajectory?.length ?? 0)
+    const avgStepCount = stepCounts.length > 0 ? stepCounts.reduce((a, b) => a + b, 0) / stepCounts.length : 0
+
+    trajectoryInfo[label] = {
+      richness: detectTrajectoryRichness(results),
+      avgStepCount,
+    }
+  }
+
+  // Pairwise comparisons
+  const pairwise: PairwiseComparison[] = []
+  for (let i = 0; i < runLabels.length; i++) {
+    for (let j = i + 1; j < runLabels.length; j++) {
+      const runA = runLabels[i]
+      const runB = runLabels[j]
+
+      // Skip if labels are undefined (shouldn't happen but TypeScript requires check)
+      if (!runA || !runB) continue
+
+      let aWins = 0
+      let bWins = 0
+      let ties = 0
+
+      for (const pc of promptComparisons) {
+        if (pc.winner === runA) aWins++
+        else if (pc.winner === runB) bWins++
+        else ties++
+      }
+
+      pairwise.push({ runA, runB, aWins, bWins, ties })
+    }
+  }
+
+  // Head-to-head
+  const headToHead: HeadToHead = {
+    prompts: promptComparisons,
+    pairwise,
+  }
+
+  // Count prompts where all runs are present
+  const promptsWithAllRuns = promptComparisons.filter((pc) => Object.keys(pc.scores).length === runLabels.length).length
+
+  // Build meta
+  const meta: ComparisonMeta = {
+    generatedAt: new Date().toISOString(),
+    runs: runLabels,
+    promptCount: promptIds.size,
+    promptsWithAllRuns,
+  }
+
+  // Assemble report
+  const report: ComparisonReport = {
+    meta,
+    quality,
+    performance,
+    reliability,
+    trajectoryInfo,
+    headToHead,
+  }
 
-  await writeOutput(JSON.stringify(comparisonResult), outputPath, !isFirstOutput)
-  isFirstOutput = false
+  // Output
+  if (format === 'markdown') {
+    const markdown = formatReportAsMarkdown(report)
+    await writeOutput(markdown, outputPath, false)
+  } else {
+    await writeOutput(JSON.stringify(report, null, 2), outputPath, false)
   }
 
   // Summary statistics
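Assembled, the report emitted here looks roughly like this trimmed, illustrative instance (the authoritative shape is the `ComparisonReport` type in `../schemas.ts`, which this diff does not show):

```ts
// Illustrative values only; field names follow the code above.
const exampleReport = {
  meta: { generatedAt: '2025-01-01T00:00:00.000Z', runs: ['vanilla', 'with-mcp'], promptCount: 20, promptsWithAllRuns: 20 },
  quality: { vanilla: { avgScore: 0.61, passRate: 0.6, passCount: 12, failCount: 8, scoreDistribution: { '0.0-0.2': 1, '0.2-0.4': 2, '0.4-0.6': 5, '0.6-0.8': 7, '0.8-1.0': 5 } } },
  performance: { vanilla: { latency: { p50: 900, p90: 2100, p99: 3800, mean: 1150, min: 310, max: 4020 }, totalDuration: 23000 } },
  reliability: { vanilla: { toolErrors: 1, toolErrorRate: 0.05, timeouts: 0, timeoutRate: 0, completionRate: 0.95 } },
  trajectoryInfo: { vanilla: { richness: 'full', avgStepCount: 7.5 } },
  headToHead: {
    prompts: [{ id: 'p-001', winner: 'with-mcp', scores: { vanilla: 0.5, 'with-mcp': 0.8 }, latencies: { vanilla: 900, 'with-mcp': 1200 }, hadErrors: { vanilla: false, 'with-mcp': false } }],
    pairwise: [{ runA: 'vanilla', runB: 'with-mcp', aWins: 7, bWins: 11, ties: 2 }],
  },
}
```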
@@ -202,24 +547,90 @@ export const runCompare = async (config: CompareConfig): Promise<void> => {
   logProgress('=== Summary ===', progress)
 
   const winCounts: Record<string, number> = {}
-  for (const run of runs) {
-    winCounts[run.label] = 0
+  for (const label of runLabels) {
+    winCounts[label] = 0
   }
 
-  for (const result of results) {
-    const winner = result.rankings.find((r) => r.rank === 1)
-    if (winner && winner.run in winCounts) {
-      const currentCount = winCounts[winner.run] ?? 0
-      winCounts[winner.run] = currentCount + 1
+  for (const pc of promptComparisons) {
+    if (pc.winner && pc.winner in winCounts) {
+      const current = winCounts[pc.winner] ?? 0
+      winCounts[pc.winner] = current + 1
     }
   }
 
   for (const [label, wins] of Object.entries(winCounts)) {
-    const pct = ((wins / results.length) * 100).toFixed(1)
+    const pct = promptComparisons.length > 0 ? ((wins / promptComparisons.length) * 100).toFixed(1) : '0.0'
     logProgress(`  ${label}: ${wins} wins (${pct}%)`, progress)
   }
 
   logProgress('Done!', progress)
+
+  return report
+}
+
+/**
+ * Format comparison report as markdown.
+ *
+ * @param report - Comparison report
+ * @returns Markdown string
+ */
+const formatReportAsMarkdown = (report: ComparisonReport): string => {
+  const lines: string[] = []
+
+  lines.push('# Comparison Report')
+  lines.push('')
+  lines.push(`Generated: ${report.meta.generatedAt}`)
+  lines.push(`Runs: ${report.meta.runs.join(', ')}`)
+  lines.push(`Prompts: ${report.meta.promptCount} total, ${report.meta.promptsWithAllRuns} with all runs`)
+  lines.push('')
+
+  // Quality table
+  lines.push('## Quality')
+  lines.push('')
+  lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |')
+  lines.push('|-----|-----------|-----------|------|------|')
+  for (const [label, q] of Object.entries(report.quality)) {
+    lines.push(
+      `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`,
+    )
+  }
+  lines.push('')
+
+  // Performance table
+  lines.push('## Performance')
+  lines.push('')
+  lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |')
+  lines.push('|-----|----------|----------|----------|-----------|')
+  for (const [label, p] of Object.entries(report.performance)) {
+    lines.push(
+      `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`,
+    )
+  }
+  lines.push('')
+
+  // Reliability table
+  lines.push('## Reliability')
+  lines.push('')
+  lines.push('| Run | Tool Errors | Error Rate | Completion Rate |')
+  lines.push('|-----|-------------|------------|-----------------|')
+  for (const [label, r] of Object.entries(report.reliability)) {
+    lines.push(
+      `| ${label} | ${r.toolErrors} | ${(r.toolErrorRate * 100).toFixed(1)}% | ${(r.completionRate * 100).toFixed(1)}% |`,
+    )
+  }
+  lines.push('')
+
+  // Pairwise wins
+  lines.push('## Head-to-Head')
+  lines.push('')
+  lines.push('| Matchup | A Wins | B Wins | Ties |')
+  lines.push('|---------|--------|--------|------|')
+  for (const p of report.headToHead.pairwise) {
+    lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
+  }
+  lines.push('')
+
+  return lines.join('\n')
 }
 
 /**
@@ -233,7 +644,9 @@ export const compare = async (args: string[]): Promise<void> => {
     options: {
       run: { type: 'string', multiple: true },
       grader: { type: 'string', short: 'g' },
+      strategy: { type: 'string', short: 's' },
       output: { type: 'string', short: 'o' },
+      format: { type: 'string', short: 'f' },
       progress: { type: 'boolean', default: false },
       help: { type: 'boolean', short: 'h' },
     },
@@ -241,60 +654,60 @@ export const compare = async (args: string[]): Promise<void> => {
   })
 
   if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
     console.log(`
-Usage: agent-eval-harness compare [files...] --grader <grader> [options]
+Usage: agent-eval-harness compare [files...] [options]
 
-Compare multiple runs of the same prompts.
+Compare multiple runs of the same prompts and generate aggregate report.
 
 Arguments:
   files...         Result files to compare (positional, unlimited)
 
 Options:
   --run            Labeled run format: "label:path.jsonl" (alternative to positional)
-  -g, --grader     Path to comparison grader (.ts/.js module) (required)
+  -s, --strategy   Comparison strategy: weighted (default), statistical, or custom
+  -g, --grader     Path to custom grader (required if strategy=custom)
   -o, --output     Output file (default: stdout)
+  -f, --format     Output format: json (default) or markdown
   --progress       Show progress to stderr
   -h, --help       Show this help message
 
-Comparison Grader:
+Built-in Strategies:
+  weighted     Configurable weights for quality, latency, reliability
+               Customize via: COMPARE_QUALITY, COMPARE_LATENCY, COMPARE_RELIABILITY
+  statistical  Bootstrap sampling for confidence intervals
+               Customize via: COMPARE_BOOTSTRAP_ITERATIONS
+
+Custom Grader:
   Must export 'grade' or 'compare' function with signature:
   (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>
 
-  Input includes all runs' results for a single prompt.
-  Output should rank runs from best to worst.
-
 Examples:
-  # Compare multiple result files (positional)
-  agent-eval-harness compare run1.jsonl run2.jsonl run3.jsonl -g ./compare-grader.ts
+  # Default: weighted strategy with JSON output
+  agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
+
+  # Statistical significance strategy
+  agent-eval-harness compare run1.jsonl run2.jsonl --strategy statistical -o comparison.json
+
+  # Custom weights
+  COMPARE_QUALITY=0.7 COMPARE_LATENCY=0.2 COMPARE_RELIABILITY=0.1 \\
+    agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
+
+  # Markdown report
+  agent-eval-harness compare run1.jsonl run2.jsonl --format markdown -o report.md
+
+  # Custom grader
+  agent-eval-harness compare run1.jsonl run2.jsonl \\
+    --strategy custom --grader ./my-llm-judge.ts -o comparison.json
 
   # With explicit labels
   agent-eval-harness compare \\
     --run "with-bun-mcp:results-bun.jsonl" \\
     --run "vanilla:results-vanilla.jsonl" \\
-    -g ./compare-grader.ts
-
-  # Mix positional and labeled
-  agent-eval-harness compare results-*.jsonl \\
-    --run "baseline:baseline.jsonl" \\
-    -g ./compare-grader.ts -o comparison.jsonl
-
-  # Typical workflow
-  # 1. Capture with different configs
-  agent-eval-harness capture prompts.jsonl -s claude.json -o vanilla.jsonl
-  agent-eval-harness capture prompts.jsonl -s claude-with-mcp.json -o with-mcp.jsonl
-
-  # 2. Compare results
-  agent-eval-harness compare vanilla.jsonl with-mcp.jsonl -g ./compare-grader.ts
+    -o comparison.json
 `)
     return
   }
 
-  if (!values.grader) {
-    console.error('Error: --grader is required')
-    process.exit(1)
-  }
-
   // Collect runs from positional args and --run flags
   const runs: LabeledRun[] = []
 
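A custom grader matching the documented signature might look like the sketch below. The type-import specifier is an assumption (inside the repo these types live in `./pipeline.types.ts`), and the brevity heuristic is a stand-in for a real LLM judge:

```ts
// my-llm-judge.ts: hypothetical custom comparison grader.
import type { ComparisonGraderInput, ComparisonGraderResult } from '@plaited/agent-eval-harness'

export const grade = async (params: ComparisonGraderInput): Promise<ComparisonGraderResult> => {
  // Toy heuristic: prefer shorter outputs. Swap in real judging logic here.
  const rankings = Object.entries(params.runs)
    .map(([run, data]) => ({ run, score: 1 / (1 + String(data.output).length / 500) }))
    .sort((a, b) => b.score - a.score)
    .map((entry, i) => ({ ...entry, rank: i + 1 })) // rank 1 = winner
  return { rankings, reasoning: 'Ranked by output brevity (demo heuristic only)' }
}
```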
@@ -312,14 +725,43 @@ Examples:
 
   if (runs.length < 2) {
     console.error('Error: At least 2 result files required for comparison')
-    console.error('Example: agent-eval-harness compare run1.jsonl run2.jsonl -g ./grader.ts')
+    console.error('Example: agent-eval-harness compare run1.jsonl run2.jsonl')
+    process.exit(1)
+  }
+
+  // Validate that all run files exist (early error for better UX)
+  try {
+    await validateRunFiles(runs)
+  } catch (error) {
+    console.error(`Error: ${error instanceof Error ? error.message : error}`)
+    process.exit(1)
+  }
+
+  // Validate strategy
+  const strategy = (values.strategy as CompareStrategy) ?? 'weighted'
+  if (!['weighted', 'statistical', 'custom'].includes(strategy)) {
+    console.error(`Error: Invalid strategy '${strategy}'. Use: weighted, statistical, or custom`)
+    process.exit(1)
+  }
+
+  if (strategy === 'custom' && !values.grader) {
+    console.error('Error: --grader is required when using --strategy custom')
+    process.exit(1)
+  }
+
+  // Validate format (explicit format takes precedence, otherwise infer from extension)
+  const format = inferFormat(values.output, values.format)
+  if (values.format && !['json', 'markdown'].includes(values.format)) {
+    console.error(`Error: Invalid format '${values.format}'. Use: json or markdown`)
     process.exit(1)
   }
 
   await runCompare({
     runs,
+    strategy,
     graderPath: values.grader,
     outputPath: values.output,
     progress: values.progress,
+    format,
   })
 }
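Because `runCompare` now returns the report and writes pretty-printed JSON rather than JSONL, downstream tooling can consume the output file directly. A small Bun consumer sketch, assuming the `comparison.json` path used in the examples above:

```ts
// Print one quality summary line per run from the emitted report.
type QualitySummary = { avgScore: number; passRate: number }
const report = (await Bun.file('comparison.json').json()) as {
  quality: Record<string, QualitySummary>
}
for (const [label, q] of Object.entries(report.quality)) {
  console.log(`${label}: avg=${q.avgScore.toFixed(3)}, pass=${(q.passRate * 100).toFixed(1)}%`)
}
```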