@plaited/agent-eval-harness 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -5
- package/bin/cli.ts +0 -2
- package/package.json +1 -1
- package/src/commands/balance.ts +0 -2
- package/src/commands/calibrate.ts +1 -2
- package/src/commands/capture.ts +1 -1
- package/src/commands/summarize.ts +1 -3
- package/src/commands/trials.ts +1 -1
- package/src/commands/validate-refs.ts +1 -2
- package/src/core/core.ts +1 -1
- package/src/core/loading.ts +77 -0
- package/src/core/output.ts +0 -1
- package/src/core.ts +4 -1
- package/src/graders/compare-statistical.ts +187 -0
- package/src/graders/compare-weighted.ts +112 -0
- package/src/graders/tests/compare-graders.spec.ts +293 -0
- package/src/graders.ts +19 -0
- package/src/headless/headless-cli.ts +0 -2
- package/src/headless/headless-session-manager.ts +4 -1
- package/src/pipeline/compare.ts +512 -70
- package/src/pipeline/extract.ts +1 -1
- package/src/pipeline/format.ts +0 -1
- package/src/pipeline/grade.ts +1 -1
- package/src/pipeline/pipeline.ts +2 -1
- package/src/pipeline/pipeline.types.ts +29 -1
- package/src/pipeline/run.ts +5 -3
- package/src/schemas/grader-loader.ts +9 -1
- package/src/schemas/schemas-cli.ts +0 -7
- package/src/schemas/schemas.ts +211 -0
- package/src/schemas.ts +23 -0
package/src/pipeline/compare.ts
CHANGED
@@ -3,7 +3,10 @@
  *
  * @remarks
  * Compares results from different configurations (agents, MCP servers, models)
- * using a user-provided comparison grader
+ * using either built-in strategies or a user-provided comparison grader.
+ *
+ * Outputs a holistic ComparisonReport JSON (not JSONL) containing aggregate
+ * statistics across quality, performance, reliability, and head-to-head metrics.
  *
  * Terminology: "runs" (not "agents") because comparisons can be:
  * - Same agent, different MCP servers
@@ -12,13 +15,33 @@
  * - Same agent, different model versions
  * - Different agents entirely
  *
+ * Built-in strategies:
+ * - `weighted`: Configurable weights for quality, latency, reliability (default)
+ * - `statistical`: Bootstrap sampling for confidence intervals
+ *
  * @packageDocumentation
  */
 
 import { basename, extname } from 'node:path'
 import { parseArgs } from 'node:util'
-import {
-import
+import { buildResultsIndex, logProgress, writeOutput } from '../core.ts'
+import { grade as statisticalGrade } from '../graders/compare-statistical.ts'
+import { grade as weightedGrade } from '../graders/compare-weighted.ts'
+import type {
+  CaptureResult,
+  ComparisonMeta,
+  ComparisonReport,
+  HeadToHead,
+  LatencyStats,
+  PairwiseComparison,
+  PerformanceMetrics,
+  PromptComparison,
+  QualityMetrics,
+  ReliabilityMetrics,
+  ScoreDistribution,
+  TrajectoryInfo,
+  TrajectoryRichness,
+} from '../schemas.ts'
 import type {
   CompareConfig,
   ComparisonGrader,
@@ -27,6 +50,19 @@ import type {
   LabeledRun,
 } from './pipeline.types.ts'
 
+/** Comparison strategy type */
+export type CompareStrategy = 'weighted' | 'statistical' | 'custom'
+
+/** Extended compare config with strategy support */
+export type ExtendedCompareConfig = Omit<CompareConfig, 'graderPath'> & {
+  /** Comparison strategy (default: weighted) */
+  strategy?: CompareStrategy
+  /** Path to custom grader (required if strategy is 'custom') */
+  graderPath?: string
+  /** Output format (default: json) */
+  format?: 'json' | 'markdown'
+}
+
 /**
  * Load comparison grader from file.
  *
@@ -93,50 +129,204 @@ const parseLabeledRun = (arg: string): LabeledRun => {
 }
 
 /**
- *
+ * Validate that all run files exist.
  *
- * @param
+ * @param runs - Labeled runs to validate
+ * @throws Error if any file doesn't exist
  */
-
-const
+const validateRunFiles = async (runs: LabeledRun[]): Promise<void> => {
+  const missing: string[] = []
+
+  for (const run of runs) {
+    const exists = await Bun.file(run.path).exists()
+    if (!exists) {
+      missing.push(`${run.label}: ${run.path}`)
+    }
+  }
+
+  if (missing.length > 0) {
+    throw new Error(`Result file(s) not found:\n  ${missing.join('\n  ')}`)
+  }
+}
+
+/**
+ * Infer output format from file extension.
+ *
+ * @param outputPath - Output file path
+ * @param explicitFormat - Explicitly provided format (takes precedence)
+ * @returns Inferred format
+ */
+const inferFormat = (outputPath: string | undefined, explicitFormat: string | undefined): 'json' | 'markdown' => {
+  // Explicit format takes precedence
+  if (explicitFormat === 'json' || explicitFormat === 'markdown') {
+    return explicitFormat
+  }
+
+  // Infer from file extension
+  if (outputPath) {
+    const ext = extname(outputPath).toLowerCase()
+    if (ext === '.md' || ext === '.markdown') {
+      return 'markdown'
+    }
+  }
+
+  return 'json'
+}
+
+/**
+ * Get grader function based on strategy.
+ *
+ * @param strategy - Comparison strategy
+ * @param graderPath - Path to custom grader (for 'custom' strategy)
+ * @returns Comparison grader function
+ */
+const getGrader = async (strategy: CompareStrategy, graderPath?: string): Promise<ComparisonGrader> => {
+  switch (strategy) {
+    case 'weighted':
+      return weightedGrade
+    case 'statistical':
+      return statisticalGrade
+    case 'custom':
+      if (!graderPath) {
+        throw new Error('Custom strategy requires --grader path')
+      }
+      return loadComparisonGrader(graderPath)
+  }
+}
+
+/**
+ * Compute percentile from sorted array.
+ *
+ * @param sorted - Sorted array of numbers
+ * @param p - Percentile (0-1)
+ * @returns Value at percentile
+ */
+const percentile = (sorted: number[], p: number): number => {
+  if (sorted.length === 0) return 0
+  const idx = Math.floor(sorted.length * p)
+  return sorted[Math.min(idx, sorted.length - 1)] ?? 0
+}
+
+/**
+ * Compute latency statistics from array of durations.
+ *
+ * @param durations - Array of durations in milliseconds
+ * @returns Latency statistics
+ */
+const computeLatencyStats = (durations: number[]): LatencyStats => {
+  if (durations.length === 0) {
+    return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
+  }
+
+  const sorted = [...durations].sort((a, b) => a - b)
+  const sum = sorted.reduce((a, b) => a + b, 0)
+
+  return {
+    p50: percentile(sorted, 0.5),
+    p90: percentile(sorted, 0.9),
+    p99: percentile(sorted, 0.99),
+    mean: sum / sorted.length,
+    min: sorted[0] ?? 0,
+    max: sorted[sorted.length - 1] ?? 0,
+  }
+}
+
+/**
+ * Compute score distribution histogram.
+ *
+ * @param scores - Array of scores (0-1)
+ * @returns Score distribution histogram
+ */
+const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
+  const dist: ScoreDistribution = {
+    '0.0-0.2': 0,
+    '0.2-0.4': 0,
+    '0.4-0.6': 0,
+    '0.6-0.8': 0,
+    '0.8-1.0': 0,
+  }
+
+  for (const score of scores) {
+    if (score < 0.2) dist['0.0-0.2']++
+    else if (score < 0.4) dist['0.2-0.4']++
+    else if (score < 0.6) dist['0.4-0.6']++
+    else if (score < 0.8) dist['0.6-0.8']++
+    else dist['0.8-1.0']++
+  }
+
+  return dist
+}
+
+/**
+ * Detect trajectory richness from capture results.
+ *
+ * @param results - Array of capture results
+ * @returns Most common trajectory richness level
+ */
+const detectTrajectoryRichness = (results: CaptureResult[]): TrajectoryRichness => {
+  // Check metadata first
+  for (const r of results) {
+    const richness = r.metadata?.trajectoryRichness
+    if (richness === 'full' || richness === 'minimal' || richness === 'messages-only') {
+      return richness as TrajectoryRichness
+    }
+  }
+
+  // Infer from trajectory content
+  for (const r of results) {
+    const hasThought = r.trajectory.some((s) => s.type === 'thought')
+    const hasToolCall = r.trajectory.some((s) => s.type === 'tool_call')
+    if (hasThought || hasToolCall) return 'full'
+  }
+
+  // Check if we have any trajectory at all
+  const hasTrajectory = results.some((r) => r.trajectory.length > 0)
+  return hasTrajectory ? 'messages-only' : 'minimal'
+}
+
+/**
+ * Execute pipeline compare and generate aggregate report.
+ *
+ * @param config - Extended compare configuration
+ * @returns Comparison report
+ */
+export const runCompare = async (config: ExtendedCompareConfig): Promise<ComparisonReport> => {
+  const { runs, strategy = 'weighted', graderPath, outputPath, progress = false, format = 'json' } = config
 
   if (runs.length < 2) {
     throw new Error('At least 2 runs required for comparison')
   }
 
-  //
-  const grader = await
+  // Get grader based on strategy
+  const grader = await getGrader(strategy, graderPath)
 
-
+  const strategyLabel = strategy === 'custom' ? `custom: ${graderPath}` : strategy
+  logProgress(`Comparing ${runs.length} runs with strategy: ${strategyLabel}`, progress)
   for (const run of runs) {
     logProgress(`  - ${run.label}: ${run.path}`, progress)
   }
 
-  // Load all runs
-
+  // Load all runs using indexed streaming (memory-efficient for large files)
+  // Uses Map<id, result> instead of arrays for O(1) lookups
+  const runResults: Record<string, Map<string, CaptureResult>> = {}
   for (const run of runs) {
     logProgress(`Loading ${run.label}...`, progress)
-    runResults[run.label] = await
+    runResults[run.label] = await buildResultsIndex(run.path)
   }
 
-  // Build
+  // Build set of all prompt IDs across runs
   const promptIds = new Set<string>()
-  for (const
-    for (const
-      promptIds.add(
+  for (const resultsMap of Object.values(runResults)) {
+    for (const id of resultsMap.keys()) {
+      promptIds.add(id)
     }
   }
 
   logProgress(`Comparing ${promptIds.size} prompts...`, progress)
 
-
-
-
-  if (outputPath) {
-    await Bun.write(outputPath, '')
-  }
-
-  const results: ComparisonResult[] = []
+  // Per-prompt comparison results
+  const perPromptResults: ComparisonResult[] = []
+  const promptComparisons: PromptComparison[] = []
 
   for (const promptId of promptIds) {
     logProgress(`  ${promptId}`, progress)
@@ -145,18 +335,24 @@ export const runCompare = async (config: CompareConfig): Promise<void> => {
     const runsData: ComparisonGraderInput['runs'] = {}
     let input: string | string[] = ''
     let hint: string | undefined
+    let metadata: Record<string, unknown> | undefined
 
-    for (const [label,
-      const result =
+    for (const [label, resultsMap] of Object.entries(runResults)) {
+      const result = resultsMap.get(promptId)
       if (result) {
         runsData[label] = {
          output: result.output,
          trajectory: result.trajectory,
+          // Include additional fields for graders that need them
+          ...(result.score && { score: result.score }),
+          ...(result.timing && { duration: result.timing.total }),
+          ...(result.toolErrors !== undefined && { toolErrors: result.toolErrors }),
        }
-        // Use first found input/hint as the reference
+        // Use first found input/hint/metadata as the reference
        if (!input) {
          input = result.input
          hint = result.hint
+          metadata = result.metadata
        }
      }
    }
@@ -172,6 +368,7 @@ export const runCompare = async (config: CompareConfig): Promise<void> => {
      id: promptId,
      input,
      hint,
+      metadata,
      runs: runsData,
    }
 
@@ -185,16 +382,164 @@ export const runCompare = async (config: CompareConfig): Promise<void> => {
      reasoning: graderResult.reasoning,
    }
 
-
+    perPromptResults.push(comparisonResult)
 
-    //
+    // Build prompt comparison for head-to-head
    const winner = graderResult.rankings.find((r) => r.rank === 1)
+    const scores: Record<string, number> = {}
+    const latencies: Record<string, number> = {}
+    const hadErrors: Record<string, boolean> = {}
+
+    for (const ranking of graderResult.rankings) {
+      scores[ranking.run] = ranking.score
+    }
+
+    for (const [label, data] of Object.entries(runsData)) {
+      latencies[label] = data.duration ?? 0
+      hadErrors[label] = data.toolErrors ?? false
+    }
+
+    promptComparisons.push({
+      id: promptId,
+      winner: winner?.run ?? null,
+      scores,
+      latencies,
+      hadErrors,
+    })
+
+    // Log winner
    if (winner) {
      logProgress(`    Winner: ${winner.run} (${winner.score.toFixed(2)})`, progress)
    }
+  }
+
+  // Compute aggregate metrics
+  const runLabels = runs.map((r) => r.label)
+
+  // Quality metrics (iterate over Map values)
+  const quality: Record<string, QualityMetrics> = {}
+  for (const label of runLabels) {
+    const resultsMap = runResults[label] ?? new Map()
+    const results = [...resultsMap.values()]
+    const scores = results.map((r) => r.score?.score ?? 0)
+    const passes = results.filter((r) => r.score?.pass === true).length
+    const fails = results.length - passes
+
+    quality[label] = {
+      avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
+      passRate: results.length > 0 ? passes / results.length : 0,
+      passCount: passes,
+      failCount: fails,
+      scoreDistribution: computeScoreDistribution(scores),
+    }
+  }
+
+  // Performance metrics
+  const performance: Record<string, PerformanceMetrics> = {}
+  for (const label of runLabels) {
+    const resultsMap = runResults[label] ?? new Map()
+    const results = [...resultsMap.values()]
+    const durations = results.map((r) => r.timing?.total ?? 0)
+    const firstResponses = results.map((r) => r.timing?.firstResponse).filter((v): v is number => v !== undefined)
+
+    performance[label] = {
+      latency: computeLatencyStats(durations),
+      firstResponse: firstResponses.length > 0 ? computeLatencyStats(firstResponses) : undefined,
+      totalDuration: durations.reduce((a, b) => a + b, 0),
+    }
+  }
+
+  // Reliability metrics
+  const reliability: Record<string, ReliabilityMetrics> = {}
+  for (const label of runLabels) {
+    const resultsMap = runResults[label] ?? new Map()
+    const results = [...resultsMap.values()]
+    const toolErrorCount = results.filter((r) => r.toolErrors === true).length
+    const timeoutCount = results.filter((r) =>
+      r.errors?.some((e: string) => e.toLowerCase().includes('timeout')),
+    ).length
+    const completedCount = results.filter((r) => r.output && !r.errors?.length).length
+
+    reliability[label] = {
+      toolErrors: toolErrorCount,
+      toolErrorRate: results.length > 0 ? toolErrorCount / results.length : 0,
+      timeouts: timeoutCount,
+      timeoutRate: results.length > 0 ? timeoutCount / results.length : 0,
+      completionRate: results.length > 0 ? completedCount / results.length : 1,
+    }
+  }
+
+  // Trajectory info
+  const trajectoryInfo: Record<string, TrajectoryInfo> = {}
+  for (const label of runLabels) {
+    const resultsMap = runResults[label] ?? new Map()
+    const results = [...resultsMap.values()]
+    const stepCounts = results.map((r) => r.trajectory?.length ?? 0)
+    const avgStepCount = stepCounts.length > 0 ? stepCounts.reduce((a, b) => a + b, 0) / stepCounts.length : 0
+
+    trajectoryInfo[label] = {
+      richness: detectTrajectoryRichness(results),
+      avgStepCount,
+    }
+  }
+
+  // Pairwise comparisons
+  const pairwise: PairwiseComparison[] = []
+  for (let i = 0; i < runLabels.length; i++) {
+    for (let j = i + 1; j < runLabels.length; j++) {
+      const runA = runLabels[i]
+      const runB = runLabels[j]
+
+      // Skip if labels are undefined (shouldn't happen but TypeScript requires check)
+      if (!runA || !runB) continue
+
+      let aWins = 0
+      let bWins = 0
+      let ties = 0
+
+      for (const pc of promptComparisons) {
+        if (pc.winner === runA) aWins++
+        else if (pc.winner === runB) bWins++
+        else ties++
+      }
+
+      pairwise.push({ runA, runB, aWins, bWins, ties })
+    }
+  }
+
+  // Head-to-head
+  const headToHead: HeadToHead = {
+    prompts: promptComparisons,
+    pairwise,
+  }
+
+  // Count prompts where all runs are present
+  const promptsWithAllRuns = promptComparisons.filter((pc) => Object.keys(pc.scores).length === runLabels.length).length
+
+  // Build meta
+  const meta: ComparisonMeta = {
+    generatedAt: new Date().toISOString(),
+    runs: runLabels,
+    promptCount: promptIds.size,
+    promptsWithAllRuns,
+  }
+
+  // Assemble report
+  const report: ComparisonReport = {
+    meta,
+    quality,
+    performance,
+    reliability,
+    trajectoryInfo,
+    headToHead,
+  }
 
-
-
+  // Output
+  if (format === 'markdown') {
+    const markdown = formatReportAsMarkdown(report)
+    await writeOutput(markdown, outputPath, false)
+  } else {
+    await writeOutput(JSON.stringify(report, null, 2), outputPath, false)
  }
 
  // Summary statistics
@@ -202,24 +547,90 @@ export const runCompare = async (config: CompareConfig): Promise<void> => {
  logProgress('=== Summary ===', progress)
 
  const winCounts: Record<string, number> = {}
-  for (const
-    winCounts[
+  for (const label of runLabels) {
+    winCounts[label] = 0
  }
 
-  for (const
-
-
-
-      winCounts[winner.run] = currentCount + 1
+  for (const pc of promptComparisons) {
+    if (pc.winner && pc.winner in winCounts) {
+      const current = winCounts[pc.winner] ?? 0
+      winCounts[pc.winner] = current + 1
    }
  }
 
  for (const [label, wins] of Object.entries(winCounts)) {
-    const pct = ((wins /
+    const pct = promptComparisons.length > 0 ? ((wins / promptComparisons.length) * 100).toFixed(1) : '0.0'
    logProgress(`  ${label}: ${wins} wins (${pct}%)`, progress)
  }
 
  logProgress('Done!', progress)
+
+  return report
+}
+
+/**
+ * Format comparison report as markdown.
+ *
+ * @param report - Comparison report
+ * @returns Markdown string
+ */
+const formatReportAsMarkdown = (report: ComparisonReport): string => {
+  const lines: string[] = []
+
+  lines.push('# Comparison Report')
+  lines.push('')
+  lines.push(`Generated: ${report.meta.generatedAt}`)
+  lines.push(`Runs: ${report.meta.runs.join(', ')}`)
+  lines.push(`Prompts: ${report.meta.promptCount} total, ${report.meta.promptsWithAllRuns} with all runs`)
+  lines.push('')
+
+  // Quality table
+  lines.push('## Quality')
+  lines.push('')
+  lines.push('| Run | Avg Score | Pass Rate | Pass | Fail |')
+  lines.push('|-----|-----------|-----------|------|------|')
+  for (const [label, q] of Object.entries(report.quality)) {
+    lines.push(
+      `| ${label} | ${q.avgScore.toFixed(3)} | ${(q.passRate * 100).toFixed(1)}% | ${q.passCount} | ${q.failCount} |`,
+    )
+  }
+  lines.push('')
+
+  // Performance table
+  lines.push('## Performance')
+  lines.push('')
+  lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) |')
+  lines.push('|-----|----------|----------|----------|-----------|')
+  for (const [label, p] of Object.entries(report.performance)) {
+    lines.push(
+      `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} |`,
+    )
+  }
+  lines.push('')
+
+  // Reliability table
+  lines.push('## Reliability')
+  lines.push('')
+  lines.push('| Run | Tool Errors | Error Rate | Completion Rate |')
+  lines.push('|-----|-------------|------------|-----------------|')
+  for (const [label, r] of Object.entries(report.reliability)) {
+    lines.push(
+      `| ${label} | ${r.toolErrors} | ${(r.toolErrorRate * 100).toFixed(1)}% | ${(r.completionRate * 100).toFixed(1)}% |`,
    )
+  }
+  lines.push('')
+
+  // Pairwise wins
+  lines.push('## Head-to-Head')
+  lines.push('')
+  lines.push('| Matchup | Wins | Wins | Ties |')
+  lines.push('|---------|------|------|------|')
+  for (const p of report.headToHead.pairwise) {
+    lines.push(`| ${p.runA} vs ${p.runB} | ${p.aWins} | ${p.bWins} | ${p.ties} |`)
+  }
+  lines.push('')
+
+  return lines.join('\n')
 }
 
 /**
@@ -233,7 +644,9 @@ export const compare = async (args: string[]): Promise<void> => {
    options: {
      run: { type: 'string', multiple: true },
      grader: { type: 'string', short: 'g' },
+      strategy: { type: 'string', short: 's' },
      output: { type: 'string', short: 'o' },
+      format: { type: 'string', short: 'f' },
      progress: { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h' },
    },
@@ -241,60 +654,60 @@ export const compare = async (args: string[]): Promise<void> => {
  })
 
  if (values.help) {
-    // biome-ignore lint/suspicious/noConsole: CLI help output
    console.log(`
-Usage: agent-eval-harness compare [files...]
+Usage: agent-eval-harness compare [files...] [options]
 
-Compare multiple runs of the same prompts.
+Compare multiple runs of the same prompts and generate aggregate report.
 
 Arguments:
  files...        Result files to compare (positional, unlimited)
 
 Options:
  --run           Labeled run format: "label:path.jsonl" (alternative to positional)
-  -
+  -s, --strategy  Comparison strategy: weighted (default), statistical, or custom
+  -g, --grader    Path to custom grader (required if strategy=custom)
  -o, --output    Output file (default: stdout)
+  -f, --format    Output format: json (default) or markdown
  --progress      Show progress to stderr
  -h, --help      Show this help message
 
-
+Built-in Strategies:
+  weighted      Configurable weights for quality, latency, reliability
+                Customize via: COMPARE_QUALITY, COMPARE_LATENCY, COMPARE_RELIABILITY
+  statistical   Bootstrap sampling for confidence intervals
+                Customize via: COMPARE_BOOTSTRAP_ITERATIONS
+
+Custom Grader:
  Must export 'grade' or 'compare' function with signature:
  (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>
 
-  Input includes all runs' results for a single prompt.
-  Output should rank runs from best to worst.
-
 Examples:
-  #
-  agent-eval-harness compare run1.jsonl run2.jsonl
+  # Default: weighted strategy with JSON output
+  agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
+
+  # Statistical significance strategy
+  agent-eval-harness compare run1.jsonl run2.jsonl --strategy statistical -o comparison.json
+
+  # Custom weights
+  COMPARE_QUALITY=0.7 COMPARE_LATENCY=0.2 COMPARE_RELIABILITY=0.1 \\
+    agent-eval-harness compare run1.jsonl run2.jsonl -o comparison.json
+
+  # Markdown report
+  agent-eval-harness compare run1.jsonl run2.jsonl --format markdown -o report.md
+
+  # Custom grader
+  agent-eval-harness compare run1.jsonl run2.jsonl \\
+    --strategy custom --grader ./my-llm-judge.ts -o comparison.json
 
  # With explicit labels
  agent-eval-harness compare \\
    --run "with-bun-mcp:results-bun.jsonl" \\
    --run "vanilla:results-vanilla.jsonl" \\
-    -
-
-  # Mix positional and labeled
-  agent-eval-harness compare results-*.jsonl \\
-    --run "baseline:baseline.jsonl" \\
-    -g ./compare-grader.ts -o comparison.jsonl
-
-  # Typical workflow
-  # 1. Capture with different configs
-  agent-eval-harness capture prompts.jsonl -s claude.json -o vanilla.jsonl
-  agent-eval-harness capture prompts.jsonl -s claude-with-mcp.json -o with-mcp.jsonl
-
-  # 2. Compare results
-  agent-eval-harness compare vanilla.jsonl with-mcp.jsonl -g ./compare-grader.ts
+    -o comparison.json
 `)
    return
  }
 
-  if (!values.grader) {
-    console.error('Error: --grader is required')
-    process.exit(1)
-  }
-
  // Collect runs from positional args and --run flags
  const runs: LabeledRun[] = []
 
@@ -312,14 +725,43 @@ Examples:
 
  if (runs.length < 2) {
    console.error('Error: At least 2 result files required for comparison')
-    console.error('Example: agent-eval-harness compare run1.jsonl run2.jsonl
+    console.error('Example: agent-eval-harness compare run1.jsonl run2.jsonl')
+    process.exit(1)
+  }
+
+  // Validate that all run files exist (early error for better UX)
+  try {
+    await validateRunFiles(runs)
+  } catch (error) {
+    console.error(`Error: ${error instanceof Error ? error.message : error}`)
+    process.exit(1)
+  }
+
+  // Validate strategy
+  const strategy = (values.strategy as CompareStrategy) ?? 'weighted'
+  if (!['weighted', 'statistical', 'custom'].includes(strategy)) {
+    console.error(`Error: Invalid strategy '${strategy}'. Use: weighted, statistical, or custom`)
+    process.exit(1)
+  }
+
+  if (strategy === 'custom' && !values.grader) {
+    console.error('Error: --grader is required when using --strategy custom')
+    process.exit(1)
+  }
+
+  // Validate format (explicit format takes precedence, otherwise infer from extension)
+  const format = inferFormat(values.output, values.format)
+  if (values.format && !['json', 'markdown'].includes(values.format)) {
+    console.error(`Error: Invalid format '${values.format}'. Use: json or markdown`)
    process.exit(1)
  }
 
  await runCompare({
    runs,
+    strategy,
    graderPath: values.grader,
    outputPath: values.output,
    progress: values.progress,
+    format,
  })
 }
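
The help text above pins down the custom-grader contract: a module exporting grade (or compare) with signature (params: ComparisonGraderInput) => Promise<ComparisonGraderResult>. A minimal sketch of such a grader follows; the type shapes are structural stand-ins inferred from this diff (runs keyed by label, each carrying output/trajectory plus the optional score/duration/toolErrors fields the pipeline now forwards), not the published definitions in pipeline.types.ts.

// my-llm-judge.ts — hypothetical grader for:
//   agent-eval-harness compare run1.jsonl run2.jsonl --strategy custom --grader ./my-llm-judge.ts
// Local structural types; the real ComparisonGraderInput/ComparisonGraderResult
// live in the package's pipeline.types.ts.
type RunData = {
  output: string
  trajectory: unknown[]
  score?: { score: number; pass?: boolean }
  duration?: number
  toolErrors?: boolean
}

type GraderInput = {
  id: string
  input: string | string[]
  hint?: string
  runs: Record<string, RunData>
}

type GraderResult = {
  rankings: { run: string; rank: number; score: number }[]
  reasoning: string
}

// Rank runs by their existing per-prompt score, breaking ties by lower latency.
export const grade = async (params: GraderInput): Promise<GraderResult> => {
  const entries = Object.entries(params.runs).map(([run, data]) => ({
    run,
    score: data.score?.score ?? 0,
    duration: data.duration ?? Number.MAX_SAFE_INTEGER,
  }))

  entries.sort((a, b) => b.score - a.score || a.duration - b.duration)

  return {
    rankings: entries.map((e, i) => ({ run: e.run, rank: i + 1, score: e.score })),
    reasoning: 'Ranked by existing per-prompt scores; ties broken by lower total duration.',
  }
}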
|