@plaited/agent-eval-harness 0.12.0 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.12.0",
3
+ "version": "0.12.2",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -56,12 +56,12 @@
56
56
  ]
57
57
  },
58
58
  "dependencies": {
59
- "@plaited/development-skills": "0.7.0",
59
+ "@plaited/development-skills": "0.8.0",
60
60
  "zod": "^4.3.6"
61
61
  },
62
62
  "devDependencies": {
63
- "@biomejs/biome": "2.3.12",
64
- "@types/bun": "1.3.6",
63
+ "@biomejs/biome": "2.3.14",
64
+ "@types/bun": "1.3.9",
65
65
  "format-package": "7.0.0",
66
66
  "lint-staged": "16.2.7",
67
67
  "typescript": "5.9.3"
@@ -26,10 +26,13 @@ import type {
26
26
  TrialsComparisonMeta,
27
27
  TrialsComparisonReport,
28
28
  TrialsFlakinessMetrics,
29
+ TrialsPerformanceMetrics,
29
30
  TrialsPromptComparison,
31
+ TrialsQualityMetrics,
30
32
  TrialsReliabilityMetrics,
31
33
  } from '../schemas.ts'
32
34
  import { TrialResultSchema } from '../schemas.ts'
35
+ import { computeLatencyStats, percentile } from './compare-utils.ts'
33
36
  import type {
34
37
  ComparisonGraderResult,
35
38
  LabeledRun,
@@ -148,19 +151,6 @@ const getTrialsGrader = async (
148
151
  }
149
152
  }
150
153
 
151
- /**
152
- * Compute percentile from sorted array.
153
- *
154
- * @param sorted - Sorted array of numbers
155
- * @param p - Percentile (0-1)
156
- * @returns Value at percentile
157
- */
158
- const percentile = (sorted: number[], p: number): number => {
159
- if (sorted.length === 0) return 0
160
- const idx = Math.floor(sorted.length * p)
161
- return sorted[Math.min(idx, sorted.length - 1)] ?? 0
162
- }
163
-
164
154
  /**
165
155
  * Compute capability metrics from trial results.
166
156
  *
@@ -245,6 +235,72 @@ const computeFlakinessMetrics = (results: TrialResult[], maxTopFlaky: number = 1
245
235
  }
246
236
  }
247
237
 
/** Quality metrics bundled with the flattened scores they were derived from, kept so CI computation can reuse them */
type QualityComputeResult = {
  metrics: TrialsQualityMetrics
  rawScores: number[]
}

/**
 * Aggregate grader scores across every trial of every prompt.
 *
 * @remarks
 * Every trial score that is not `undefined` is pooled into one flat list,
 * regardless of which prompt it came from. The average is taken over that
 * list and the median/quartiles are read from a sorted copy via `percentile`.
 * When no trial carries a score (no grader was used) there is nothing to
 * summarize and `undefined` is returned. The pooled scores are handed back
 * next to the metrics so confidence intervals can be computed without walking
 * the results a second time.
 *
 * @param results - Array of trial results
 * @returns Quality metrics plus the pooled raw scores, or `undefined` when no scores exist
 */
const computeTrialsQualityMetrics = (results: TrialResult[]): QualityComputeResult | undefined => {
  // Collect scores in result order, then trial order
  const rawScores: number[] = []
  for (const result of results) {
    for (const trial of result.trials) {
      if (trial.score !== undefined) rawScores.push(trial.score)
    }
  }

  if (rawScores.length === 0) return undefined

  // Sort a copy: rawScores is returned to the caller in its original order
  const ascending = rawScores.slice().sort((x, y) => x - y)

  let total = 0
  for (const score of rawScores) total += score

  const metrics: TrialsQualityMetrics = {
    type: 'trial',
    avgScore: total / rawScores.length,
    medianScore: percentile(ascending, 0.5),
    p25Score: percentile(ascending, 0.25),
    p75Score: percentile(ascending, 0.75),
  }

  return { metrics, rawScores }
}
274
+
/** Performance metrics bundled with the flattened durations they were derived from, kept so CI computation can reuse them */
type PerformanceComputeResult = {
  metrics: TrialsPerformanceMetrics
  rawDurations: number[]
}

/**
 * Aggregate trial durations across every prompt into latency statistics.
 *
 * @remarks
 * The duration of each trial is pooled into one flat list, regardless of
 * which prompt it belongs to. Unlike quality metrics this never returns
 * `undefined`: an empty input simply produces an empty list, a total of 0 and
 * whatever `computeLatencyStats` reports for no data. The pooled durations are
 * handed back next to the metrics so confidence intervals can be computed
 * without walking the results a second time.
 *
 * @param results - Array of trial results
 * @returns Performance metrics plus the pooled raw durations
 */
const computeTrialsPerformanceMetrics = (results: TrialResult[]): PerformanceComputeResult => {
  const rawDurations: number[] = []
  let totalDuration = 0

  // Single pass: gather durations and accumulate their sum in the same order
  for (const result of results) {
    for (const trial of result.trials) {
      rawDurations.push(trial.duration)
      totalDuration += trial.duration
    }
  }

  const metrics: TrialsPerformanceMetrics = {
    latency: computeLatencyStats(rawDurations),
    totalDuration,
  }

  return { metrics, rawDurations }
}
303
+
248
304
  /**
249
305
  * Execute trials comparison and generate aggregate report.
250
306
  *
@@ -399,6 +455,12 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
399
455
  const capability: Record<string, TrialsCapabilityMetrics> = {}
400
456
  const reliability: Record<string, TrialsReliabilityMetrics> = {}
401
457
  const flakiness: Record<string, TrialsFlakinessMetrics> = {}
458
+ const quality: Record<string, TrialsQualityMetrics> = {}
459
+ const performance: Record<string, TrialsPerformanceMetrics> = {}
460
+ const rawScoresByRun: Record<string, number[]> = {}
461
+ const rawDurationsByRun: Record<string, number[]> = {}
462
+
463
+ let hasQuality = false
402
464
 
403
465
  for (const label of runLabels) {
404
466
  const resultsMap = runResults[label] ?? new Map()
@@ -407,6 +469,17 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
407
469
  capability[label] = computeCapabilityMetrics(results)
408
470
  reliability[label] = computeReliabilityMetrics(results)
409
471
  flakiness[label] = computeFlakinessMetrics(results)
472
+
473
+ const perfResult = computeTrialsPerformanceMetrics(results)
474
+ performance[label] = perfResult.metrics
475
+ rawDurationsByRun[label] = perfResult.rawDurations
476
+
477
+ const qualityResult = computeTrialsQualityMetrics(results)
478
+ if (qualityResult) {
479
+ quality[label] = qualityResult.metrics
480
+ rawScoresByRun[label] = qualityResult.rawScores
481
+ hasQuality = true
482
+ }
410
483
  }
411
484
 
412
485
  // Compute confidence intervals when using statistical strategy
@@ -415,9 +488,9 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
415
488
 
416
489
  for (const label of runLabels) {
417
490
  const resultsMap = runResults[label] ?? new Map()
418
- const results = [...resultsMap.values()]
419
- const passAtKValues = results.map((r) => r.passAtK ?? 0)
420
- const passExpKValues = results.map((r) => r.passExpK ?? 0)
491
+ const resultsArr = [...resultsMap.values()]
492
+ const passAtKValues = resultsArr.map((r) => r.passAtK ?? 0)
493
+ const passExpKValues = resultsArr.map((r) => r.passExpK ?? 0)
421
494
 
422
495
  // Capability CIs
423
496
  const capabilityMetrics = capability[label]
@@ -434,6 +507,24 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
434
507
  avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
435
508
  }
436
509
  }
510
+
511
+ // Quality CIs (only when scores present)
512
+ const qualityMetrics = quality[label]
513
+ const scores = rawScoresByRun[label]
514
+ if (qualityMetrics && scores && scores.length > 0) {
515
+ qualityMetrics.confidenceIntervals = {
516
+ avgScore: bootstrap(scores, bootstrapConfig).ci,
517
+ }
518
+ }
519
+
520
+ // Performance CIs
521
+ const performanceMetrics = performance[label]
522
+ const durations = rawDurationsByRun[label]
523
+ if (performanceMetrics && durations && durations.length > 0) {
524
+ performanceMetrics.confidenceIntervals = {
525
+ latencyMean: bootstrap(durations, bootstrapConfig).ci,
526
+ }
527
+ }
437
528
  }
438
529
  }
439
530
 
@@ -505,6 +596,8 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
505
596
  capability,
506
597
  reliability,
507
598
  flakiness,
599
+ quality: hasQuality ? quality : undefined,
600
+ performance,
508
601
  headToHead: {
509
602
  capability: capabilityPairwise,
510
603
  reliability: reliabilityPairwise,
@@ -528,8 +621,12 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
528
621
  for (const [label, cap] of Object.entries(capability)) {
529
622
  const rel = reliability[label]
530
623
  const flak = flakiness[label]
624
+ const perf = performance[label]
625
+ const qual = quality[label]
626
+ const qualStr = qual ? ` avgScore=${qual.avgScore.toFixed(3)}` : ''
627
+ const perfStr = perf ? ` latencyP50=${perf.latency.p50.toFixed(0)}ms` : ''
531
628
  logProgress(
532
- ` ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}`,
629
+ ` ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}${qualStr}${perfStr}`,
533
630
  progress,
534
631
  )
535
632
  }
@@ -620,6 +717,58 @@ const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string =>
620
717
  }
621
718
  lines.push('')
622
719
 
720
+ // Quality table (only when scores present)
721
+ if (report.quality && Object.keys(report.quality).length > 0) {
722
+ const hasQualityCIs = Object.values(report.quality).some((q) => q.confidenceIntervals)
723
+
724
+ lines.push('## Quality (Scores)')
725
+ lines.push('')
726
+ if (hasQualityCIs) {
727
+ lines.push('| Run | Avg Score | 95% CI | Median | P25 | P75 |')
728
+ lines.push('|-----|-----------|--------|--------|-----|-----|')
729
+ for (const [label, q] of Object.entries(report.quality)) {
730
+ const avgCI = formatCI(q.confidenceIntervals?.avgScore)
731
+ lines.push(
732
+ `| ${label} | ${q.avgScore.toFixed(3)} | ${avgCI} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`,
733
+ )
734
+ }
735
+ } else {
736
+ lines.push('| Run | Avg Score | Median | P25 | P75 |')
737
+ lines.push('|-----|-----------|--------|-----|-----|')
738
+ for (const [label, q] of Object.entries(report.quality)) {
739
+ lines.push(
740
+ `| ${label} | ${q.avgScore.toFixed(3)} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`,
741
+ )
742
+ }
743
+ }
744
+ lines.push('')
745
+ }
746
+
747
+ // Performance table (always present)
748
+ const hasPerfCIs = Object.values(report.performance).some((p) => p.confidenceIntervals)
749
+
750
+ lines.push('## Performance (Latency)')
751
+ lines.push('')
752
+ if (hasPerfCIs) {
753
+ lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI | Total (ms) |')
754
+ lines.push('|-----|----------|----------|----------|-----------|--------|------------|')
755
+ for (const [label, p] of Object.entries(report.performance)) {
756
+ const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0)
757
+ lines.push(
758
+ `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} | ${p.totalDuration.toFixed(0)} |`,
759
+ )
760
+ }
761
+ } else {
762
+ lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | Total (ms) |')
763
+ lines.push('|-----|----------|----------|----------|-----------|------------|')
764
+ for (const [label, p] of Object.entries(report.performance)) {
765
+ lines.push(
766
+ `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${p.totalDuration.toFixed(0)} |`,
767
+ )
768
+ }
769
+ }
770
+ lines.push('')
771
+
623
772
  // Head-to-head
624
773
  lines.push('## Head-to-Head')
625
774
  lines.push('')
@@ -0,0 +1,85 @@
1
+ /**
2
+ * Shared utility functions for comparison modules.
3
+ *
4
+ * @remarks
5
+ * Extracted from compare.ts and compare-trials.ts to avoid duplication.
6
+ * Contains statistical helpers used by both CaptureResult and TrialResult comparisons.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import type { LatencyStats, ScoreDistribution } from '../schemas.ts'
12
+
/**
 * Pick the value at a given quantile of an ascending-sorted array.
 *
 * @remarks
 * Uses plain floor indexing without interpolation: for an array of length N
 * the element at index `floor(N * p)` is returned, with the index clamped to
 * the valid range `[0, N - 1]`. This is not the textbook nearest-rank
 * definition (`ceil(N * p) - 1`); for even-length input, `p = 0.5` selects the
 * upper of the two middle elements.
 *
 * The array is neither sorted nor copied here, so callers must pass data that
 * is already sorted ascending.
 *
 * @param sorted - Array of numbers sorted in ascending order
 * @param p - Quantile as a fraction (0-1); values below 0 select the first element, values above 1 the last
 * @returns Value at the requested quantile, or 0 when the array is empty or `p` is NaN
 *
 * @public
 */
export const percentile = (sorted: number[], p: number): number => {
  if (sorted.length === 0) return 0

  const lastIdx = sorted.length - 1
  const idx = Math.floor(sorted.length * p)

  // Clamp on both sides: without the lower bound a negative p produced a
  // negative index, whose lookup is undefined and silently became 0 — a value
  // that is not part of the data.
  const clamped = Math.max(0, Math.min(idx, lastIdx))

  // A NaN p propagates through Math.min/Math.max, misses the lookup and falls back to 0
  return sorted[clamped] ?? 0
}
32
+
/**
 * Summarize a list of durations as latency statistics.
 *
 * @remarks
 * The input is copied before sorting, so the caller's array is left untouched.
 * An empty input yields all-zero statistics.
 *
 * @param durations - Durations in milliseconds, in any order
 * @returns p50/p90/p99, mean, min and max of the durations
 *
 * @public
 */
export const computeLatencyStats = (durations: number[]): LatencyStats => {
  const count = durations.length
  if (count === 0) {
    return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
  }

  const ascending = durations.slice().sort((x, y) => x - y)

  // Accumulate over the sorted copy so the summation order matches the sort order
  let total = 0
  for (const value of ascending) total += value

  return {
    p50: percentile(ascending, 0.5),
    p90: percentile(ascending, 0.9),
    p99: percentile(ascending, 0.99),
    mean: total / count,
    min: ascending[0] ?? 0,
    max: ascending[count - 1] ?? 0,
  }
}
58
+
/**
 * Build a five-bucket histogram of scores.
 *
 * @remarks
 * Each bucket includes its lower edge and excludes its upper edge, except the
 * last bucket, which collects every score that is not below 0.8 (so 1.0 lands
 * there). Scores below 0 are counted in the first bucket.
 *
 * @param scores - Array of scores (0-1)
 * @returns Count of scores per bucket
 *
 * @public
 */
export const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
  const histogram: ScoreDistribution = {
    '0.0-0.2': 0,
    '0.2-0.4': 0,
    '0.4-0.6': 0,
    '0.6-0.8': 0,
    '0.8-1.0': 0,
  }

  // Exclusive upper edge of every bucket except the last, in ascending order
  const upperEdges = [
    ['0.0-0.2', 0.2],
    ['0.2-0.4', 0.4],
    ['0.4-0.6', 0.6],
    ['0.6-0.8', 0.8],
  ] as const

  scores.forEach((value) => {
    const match = upperEdges.find(([, upper]) => value < upper)
    // Anything that is not below an edge falls through to the top bucket
    const bucket = match ? match[0] : '0.8-1.0'
    histogram[bucket] += 1
  })

  return histogram
}
@@ -33,18 +33,17 @@ import type {
33
33
  ComparisonMeta,
34
34
  ComparisonReport,
35
35
  HeadToHead,
36
- LatencyStats,
37
36
  PairwiseComparison,
38
37
  PerformanceMetrics,
39
38
  PromptComparison,
40
39
  QualityMetrics,
41
40
  ReliabilityMetrics,
42
- ScoreDistribution,
43
41
  TrajectoryInfo,
44
42
  TrajectoryRichness,
45
43
  } from '../schemas.ts'
46
44
  import { type CompareInputFormat, detectAndValidateFormat } from './compare-format-detection.ts'
47
45
  import { runTrialsCompare } from './compare-trials.ts'
46
+ import { computeLatencyStats, computeScoreDistribution } from './compare-utils.ts'
48
47
  import type {
49
48
  CompareConfig,
50
49
  ComparisonGrader,
@@ -197,69 +196,6 @@ const getGrader = async (strategy: CompareStrategy, graderPath?: string): Promis
197
196
  }
198
197
  }
199
198
 
200
- /**
201
- * Compute percentile from sorted array.
202
- *
203
- * @param sorted - Sorted array of numbers
204
- * @param p - Percentile (0-1)
205
- * @returns Value at percentile
206
- */
207
- const percentile = (sorted: number[], p: number): number => {
208
- if (sorted.length === 0) return 0
209
- const idx = Math.floor(sorted.length * p)
210
- return sorted[Math.min(idx, sorted.length - 1)] ?? 0
211
- }
212
-
213
- /**
214
- * Compute latency statistics from array of durations.
215
- *
216
- * @param durations - Array of durations in milliseconds
217
- * @returns Latency statistics
218
- */
219
- const computeLatencyStats = (durations: number[]): LatencyStats => {
220
- if (durations.length === 0) {
221
- return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
222
- }
223
-
224
- const sorted = [...durations].sort((a, b) => a - b)
225
- const sum = sorted.reduce((a, b) => a + b, 0)
226
-
227
- return {
228
- p50: percentile(sorted, 0.5),
229
- p90: percentile(sorted, 0.9),
230
- p99: percentile(sorted, 0.99),
231
- mean: sum / sorted.length,
232
- min: sorted[0] ?? 0,
233
- max: sorted[sorted.length - 1] ?? 0,
234
- }
235
- }
236
-
237
- /**
238
- * Compute score distribution histogram.
239
- *
240
- * @param scores - Array of scores (0-1)
241
- * @returns Score distribution histogram
242
- */
243
- const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
244
- const dist: ScoreDistribution = {
245
- '0.0-0.2': 0,
246
- '0.2-0.4': 0,
247
- '0.4-0.6': 0,
248
- '0.6-0.8': 0,
249
- '0.8-1.0': 0,
250
- }
251
-
252
- for (const score of scores) {
253
- if (score < 0.2) dist['0.0-0.2']++
254
- else if (score < 0.4) dist['0.2-0.4']++
255
- else if (score < 0.6) dist['0.4-0.6']++
256
- else if (score < 0.8) dist['0.6-0.8']++
257
- else dist['0.8-1.0']++
258
- }
259
-
260
- return dist
261
- }
262
-
263
199
  /**
264
200
  * Detect trajectory richness from capture results.
265
201
  *
@@ -429,6 +365,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
429
365
  const fails = results.length - passes
430
366
 
431
367
  quality[label] = {
368
+ type: 'run',
432
369
  avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
433
370
  passRate: results.length > 0 ? passes / results.length : 0,
434
371
  passCount: passes,
@@ -105,6 +105,10 @@ describe('runCompare statistical strategy', () => {
105
105
  // Verify reliability metrics include type discriminator
106
106
  expect(report.reliability.high?.type).toBe('run')
107
107
  expect(report.reliability.low?.type).toBe('run')
108
+
109
+ // Verify quality metrics include type discriminator
110
+ expect(report.quality.high?.type).toBe('run')
111
+ expect(report.quality.low?.type).toBe('run')
108
112
  })
109
113
 
110
114
  test('computes confidence intervals for performance metrics', async () => {
@@ -14,20 +14,23 @@ import { buildTrialsIndex, runTrialsCompare } from '../compare-trials.ts'
14
14
  // Test Fixtures
15
15
  // ============================================================================
16
16
 
17
- const createTrialResult = (id: string, passAtK: number, passExpK: number, k: number = 3) => ({
17
+ const createTrialResult = (
18
+ id: string,
19
+ passAtK: number,
20
+ passExpK: number,
21
+ k: number = 3,
22
+ includeScores: boolean = true,
23
+ ) => ({
18
24
  id,
19
25
  input: `Prompt for ${id}`,
20
26
  k,
21
- passRate: passAtK,
22
- passAtK,
23
- passExpK,
27
+ ...(includeScores && { passRate: passAtK, passAtK, passExpK }),
24
28
  trials: Array.from({ length: k }, (_, i) => ({
25
29
  trialNum: i + 1,
26
30
  output: `Output ${i + 1}`,
27
31
  trajectory: [],
28
32
  duration: 100 + i * 10,
29
- pass: Math.random() < passAtK,
30
- score: passAtK,
33
+ ...(includeScores && { pass: Math.random() < passAtK, score: passAtK }),
31
34
  })),
32
35
  })
33
36
 
@@ -417,4 +420,173 @@ describe('runTrialsCompare', () => {
417
420
  const topFlakyIds = flak?.topFlakyPrompts.map((p) => p.id) ?? []
418
421
  expect(topFlakyIds).toContain('flaky')
419
422
  })
423
+
424
+ test('includes performance metrics with latency stats', async () => {
425
+ const run1Path = `${tempDir}/perf-run1.jsonl`
426
+ const run2Path = `${tempDir}/perf-run2.jsonl`
427
+
428
+ const trial1 = createTrialResult('test-001', 0.9, 0.7)
429
+ const trial2 = createTrialResult('test-001', 0.8, 0.6)
430
+
431
+ await Bun.write(run1Path, JSON.stringify(trial1))
432
+ await Bun.write(run2Path, JSON.stringify(trial2))
433
+
434
+ const report = await runTrialsCompare({
435
+ runs: [
436
+ { label: 'run1', path: run1Path },
437
+ { label: 'run2', path: run2Path },
438
+ ],
439
+ progress: false,
440
+ })
441
+
442
+ // Performance should always be present
443
+ expect(report.performance).toBeDefined()
444
+ expect(report.performance.run1).toBeDefined()
445
+ expect(report.performance.run2).toBeDefined()
446
+
447
+ const perf = report.performance.run1
448
+ expect(perf?.latency).toBeDefined()
449
+ expect(perf?.latency.p50).toBeGreaterThan(0)
450
+ expect(perf?.latency.mean).toBeGreaterThan(0)
451
+ expect(perf?.latency.min).toBeGreaterThan(0)
452
+ expect(perf?.latency.max).toBeGreaterThan(0)
453
+ expect(perf?.totalDuration).toBeGreaterThan(0)
454
+ })
455
+
456
+ test('includes quality metrics when scores are present', async () => {
457
+ const run1Path = `${tempDir}/qual-run1.jsonl`
458
+ const run2Path = `${tempDir}/qual-run2.jsonl`
459
+
460
+ // createTrialResult includes score fields by default (includeScores defaults to true)
461
+ const trial1 = createTrialResult('test-001', 0.9, 0.7)
462
+ const trial2 = createTrialResult('test-001', 0.8, 0.6)
463
+
464
+ await Bun.write(run1Path, JSON.stringify(trial1))
465
+ await Bun.write(run2Path, JSON.stringify(trial2))
466
+
467
+ const report = await runTrialsCompare({
468
+ runs: [
469
+ { label: 'run1', path: run1Path },
470
+ { label: 'run2', path: run2Path },
471
+ ],
472
+ progress: false,
473
+ })
474
+
475
+ // Quality should be present since trials have scores
476
+ expect(report.quality).toBeDefined()
477
+ expect(report.quality?.run1).toBeDefined()
478
+
479
+ const qual = report.quality?.run1
480
+ expect(qual?.type).toBe('trial')
481
+ expect(qual?.avgScore).toBeGreaterThan(0)
482
+ expect(qual?.medianScore).toBeGreaterThan(0)
483
+ expect(qual?.p25Score).toBeDefined()
484
+ expect(qual?.p75Score).toBeDefined()
485
+ })
486
+
487
+ test('omits quality metrics when scores are absent', async () => {
488
+ const run1Path = `${tempDir}/noqual-run1.jsonl`
489
+ const run2Path = `${tempDir}/noqual-run2.jsonl`
490
+
491
+ // Create trials without scores (includeScores=false)
492
+ const trial1 = createTrialResult('test-001', 0, 0, 3, false)
493
+ const trial2 = createTrialResult('test-001', 0, 0, 3, false)
494
+
495
+ await Bun.write(run1Path, JSON.stringify(trial1))
496
+ await Bun.write(run2Path, JSON.stringify(trial2))
497
+
498
+ const report = await runTrialsCompare({
499
+ runs: [
500
+ { label: 'run1', path: run1Path },
501
+ { label: 'run2', path: run2Path },
502
+ ],
503
+ progress: false,
504
+ })
505
+
506
+ // Quality should NOT be present since no trials have scores
507
+ expect(report.quality).toBeUndefined()
508
+
509
+ // Performance should still be present
510
+ expect(report.performance).toBeDefined()
511
+ expect(report.performance.run1?.latency.mean).toBeGreaterThan(0)
512
+ })
513
+
514
+ test('statistical strategy computes CIs for quality and performance', async () => {
515
+ const run1Path = `${tempDir}/ci-qp-run1.jsonl`
516
+ const run2Path = `${tempDir}/ci-qp-run2.jsonl`
517
+
518
+ const trials1 = [
519
+ createTrialResult('p1', 0.9, 0.8),
520
+ createTrialResult('p2', 0.85, 0.7),
521
+ createTrialResult('p3', 0.95, 0.9),
522
+ ]
523
+ const trials2 = [
524
+ createTrialResult('p1', 0.6, 0.4),
525
+ createTrialResult('p2', 0.5, 0.3),
526
+ createTrialResult('p3', 0.7, 0.5),
527
+ ]
528
+
529
+ await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
530
+ await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
531
+
532
+ const report = await runTrialsCompare({
533
+ runs: [
534
+ { label: 'high', path: run1Path },
535
+ { label: 'low', path: run2Path },
536
+ ],
537
+ strategy: 'statistical',
538
+ progress: false,
539
+ })
540
+
541
+ // Quality CIs
542
+ const highQual = report.quality?.high
543
+ expect(highQual).toBeDefined()
544
+ expect(highQual?.confidenceIntervals).toBeDefined()
545
+ expect(highQual?.confidenceIntervals?.avgScore).toBeDefined()
546
+
547
+ const qualCI = highQual?.confidenceIntervals?.avgScore
548
+ expect(qualCI).toHaveLength(2)
549
+ expect(qualCI?.[0]).toBeLessThanOrEqual(qualCI?.[1] ?? 0)
550
+
551
+ // Performance CIs
552
+ const highPerf = report.performance.high
553
+ expect(highPerf).toBeDefined()
554
+ expect(highPerf?.confidenceIntervals).toBeDefined()
555
+ expect(highPerf?.confidenceIntervals?.latencyMean).toBeDefined()
556
+
557
+ const perfCI = highPerf?.confidenceIntervals?.latencyMean
558
+ expect(perfCI).toHaveLength(2)
559
+ expect(perfCI?.[0]).toBeLessThanOrEqual(perfCI?.[1] ?? 0)
560
+ })
561
+
562
+ test('markdown output includes quality and performance tables', async () => {
563
+ const run1Path = `${tempDir}/md-qp-run1.jsonl`
564
+ const run2Path = `${tempDir}/md-qp-run2.jsonl`
565
+ const outputPath = `${tempDir}/qp-report.md`
566
+
567
+ const trial1 = createTrialResult('test-001', 0.9, 0.7)
568
+ const trial2 = createTrialResult('test-001', 0.8, 0.6)
569
+
570
+ await Bun.write(run1Path, JSON.stringify(trial1))
571
+ await Bun.write(run2Path, JSON.stringify(trial2))
572
+
573
+ await runTrialsCompare({
574
+ runs: [
575
+ { label: 'agent1', path: run1Path },
576
+ { label: 'agent2', path: run2Path },
577
+ ],
578
+ outputPath,
579
+ format: 'markdown',
580
+ progress: false,
581
+ })
582
+
583
+ const content = await Bun.file(outputPath).text()
584
+
585
+ // Should contain quality and performance sections
586
+ expect(content).toContain('## Quality (Scores)')
587
+ expect(content).toContain('## Performance (Latency)')
588
+ expect(content).toContain('Avg Score')
589
+ expect(content).toContain('P50 (ms)')
590
+ expect(content).toContain('Mean (ms)')
591
+ })
420
592
  })
@@ -0,0 +1,128 @@
1
+ /**
2
+ * Unit tests for compare-utils shared utilities.
3
+ *
4
+ * @remarks
5
+ * Tests for percentile, computeLatencyStats, and computeScoreDistribution.
6
+ *
7
+ * @packageDocumentation
8
+ */
9
+
10
+ import { describe, expect, test } from 'bun:test'
11
+ import { computeLatencyStats, computeScoreDistribution, percentile } from '../compare-utils.ts'
12
+
13
+ // ============================================================================
14
+ // percentile Tests
15
+ // ============================================================================
16
+
17
+ describe('percentile', () => {
18
+ test('computes correct percentile values', () => {
19
+ const sorted = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
20
+
21
+ expect(percentile(sorted, 0.5)).toBe(60)
22
+ expect(percentile(sorted, 0.25)).toBe(30)
23
+ expect(percentile(sorted, 0.75)).toBe(80)
24
+ expect(percentile(sorted, 0.9)).toBe(100)
25
+ })
26
+
27
+ test('returns 0 for empty array', () => {
28
+ expect(percentile([], 0.5)).toBe(0)
29
+ })
30
+
31
+ test('handles single-element array', () => {
32
+ expect(percentile([42], 0.5)).toBe(42)
33
+ expect(percentile([42], 0.0)).toBe(42)
34
+ expect(percentile([42], 1.0)).toBe(42)
35
+ })
36
+
37
+ test('handles p=0 and p=1 boundary values', () => {
38
+ const sorted = [10, 20, 30]
39
+
40
+ expect(percentile(sorted, 0)).toBe(10)
41
+ expect(percentile(sorted, 1)).toBe(30)
42
+ })
43
+ })
44
+
45
+ // ============================================================================
46
+ // computeLatencyStats Tests
47
+ // ============================================================================
48
+
49
+ describe('computeLatencyStats', () => {
50
+ test('returns correct stats for typical durations', () => {
51
+ const durations = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
52
+ const stats = computeLatencyStats(durations)
53
+
54
+ expect(stats.min).toBe(100)
55
+ expect(stats.max).toBe(1000)
56
+ expect(stats.mean).toBe(550)
57
+ expect(stats.p50).toBe(600)
58
+ expect(stats.p90).toBe(1000)
59
+ })
60
+
61
+ test('returns zeros for empty array', () => {
62
+ const stats = computeLatencyStats([])
63
+
64
+ expect(stats.p50).toBe(0)
65
+ expect(stats.p90).toBe(0)
66
+ expect(stats.p99).toBe(0)
67
+ expect(stats.mean).toBe(0)
68
+ expect(stats.min).toBe(0)
69
+ expect(stats.max).toBe(0)
70
+ })
71
+
72
+ test('handles single-element array', () => {
73
+ const stats = computeLatencyStats([42])
74
+
75
+ expect(stats.p50).toBe(42)
76
+ expect(stats.p90).toBe(42)
77
+ expect(stats.mean).toBe(42)
78
+ expect(stats.min).toBe(42)
79
+ expect(stats.max).toBe(42)
80
+ })
81
+
82
+ test('sorts unsorted input', () => {
83
+ const stats = computeLatencyStats([500, 100, 300, 200, 400])
84
+
85
+ expect(stats.min).toBe(100)
86
+ expect(stats.max).toBe(500)
87
+ expect(stats.mean).toBe(300)
88
+ })
89
+ })
90
+
91
+ // ============================================================================
92
+ // computeScoreDistribution Tests
93
+ // ============================================================================
94
+
95
+ describe('computeScoreDistribution', () => {
96
+ test('distributes scores into correct buckets', () => {
97
+ const scores = [0.1, 0.3, 0.5, 0.7, 0.9]
98
+ const dist = computeScoreDistribution(scores)
99
+
100
+ expect(dist['0.0-0.2']).toBe(1)
101
+ expect(dist['0.2-0.4']).toBe(1)
102
+ expect(dist['0.4-0.6']).toBe(1)
103
+ expect(dist['0.6-0.8']).toBe(1)
104
+ expect(dist['0.8-1.0']).toBe(1)
105
+ })
106
+
107
+ test('handles empty scores array', () => {
108
+ const dist = computeScoreDistribution([])
109
+
110
+ expect(dist['0.0-0.2']).toBe(0)
111
+ expect(dist['0.2-0.4']).toBe(0)
112
+ expect(dist['0.4-0.6']).toBe(0)
113
+ expect(dist['0.6-0.8']).toBe(0)
114
+ expect(dist['0.8-1.0']).toBe(0)
115
+ })
116
+
117
+ test('handles boundary values correctly', () => {
118
+ // 0.0 → first bucket, 0.2 → second bucket (not first), 1.0 → last bucket
119
+ const scores = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
120
+ const dist = computeScoreDistribution(scores)
121
+
122
+ expect(dist['0.0-0.2']).toBe(1) // 0.0
123
+ expect(dist['0.2-0.4']).toBe(1) // 0.2
124
+ expect(dist['0.4-0.6']).toBe(1) // 0.4
125
+ expect(dist['0.6-0.8']).toBe(1) // 0.6
126
+ expect(dist['0.8-1.0']).toBe(2) // 0.8, 1.0
127
+ })
128
+ })
@@ -620,6 +620,8 @@ export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceInterva
620
620
  * Quality metrics for a single run in comparison.
621
621
  */
622
622
  export const QualityMetricsSchema = z.object({
623
+ /** Discriminator for run-level quality metrics */
624
+ type: z.literal('run'),
623
625
  /** Mean grader score (0-1) */
624
626
  avgScore: z.number(),
625
627
  /** Percentage of pass=true results */
@@ -923,6 +925,72 @@ export const TrialsFlakinessMetricsSchema = z.object({
923
925
  /** Trials flakiness metrics type */
924
926
  export type TrialsFlakinessMetrics = z.infer<typeof TrialsFlakinessMetricsSchema>
925
927
 
928
+ /**
929
+ * Confidence intervals for trials quality metrics.
930
+ */
931
+ export const TrialsQualityConfidenceIntervalsSchema = z.object({
932
+ /** CI for avgScore */
933
+ avgScore: ConfidenceIntervalSchema.optional(),
934
+ })
935
+
936
+ /** Trials quality confidence intervals type */
937
+ export type TrialsQualityConfidenceIntervals = z.infer<typeof TrialsQualityConfidenceIntervalsSchema>
938
+
939
+ /**
940
+ * Quality metrics for trials comparison (score-based).
941
+ *
942
+ * @remarks
943
+ * Aggregates grader scores across all trials for each prompt.
944
+ * Only present when a grader was used during trials capture.
945
+ */
946
+ export const TrialsQualityMetricsSchema = z.object({
947
+ /** Discriminator for trial-level quality metrics */
948
+ type: z.literal('trial'),
949
+ /** Average score across all trials */
950
+ avgScore: z.number(),
951
+ /** Median score */
952
+ medianScore: z.number(),
953
+ /** 25th percentile score */
954
+ p25Score: z.number(),
955
+ /** 75th percentile score */
956
+ p75Score: z.number(),
957
+ /** Confidence intervals (only with strategy=statistical) */
958
+ confidenceIntervals: TrialsQualityConfidenceIntervalsSchema.optional(),
959
+ })
960
+
961
+ /** Trials quality metrics type */
962
+ export type TrialsQualityMetrics = z.infer<typeof TrialsQualityMetricsSchema>
963
+
964
+ /**
965
+ * Confidence intervals for trials performance metrics.
966
+ */
967
+ export const TrialsPerformanceConfidenceIntervalsSchema = z.object({
968
+ /** CI for latency mean */
969
+ latencyMean: ConfidenceIntervalSchema.optional(),
970
+ })
971
+
972
+ /** Trials performance confidence intervals type */
973
+ export type TrialsPerformanceConfidenceIntervals = z.infer<typeof TrialsPerformanceConfidenceIntervalsSchema>
974
+
975
+ /**
976
+ * Performance metrics for trials comparison (latency-based).
977
+ *
978
+ * @remarks
979
+ * Aggregates trial durations across all prompts.
980
+ * Always present since TrialEntry.duration is required.
981
+ */
982
+ export const TrialsPerformanceMetricsSchema = z.object({
983
+ /** End-to-end latency statistics across all trials */
984
+ latency: LatencyStatsSchema,
985
+ /** Sum of all trial durations in milliseconds */
986
+ totalDuration: z.number(),
987
+ /** Confidence intervals (only with strategy=statistical) */
988
+ confidenceIntervals: TrialsPerformanceConfidenceIntervalsSchema.optional(),
989
+ })
990
+
991
+ /** Trials performance metrics type */
992
+ export type TrialsPerformanceMetrics = z.infer<typeof TrialsPerformanceMetricsSchema>
993
+
926
994
  /**
927
995
  * Per-prompt metrics for trials comparison drill-down.
928
996
  */
@@ -984,6 +1052,10 @@ export const TrialsComparisonReportSchema = z.object({
984
1052
  reliability: z.record(z.string(), TrialsReliabilityMetricsSchema),
985
1053
  /** Flakiness metrics by run label */
986
1054
  flakiness: z.record(z.string(), TrialsFlakinessMetricsSchema),
1055
+ /** Quality metrics by run label (only when grader scores are present) */
1056
+ quality: z.record(z.string(), TrialsQualityMetricsSchema).optional(),
1057
+ /** Performance metrics by run label (always present, uses trial.duration) */
1058
+ performance: z.record(z.string(), TrialsPerformanceMetricsSchema),
987
1059
  /** Head-to-head comparison details */
988
1060
  headToHead: z.object({
989
1061
  /** Pairwise wins by capability */
package/src/schemas.ts CHANGED
@@ -113,8 +113,16 @@ export {
113
113
  TrialsComparisonReportSchema,
114
114
  type TrialsFlakinessMetrics,
115
115
  TrialsFlakinessMetricsSchema,
116
+ type TrialsPerformanceConfidenceIntervals,
117
+ TrialsPerformanceConfidenceIntervalsSchema,
118
+ type TrialsPerformanceMetrics,
119
+ TrialsPerformanceMetricsSchema,
116
120
  type TrialsPromptComparison,
117
121
  TrialsPromptComparisonSchema,
122
+ type TrialsQualityConfidenceIntervals,
123
+ TrialsQualityConfidenceIntervalsSchema,
124
+ type TrialsQualityMetrics,
125
+ TrialsQualityMetricsSchema,
118
126
  type TrialsReliabilityMetrics,
119
127
  TrialsReliabilityMetricsSchema,
120
128
  type ValidationResult,