@plaited/agent-eval-harness 0.12.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@plaited/agent-eval-harness",
3
- "version": "0.12.0",
3
+ "version": "0.12.1",
4
4
  "description": "CLI tool for capturing agent trajectories from headless CLI agents",
5
5
  "license": "ISC",
6
6
  "engines": {
@@ -26,10 +26,13 @@ import type {
26
26
  TrialsComparisonMeta,
27
27
  TrialsComparisonReport,
28
28
  TrialsFlakinessMetrics,
29
+ TrialsPerformanceMetrics,
29
30
  TrialsPromptComparison,
31
+ TrialsQualityMetrics,
30
32
  TrialsReliabilityMetrics,
31
33
  } from '../schemas.ts'
32
34
  import { TrialResultSchema } from '../schemas.ts'
35
+ import { computeLatencyStats, percentile } from './compare-utils.ts'
33
36
  import type {
34
37
  ComparisonGraderResult,
35
38
  LabeledRun,
@@ -148,19 +151,6 @@ const getTrialsGrader = async (
148
151
  }
149
152
  }
150
153
 
151
- /**
152
- * Compute percentile from sorted array.
153
- *
154
- * @param sorted - Sorted array of numbers
155
- * @param p - Percentile (0-1)
156
- * @returns Value at percentile
157
- */
158
- const percentile = (sorted: number[], p: number): number => {
159
- if (sorted.length === 0) return 0
160
- const idx = Math.floor(sorted.length * p)
161
- return sorted[Math.min(idx, sorted.length - 1)] ?? 0
162
- }
163
-
164
154
  /**
165
155
  * Compute capability metrics from trial results.
166
156
  *
@@ -245,6 +235,71 @@ const computeFlakinessMetrics = (results: TrialResult[], maxTopFlaky: number = 1
245
235
  }
246
236
  }
247
237
 
238
+ /** Result from quality metrics computation, including raw scores for CI reuse */
239
+ type QualityComputeResult = {
240
+ metrics: TrialsQualityMetrics
241
+ rawScores: number[]
242
+ }
243
+
244
+ /**
245
+ * Compute quality metrics from trial results.
246
+ *
247
+ * @remarks
248
+ * Flattens all trial scores across all prompts into a single distribution.
249
+ * Returns undefined if no scores are present (no grader was used).
250
+ * Returns raw scores alongside metrics to avoid re-traversal for CI computation.
251
+ *
252
+ * @param results - Array of trial results
253
+ * @returns Quality metrics with raw scores, or undefined if no scores
254
+ */
255
+ const computeTrialsQualityMetrics = (results: TrialResult[]): QualityComputeResult | undefined => {
256
+ const rawScores = results.flatMap((r) => r.trials.filter((t) => t.score !== undefined).map((t) => t.score as number))
257
+
258
+ if (rawScores.length === 0) return undefined
259
+
260
+ const sorted = [...rawScores].sort((a, b) => a - b)
261
+ const sum = rawScores.reduce((a, b) => a + b, 0)
262
+
263
+ return {
264
+ metrics: {
265
+ avgScore: sum / rawScores.length,
266
+ medianScore: percentile(sorted, 0.5),
267
+ p25Score: percentile(sorted, 0.25),
268
+ p75Score: percentile(sorted, 0.75),
269
+ },
270
+ rawScores,
271
+ }
272
+ }
273
+
274
+ /** Result from performance metrics computation, including raw durations for CI reuse */
275
+ type PerformanceComputeResult = {
276
+ metrics: TrialsPerformanceMetrics
277
+ rawDurations: number[]
278
+ }
279
+
280
+ /**
281
+ * Compute performance metrics from trial results.
282
+ *
283
+ * @remarks
284
+ * Flattens all trial durations across all prompts into latency statistics.
285
+ * Always returns a value since TrialEntry.duration is required.
286
+ * Returns raw durations alongside metrics to avoid re-traversal for CI computation.
287
+ *
288
+ * @param results - Array of trial results
289
+ * @returns Performance metrics with raw durations
290
+ */
291
+ const computeTrialsPerformanceMetrics = (results: TrialResult[]): PerformanceComputeResult => {
292
+ const rawDurations = results.flatMap((r) => r.trials.map((t) => t.duration))
293
+
294
+ return {
295
+ metrics: {
296
+ latency: computeLatencyStats(rawDurations),
297
+ totalDuration: rawDurations.reduce((a, b) => a + b, 0),
298
+ },
299
+ rawDurations,
300
+ }
301
+ }
302
+
248
303
  /**
249
304
  * Execute trials comparison and generate aggregate report.
250
305
  *
@@ -399,6 +454,12 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
399
454
  const capability: Record<string, TrialsCapabilityMetrics> = {}
400
455
  const reliability: Record<string, TrialsReliabilityMetrics> = {}
401
456
  const flakiness: Record<string, TrialsFlakinessMetrics> = {}
457
+ const quality: Record<string, TrialsQualityMetrics> = {}
458
+ const performance: Record<string, TrialsPerformanceMetrics> = {}
459
+ const rawScoresByRun: Record<string, number[]> = {}
460
+ const rawDurationsByRun: Record<string, number[]> = {}
461
+
462
+ let hasQuality = false
402
463
 
403
464
  for (const label of runLabels) {
404
465
  const resultsMap = runResults[label] ?? new Map()
@@ -407,6 +468,17 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
407
468
  capability[label] = computeCapabilityMetrics(results)
408
469
  reliability[label] = computeReliabilityMetrics(results)
409
470
  flakiness[label] = computeFlakinessMetrics(results)
471
+
472
+ const perfResult = computeTrialsPerformanceMetrics(results)
473
+ performance[label] = perfResult.metrics
474
+ rawDurationsByRun[label] = perfResult.rawDurations
475
+
476
+ const qualityResult = computeTrialsQualityMetrics(results)
477
+ if (qualityResult) {
478
+ quality[label] = qualityResult.metrics
479
+ rawScoresByRun[label] = qualityResult.rawScores
480
+ hasQuality = true
481
+ }
410
482
  }
411
483
 
412
484
  // Compute confidence intervals when using statistical strategy
@@ -415,9 +487,9 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
415
487
 
416
488
  for (const label of runLabels) {
417
489
  const resultsMap = runResults[label] ?? new Map()
418
- const results = [...resultsMap.values()]
419
- const passAtKValues = results.map((r) => r.passAtK ?? 0)
420
- const passExpKValues = results.map((r) => r.passExpK ?? 0)
490
+ const resultsArr = [...resultsMap.values()]
491
+ const passAtKValues = resultsArr.map((r) => r.passAtK ?? 0)
492
+ const passExpKValues = resultsArr.map((r) => r.passExpK ?? 0)
421
493
 
422
494
  // Capability CIs
423
495
  const capabilityMetrics = capability[label]
@@ -434,6 +506,24 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
434
506
  avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
435
507
  }
436
508
  }
509
+
510
+ // Quality CIs (only when scores present)
511
+ const qualityMetrics = quality[label]
512
+ const scores = rawScoresByRun[label]
513
+ if (qualityMetrics && scores && scores.length > 0) {
514
+ qualityMetrics.confidenceIntervals = {
515
+ avgScore: bootstrap(scores, bootstrapConfig).ci,
516
+ }
517
+ }
518
+
519
+ // Performance CIs
520
+ const performanceMetrics = performance[label]
521
+ const durations = rawDurationsByRun[label]
522
+ if (performanceMetrics && durations && durations.length > 0) {
523
+ performanceMetrics.confidenceIntervals = {
524
+ latencyMean: bootstrap(durations, bootstrapConfig).ci,
525
+ }
526
+ }
437
527
  }
438
528
  }
439
529
 
@@ -505,6 +595,8 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
505
595
  capability,
506
596
  reliability,
507
597
  flakiness,
598
+ quality: hasQuality ? quality : undefined,
599
+ performance,
508
600
  headToHead: {
509
601
  capability: capabilityPairwise,
510
602
  reliability: reliabilityPairwise,
@@ -528,8 +620,12 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
528
620
  for (const [label, cap] of Object.entries(capability)) {
529
621
  const rel = reliability[label]
530
622
  const flak = flakiness[label]
623
+ const perf = performance[label]
624
+ const qual = quality[label]
625
+ const qualStr = qual ? ` avgScore=${qual.avgScore.toFixed(3)}` : ''
626
+ const perfStr = perf ? ` latencyP50=${perf.latency.p50.toFixed(0)}ms` : ''
531
627
  logProgress(
532
- ` ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}`,
628
+ ` ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}${qualStr}${perfStr}`,
533
629
  progress,
534
630
  )
535
631
  }
@@ -620,6 +716,58 @@ const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string =>
620
716
  }
621
717
  lines.push('')
622
718
 
719
+ // Quality table (only when scores present)
720
+ if (report.quality && Object.keys(report.quality).length > 0) {
721
+ const hasQualityCIs = Object.values(report.quality).some((q) => q.confidenceIntervals)
722
+
723
+ lines.push('## Quality (Scores)')
724
+ lines.push('')
725
+ if (hasQualityCIs) {
726
+ lines.push('| Run | Avg Score | 95% CI | Median | P25 | P75 |')
727
+ lines.push('|-----|-----------|--------|--------|-----|-----|')
728
+ for (const [label, q] of Object.entries(report.quality)) {
729
+ const avgCI = formatCI(q.confidenceIntervals?.avgScore)
730
+ lines.push(
731
+ `| ${label} | ${q.avgScore.toFixed(3)} | ${avgCI} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`,
732
+ )
733
+ }
734
+ } else {
735
+ lines.push('| Run | Avg Score | Median | P25 | P75 |')
736
+ lines.push('|-----|-----------|--------|-----|-----|')
737
+ for (const [label, q] of Object.entries(report.quality)) {
738
+ lines.push(
739
+ `| ${label} | ${q.avgScore.toFixed(3)} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`,
740
+ )
741
+ }
742
+ }
743
+ lines.push('')
744
+ }
745
+
746
+ // Performance table (always present)
747
+ const hasPerfCIs = Object.values(report.performance).some((p) => p.confidenceIntervals)
748
+
749
+ lines.push('## Performance (Latency)')
750
+ lines.push('')
751
+ if (hasPerfCIs) {
752
+ lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI | Total (ms) |')
753
+ lines.push('|-----|----------|----------|----------|-----------|--------|------------|')
754
+ for (const [label, p] of Object.entries(report.performance)) {
755
+ const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0)
756
+ lines.push(
757
+ `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} | ${p.totalDuration.toFixed(0)} |`,
758
+ )
759
+ }
760
+ } else {
761
+ lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | Total (ms) |')
762
+ lines.push('|-----|----------|----------|----------|-----------|------------|')
763
+ for (const [label, p] of Object.entries(report.performance)) {
764
+ lines.push(
765
+ `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${p.totalDuration.toFixed(0)} |`,
766
+ )
767
+ }
768
+ }
769
+ lines.push('')
770
+
623
771
  // Head-to-head
624
772
  lines.push('## Head-to-Head')
625
773
  lines.push('')
@@ -0,0 +1,85 @@
1
+ /**
2
+ * Shared utility functions for comparison modules.
3
+ *
4
+ * @remarks
5
+ * Extracted from compare.ts and compare-trials.ts to avoid duplication.
6
+ * Contains statistical helpers used by both CaptureResult and TrialResult comparisons.
7
+ *
8
+ * @packageDocumentation
9
+ */
10
+
11
+ import type { LatencyStats, ScoreDistribution } from '../schemas.ts'
12
+
13
+ /**
14
+ * Compute percentile from sorted array using floor-index rank selection.
15
+ *
16
+ * @remarks
17
+ * Uses floor indexing (a variant of the nearest rank method). For an array of length N,
18
+ * returns the element at index `floor(N * p)`, clamped to the last element.
19
+ * This does not interpolate between ranks; e.g. the median of an even-length array is the upper middle element.
20
+ *
21
+ * @param sorted - Sorted array of numbers
22
+ * @param p - Percentile (0-1)
23
+ * @returns Value at percentile
24
+ *
25
+ * @public
26
+ */
27
+ export const percentile = (sorted: number[], p: number): number => {
28
+ if (sorted.length === 0) return 0
29
+ const idx = Math.floor(sorted.length * p)
30
+ return sorted[Math.min(idx, sorted.length - 1)] ?? 0
31
+ }
32
+
33
+ /**
34
+ * Compute latency statistics from array of durations.
35
+ *
36
+ * @param durations - Array of durations in milliseconds
37
+ * @returns Latency statistics
38
+ *
39
+ * @public
40
+ */
41
+ export const computeLatencyStats = (durations: number[]): LatencyStats => {
42
+ if (durations.length === 0) {
43
+ return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
44
+ }
45
+
46
+ const sorted = [...durations].sort((a, b) => a - b)
47
+ const sum = sorted.reduce((a, b) => a + b, 0)
48
+
49
+ return {
50
+ p50: percentile(sorted, 0.5),
51
+ p90: percentile(sorted, 0.9),
52
+ p99: percentile(sorted, 0.99),
53
+ mean: sum / sorted.length,
54
+ min: sorted[0] ?? 0,
55
+ max: sorted[sorted.length - 1] ?? 0,
56
+ }
57
+ }
58
+
59
+ /**
60
+ * Compute score distribution histogram.
61
+ *
62
+ * @param scores - Array of scores (0-1)
63
+ * @returns Score distribution histogram
64
+ *
65
+ * @public
66
+ */
67
+ export const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
68
+ const dist: ScoreDistribution = {
69
+ '0.0-0.2': 0,
70
+ '0.2-0.4': 0,
71
+ '0.4-0.6': 0,
72
+ '0.6-0.8': 0,
73
+ '0.8-1.0': 0,
74
+ }
75
+
76
+ for (const score of scores) {
77
+ if (score < 0.2) dist['0.0-0.2']++
78
+ else if (score < 0.4) dist['0.2-0.4']++
79
+ else if (score < 0.6) dist['0.4-0.6']++
80
+ else if (score < 0.8) dist['0.6-0.8']++
81
+ else dist['0.8-1.0']++
82
+ }
83
+
84
+ return dist
85
+ }
@@ -33,18 +33,17 @@ import type {
33
33
  ComparisonMeta,
34
34
  ComparisonReport,
35
35
  HeadToHead,
36
- LatencyStats,
37
36
  PairwiseComparison,
38
37
  PerformanceMetrics,
39
38
  PromptComparison,
40
39
  QualityMetrics,
41
40
  ReliabilityMetrics,
42
- ScoreDistribution,
43
41
  TrajectoryInfo,
44
42
  TrajectoryRichness,
45
43
  } from '../schemas.ts'
46
44
  import { type CompareInputFormat, detectAndValidateFormat } from './compare-format-detection.ts'
47
45
  import { runTrialsCompare } from './compare-trials.ts'
46
+ import { computeLatencyStats, computeScoreDistribution } from './compare-utils.ts'
48
47
  import type {
49
48
  CompareConfig,
50
49
  ComparisonGrader,
@@ -197,69 +196,6 @@ const getGrader = async (strategy: CompareStrategy, graderPath?: string): Promis
197
196
  }
198
197
  }
199
198
 
200
- /**
201
- * Compute percentile from sorted array.
202
- *
203
- * @param sorted - Sorted array of numbers
204
- * @param p - Percentile (0-1)
205
- * @returns Value at percentile
206
- */
207
- const percentile = (sorted: number[], p: number): number => {
208
- if (sorted.length === 0) return 0
209
- const idx = Math.floor(sorted.length * p)
210
- return sorted[Math.min(idx, sorted.length - 1)] ?? 0
211
- }
212
-
213
- /**
214
- * Compute latency statistics from array of durations.
215
- *
216
- * @param durations - Array of durations in milliseconds
217
- * @returns Latency statistics
218
- */
219
- const computeLatencyStats = (durations: number[]): LatencyStats => {
220
- if (durations.length === 0) {
221
- return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
222
- }
223
-
224
- const sorted = [...durations].sort((a, b) => a - b)
225
- const sum = sorted.reduce((a, b) => a + b, 0)
226
-
227
- return {
228
- p50: percentile(sorted, 0.5),
229
- p90: percentile(sorted, 0.9),
230
- p99: percentile(sorted, 0.99),
231
- mean: sum / sorted.length,
232
- min: sorted[0] ?? 0,
233
- max: sorted[sorted.length - 1] ?? 0,
234
- }
235
- }
236
-
237
- /**
238
- * Compute score distribution histogram.
239
- *
240
- * @param scores - Array of scores (0-1)
241
- * @returns Score distribution histogram
242
- */
243
- const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
244
- const dist: ScoreDistribution = {
245
- '0.0-0.2': 0,
246
- '0.2-0.4': 0,
247
- '0.4-0.6': 0,
248
- '0.6-0.8': 0,
249
- '0.8-1.0': 0,
250
- }
251
-
252
- for (const score of scores) {
253
- if (score < 0.2) dist['0.0-0.2']++
254
- else if (score < 0.4) dist['0.2-0.4']++
255
- else if (score < 0.6) dist['0.4-0.6']++
256
- else if (score < 0.8) dist['0.6-0.8']++
257
- else dist['0.8-1.0']++
258
- }
259
-
260
- return dist
261
- }
262
-
263
199
  /**
264
200
  * Detect trajectory richness from capture results.
265
201
  *
@@ -14,20 +14,23 @@ import { buildTrialsIndex, runTrialsCompare } from '../compare-trials.ts'
14
14
  // Test Fixtures
15
15
  // ============================================================================
16
16
 
17
- const createTrialResult = (id: string, passAtK: number, passExpK: number, k: number = 3) => ({
17
+ const createTrialResult = (
18
+ id: string,
19
+ passAtK: number,
20
+ passExpK: number,
21
+ k: number = 3,
22
+ includeScores: boolean = true,
23
+ ) => ({
18
24
  id,
19
25
  input: `Prompt for ${id}`,
20
26
  k,
21
- passRate: passAtK,
22
- passAtK,
23
- passExpK,
27
+ ...(includeScores && { passRate: passAtK, passAtK, passExpK }),
24
28
  trials: Array.from({ length: k }, (_, i) => ({
25
29
  trialNum: i + 1,
26
30
  output: `Output ${i + 1}`,
27
31
  trajectory: [],
28
32
  duration: 100 + i * 10,
29
- pass: Math.random() < passAtK,
30
- score: passAtK,
33
+ ...(includeScores && { pass: Math.random() < passAtK, score: passAtK }),
31
34
  })),
32
35
  })
33
36
 
@@ -417,4 +420,172 @@ describe('runTrialsCompare', () => {
417
420
  const topFlakyIds = flak?.topFlakyPrompts.map((p) => p.id) ?? []
418
421
  expect(topFlakyIds).toContain('flaky')
419
422
  })
423
+
424
+ test('includes performance metrics with latency stats', async () => {
425
+ const run1Path = `${tempDir}/perf-run1.jsonl`
426
+ const run2Path = `${tempDir}/perf-run2.jsonl`
427
+
428
+ const trial1 = createTrialResult('test-001', 0.9, 0.7)
429
+ const trial2 = createTrialResult('test-001', 0.8, 0.6)
430
+
431
+ await Bun.write(run1Path, JSON.stringify(trial1))
432
+ await Bun.write(run2Path, JSON.stringify(trial2))
433
+
434
+ const report = await runTrialsCompare({
435
+ runs: [
436
+ { label: 'run1', path: run1Path },
437
+ { label: 'run2', path: run2Path },
438
+ ],
439
+ progress: false,
440
+ })
441
+
442
+ // Performance should always be present
443
+ expect(report.performance).toBeDefined()
444
+ expect(report.performance.run1).toBeDefined()
445
+ expect(report.performance.run2).toBeDefined()
446
+
447
+ const perf = report.performance.run1
448
+ expect(perf?.latency).toBeDefined()
449
+ expect(perf?.latency.p50).toBeGreaterThan(0)
450
+ expect(perf?.latency.mean).toBeGreaterThan(0)
451
+ expect(perf?.latency.min).toBeGreaterThan(0)
452
+ expect(perf?.latency.max).toBeGreaterThan(0)
453
+ expect(perf?.totalDuration).toBeGreaterThan(0)
454
+ })
455
+
456
+ test('includes quality metrics when scores are present', async () => {
457
+ const run1Path = `${tempDir}/qual-run1.jsonl`
458
+ const run2Path = `${tempDir}/qual-run2.jsonl`
459
+
460
+ // createTrialResult includes score fields by default (includeScores defaults to true)
461
+ const trial1 = createTrialResult('test-001', 0.9, 0.7)
462
+ const trial2 = createTrialResult('test-001', 0.8, 0.6)
463
+
464
+ await Bun.write(run1Path, JSON.stringify(trial1))
465
+ await Bun.write(run2Path, JSON.stringify(trial2))
466
+
467
+ const report = await runTrialsCompare({
468
+ runs: [
469
+ { label: 'run1', path: run1Path },
470
+ { label: 'run2', path: run2Path },
471
+ ],
472
+ progress: false,
473
+ })
474
+
475
+ // Quality should be present since trials have scores
476
+ expect(report.quality).toBeDefined()
477
+ expect(report.quality?.run1).toBeDefined()
478
+
479
+ const qual = report.quality?.run1
480
+ expect(qual?.avgScore).toBeGreaterThan(0)
481
+ expect(qual?.medianScore).toBeGreaterThan(0)
482
+ expect(qual?.p25Score).toBeDefined()
483
+ expect(qual?.p75Score).toBeDefined()
484
+ })
485
+
486
+ test('omits quality metrics when scores are absent', async () => {
487
+ const run1Path = `${tempDir}/noqual-run1.jsonl`
488
+ const run2Path = `${tempDir}/noqual-run2.jsonl`
489
+
490
+ // Create trials without scores (includeScores=false)
491
+ const trial1 = createTrialResult('test-001', 0, 0, 3, false)
492
+ const trial2 = createTrialResult('test-001', 0, 0, 3, false)
493
+
494
+ await Bun.write(run1Path, JSON.stringify(trial1))
495
+ await Bun.write(run2Path, JSON.stringify(trial2))
496
+
497
+ const report = await runTrialsCompare({
498
+ runs: [
499
+ { label: 'run1', path: run1Path },
500
+ { label: 'run2', path: run2Path },
501
+ ],
502
+ progress: false,
503
+ })
504
+
505
+ // Quality should NOT be present since no trials have scores
506
+ expect(report.quality).toBeUndefined()
507
+
508
+ // Performance should still be present
509
+ expect(report.performance).toBeDefined()
510
+ expect(report.performance.run1?.latency.mean).toBeGreaterThan(0)
511
+ })
512
+
513
+ test('statistical strategy computes CIs for quality and performance', async () => {
514
+ const run1Path = `${tempDir}/ci-qp-run1.jsonl`
515
+ const run2Path = `${tempDir}/ci-qp-run2.jsonl`
516
+
517
+ const trials1 = [
518
+ createTrialResult('p1', 0.9, 0.8),
519
+ createTrialResult('p2', 0.85, 0.7),
520
+ createTrialResult('p3', 0.95, 0.9),
521
+ ]
522
+ const trials2 = [
523
+ createTrialResult('p1', 0.6, 0.4),
524
+ createTrialResult('p2', 0.5, 0.3),
525
+ createTrialResult('p3', 0.7, 0.5),
526
+ ]
527
+
528
+ await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
529
+ await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
530
+
531
+ const report = await runTrialsCompare({
532
+ runs: [
533
+ { label: 'high', path: run1Path },
534
+ { label: 'low', path: run2Path },
535
+ ],
536
+ strategy: 'statistical',
537
+ progress: false,
538
+ })
539
+
540
+ // Quality CIs
541
+ const highQual = report.quality?.high
542
+ expect(highQual).toBeDefined()
543
+ expect(highQual?.confidenceIntervals).toBeDefined()
544
+ expect(highQual?.confidenceIntervals?.avgScore).toBeDefined()
545
+
546
+ const qualCI = highQual?.confidenceIntervals?.avgScore
547
+ expect(qualCI).toHaveLength(2)
548
+ expect(qualCI?.[0]).toBeLessThanOrEqual(qualCI?.[1] ?? 0)
549
+
550
+ // Performance CIs
551
+ const highPerf = report.performance.high
552
+ expect(highPerf).toBeDefined()
553
+ expect(highPerf?.confidenceIntervals).toBeDefined()
554
+ expect(highPerf?.confidenceIntervals?.latencyMean).toBeDefined()
555
+
556
+ const perfCI = highPerf?.confidenceIntervals?.latencyMean
557
+ expect(perfCI).toHaveLength(2)
558
+ expect(perfCI?.[0]).toBeLessThanOrEqual(perfCI?.[1] ?? 0)
559
+ })
560
+
561
+ test('markdown output includes quality and performance tables', async () => {
562
+ const run1Path = `${tempDir}/md-qp-run1.jsonl`
563
+ const run2Path = `${tempDir}/md-qp-run2.jsonl`
564
+ const outputPath = `${tempDir}/qp-report.md`
565
+
566
+ const trial1 = createTrialResult('test-001', 0.9, 0.7)
567
+ const trial2 = createTrialResult('test-001', 0.8, 0.6)
568
+
569
+ await Bun.write(run1Path, JSON.stringify(trial1))
570
+ await Bun.write(run2Path, JSON.stringify(trial2))
571
+
572
+ await runTrialsCompare({
573
+ runs: [
574
+ { label: 'agent1', path: run1Path },
575
+ { label: 'agent2', path: run2Path },
576
+ ],
577
+ outputPath,
578
+ format: 'markdown',
579
+ progress: false,
580
+ })
581
+
582
+ const content = await Bun.file(outputPath).text()
583
+
584
+ // Should contain quality and performance sections
585
+ expect(content).toContain('## Quality (Scores)')
586
+ expect(content).toContain('## Performance (Latency)')
587
+ expect(content).toContain('Avg Score')
588
+ expect(content).toContain('P50 (ms)')
589
+ expect(content).toContain('Mean (ms)')
590
+ })
420
591
  })
@@ -0,0 +1,128 @@
1
+ /**
2
+ * Unit tests for compare-utils shared utilities.
3
+ *
4
+ * @remarks
5
+ * Tests for percentile, computeLatencyStats, and computeScoreDistribution.
6
+ *
7
+ * @packageDocumentation
8
+ */
9
+
10
+ import { describe, expect, test } from 'bun:test'
11
+ import { computeLatencyStats, computeScoreDistribution, percentile } from '../compare-utils.ts'
12
+
13
+ // ============================================================================
14
+ // percentile Tests
15
+ // ============================================================================
16
+
17
+ describe('percentile', () => {
18
+ test('computes correct percentile values', () => {
19
+ const sorted = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
20
+
21
+ expect(percentile(sorted, 0.5)).toBe(60)
22
+ expect(percentile(sorted, 0.25)).toBe(30)
23
+ expect(percentile(sorted, 0.75)).toBe(80)
24
+ expect(percentile(sorted, 0.9)).toBe(100)
25
+ })
26
+
27
+ test('returns 0 for empty array', () => {
28
+ expect(percentile([], 0.5)).toBe(0)
29
+ })
30
+
31
+ test('handles single-element array', () => {
32
+ expect(percentile([42], 0.5)).toBe(42)
33
+ expect(percentile([42], 0.0)).toBe(42)
34
+ expect(percentile([42], 1.0)).toBe(42)
35
+ })
36
+
37
+ test('handles p=0 and p=1 boundary values', () => {
38
+ const sorted = [10, 20, 30]
39
+
40
+ expect(percentile(sorted, 0)).toBe(10)
41
+ expect(percentile(sorted, 1)).toBe(30)
42
+ })
43
+ })
44
+
45
+ // ============================================================================
46
+ // computeLatencyStats Tests
47
+ // ============================================================================
48
+
49
+ describe('computeLatencyStats', () => {
50
+ test('returns correct stats for typical durations', () => {
51
+ const durations = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
52
+ const stats = computeLatencyStats(durations)
53
+
54
+ expect(stats.min).toBe(100)
55
+ expect(stats.max).toBe(1000)
56
+ expect(stats.mean).toBe(550)
57
+ expect(stats.p50).toBe(600)
58
+ expect(stats.p90).toBe(1000)
59
+ })
60
+
61
+ test('returns zeros for empty array', () => {
62
+ const stats = computeLatencyStats([])
63
+
64
+ expect(stats.p50).toBe(0)
65
+ expect(stats.p90).toBe(0)
66
+ expect(stats.p99).toBe(0)
67
+ expect(stats.mean).toBe(0)
68
+ expect(stats.min).toBe(0)
69
+ expect(stats.max).toBe(0)
70
+ })
71
+
72
+ test('handles single-element array', () => {
73
+ const stats = computeLatencyStats([42])
74
+
75
+ expect(stats.p50).toBe(42)
76
+ expect(stats.p90).toBe(42)
77
+ expect(stats.mean).toBe(42)
78
+ expect(stats.min).toBe(42)
79
+ expect(stats.max).toBe(42)
80
+ })
81
+
82
+ test('sorts unsorted input', () => {
83
+ const stats = computeLatencyStats([500, 100, 300, 200, 400])
84
+
85
+ expect(stats.min).toBe(100)
86
+ expect(stats.max).toBe(500)
87
+ expect(stats.mean).toBe(300)
88
+ })
89
+ })
90
+
91
+ // ============================================================================
92
+ // computeScoreDistribution Tests
93
+ // ============================================================================
94
+
95
+ describe('computeScoreDistribution', () => {
96
+ test('distributes scores into correct buckets', () => {
97
+ const scores = [0.1, 0.3, 0.5, 0.7, 0.9]
98
+ const dist = computeScoreDistribution(scores)
99
+
100
+ expect(dist['0.0-0.2']).toBe(1)
101
+ expect(dist['0.2-0.4']).toBe(1)
102
+ expect(dist['0.4-0.6']).toBe(1)
103
+ expect(dist['0.6-0.8']).toBe(1)
104
+ expect(dist['0.8-1.0']).toBe(1)
105
+ })
106
+
107
+ test('handles empty scores array', () => {
108
+ const dist = computeScoreDistribution([])
109
+
110
+ expect(dist['0.0-0.2']).toBe(0)
111
+ expect(dist['0.2-0.4']).toBe(0)
112
+ expect(dist['0.4-0.6']).toBe(0)
113
+ expect(dist['0.6-0.8']).toBe(0)
114
+ expect(dist['0.8-1.0']).toBe(0)
115
+ })
116
+
117
+ test('handles boundary values correctly', () => {
118
+ // 0.0 → first bucket, 0.2 → second bucket (not first), 1.0 → last bucket
119
+ const scores = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
120
+ const dist = computeScoreDistribution(scores)
121
+
122
+ expect(dist['0.0-0.2']).toBe(1) // 0.0
123
+ expect(dist['0.2-0.4']).toBe(1) // 0.2
124
+ expect(dist['0.4-0.6']).toBe(1) // 0.4
125
+ expect(dist['0.6-0.8']).toBe(1) // 0.6
126
+ expect(dist['0.8-1.0']).toBe(2) // 0.8, 1.0
127
+ })
128
+ })
@@ -923,6 +923,70 @@ export const TrialsFlakinessMetricsSchema = z.object({
923
923
  /** Trials flakiness metrics type */
924
924
  export type TrialsFlakinessMetrics = z.infer<typeof TrialsFlakinessMetricsSchema>
925
925
 
926
+ /**
927
+ * Confidence intervals for trials quality metrics.
928
+ */
929
+ export const TrialsQualityConfidenceIntervalsSchema = z.object({
930
+ /** CI for avgScore */
931
+ avgScore: ConfidenceIntervalSchema.optional(),
932
+ })
933
+
934
+ /** Trials quality confidence intervals type */
935
+ export type TrialsQualityConfidenceIntervals = z.infer<typeof TrialsQualityConfidenceIntervalsSchema>
936
+
937
+ /**
938
+ * Quality metrics for trials comparison (score-based).
939
+ *
940
+ * @remarks
941
+ * Aggregates grader scores across all trials of all prompts in a run.
942
+ * Only present when a grader was used during trials capture.
943
+ */
944
+ export const TrialsQualityMetricsSchema = z.object({
945
+ /** Average score across all trials */
946
+ avgScore: z.number(),
947
+ /** Median score */
948
+ medianScore: z.number(),
949
+ /** 25th percentile score */
950
+ p25Score: z.number(),
951
+ /** 75th percentile score */
952
+ p75Score: z.number(),
953
+ /** Confidence intervals (only with strategy=statistical) */
954
+ confidenceIntervals: TrialsQualityConfidenceIntervalsSchema.optional(),
955
+ })
956
+
957
+ /** Trials quality metrics type */
958
+ export type TrialsQualityMetrics = z.infer<typeof TrialsQualityMetricsSchema>
959
+
960
+ /**
961
+ * Confidence intervals for trials performance metrics.
962
+ */
963
+ export const TrialsPerformanceConfidenceIntervalsSchema = z.object({
964
+ /** CI for latency mean */
965
+ latencyMean: ConfidenceIntervalSchema.optional(),
966
+ })
967
+
968
+ /** Trials performance confidence intervals type */
969
+ export type TrialsPerformanceConfidenceIntervals = z.infer<typeof TrialsPerformanceConfidenceIntervalsSchema>
970
+
971
+ /**
972
+ * Performance metrics for trials comparison (latency-based).
973
+ *
974
+ * @remarks
975
+ * Aggregates trial durations across all prompts.
976
+ * Always present since TrialEntry.duration is required.
977
+ */
978
+ export const TrialsPerformanceMetricsSchema = z.object({
979
+ /** End-to-end latency statistics across all trials */
980
+ latency: LatencyStatsSchema,
981
+ /** Sum of all trial durations in milliseconds */
982
+ totalDuration: z.number(),
983
+ /** Confidence intervals (only with strategy=statistical) */
984
+ confidenceIntervals: TrialsPerformanceConfidenceIntervalsSchema.optional(),
985
+ })
986
+
987
+ /** Trials performance metrics type */
988
+ export type TrialsPerformanceMetrics = z.infer<typeof TrialsPerformanceMetricsSchema>
989
+
926
990
  /**
927
991
  * Per-prompt metrics for trials comparison drill-down.
928
992
  */
@@ -984,6 +1048,10 @@ export const TrialsComparisonReportSchema = z.object({
984
1048
  reliability: z.record(z.string(), TrialsReliabilityMetricsSchema),
985
1049
  /** Flakiness metrics by run label */
986
1050
  flakiness: z.record(z.string(), TrialsFlakinessMetricsSchema),
1051
+ /** Quality metrics by run label (only when grader scores are present) */
1052
+ quality: z.record(z.string(), TrialsQualityMetricsSchema).optional(),
1053
+ /** Performance metrics by run label (always present, uses trial.duration) */
1054
+ performance: z.record(z.string(), TrialsPerformanceMetricsSchema),
987
1055
  /** Head-to-head comparison details */
988
1056
  headToHead: z.object({
989
1057
  /** Pairwise wins by capability */
package/src/schemas.ts CHANGED
@@ -113,8 +113,16 @@ export {
113
113
  TrialsComparisonReportSchema,
114
114
  type TrialsFlakinessMetrics,
115
115
  TrialsFlakinessMetricsSchema,
116
+ type TrialsPerformanceConfidenceIntervals,
117
+ TrialsPerformanceConfidenceIntervalsSchema,
118
+ type TrialsPerformanceMetrics,
119
+ TrialsPerformanceMetricsSchema,
116
120
  type TrialsPromptComparison,
117
121
  TrialsPromptComparisonSchema,
122
+ type TrialsQualityConfidenceIntervals,
123
+ TrialsQualityConfidenceIntervalsSchema,
124
+ type TrialsQualityMetrics,
125
+ TrialsQualityMetricsSchema,
118
126
  type TrialsReliabilityMetrics,
119
127
  TrialsReliabilityMetricsSchema,
120
128
  type ValidationResult,