npm - @plaited/agent-eval-harness - Versions diffs - 0.12.0 → 0.12.2 - Mend

@plaited/agent-eval-harness 0.12.0 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/package.json +4 -4
package/src/pipeline/compare-trials.ts +166 -17
package/src/pipeline/compare-utils.ts +85 -0
package/src/pipeline/compare.ts +2 -65
package/src/pipeline/tests/compare-statistical.spec.ts +4 -0
package/src/pipeline/tests/compare-trials.spec.ts +178 -6
package/src/pipeline/tests/compare-utils.spec.ts +128 -0
package/src/schemas/schemas.ts +72 -0
package/src/schemas.ts +8 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@plaited/agent-eval-harness",
-  "version": "0.12.0",
+  "version": "0.12.2",
   "description": "CLI tool for capturing agent trajectories from headless CLI agents",
   "license": "ISC",
   "engines": {
@@ -56,12 +56,12 @@
     ]
   },
   "dependencies": {
-    "@plaited/development-skills": "0.7.0",
+    "@plaited/development-skills": "0.8.0",
     "zod": "^4.3.6"
   },
   "devDependencies": {
-    "@biomejs/biome": "2.3.12",
-    "@types/bun": "1.3.6",
+    "@biomejs/biome": "2.3.14",
+    "@types/bun": "1.3.9",
     "format-package": "7.0.0",
     "lint-staged": "16.2.7",
     "typescript": "5.9.3"

package/src/pipeline/compare-trials.ts CHANGED Viewed

@@ -26,10 +26,13 @@ import type {
   TrialsComparisonMeta,
   TrialsComparisonReport,
   TrialsFlakinessMetrics,
+  TrialsPerformanceMetrics,
   TrialsPromptComparison,
+  TrialsQualityMetrics,
   TrialsReliabilityMetrics,
 } from '../schemas.ts'
 import { TrialResultSchema } from '../schemas.ts'
+import { computeLatencyStats, percentile } from './compare-utils.ts'
 import type {
   ComparisonGraderResult,
   LabeledRun,
@@ -148,19 +151,6 @@ const getTrialsGrader = async (
   }
 }
-/**
- * Compute percentile from sorted array.
- *
- * @param sorted - Sorted array of numbers
- * @param p - Percentile (0-1)
- * @returns Value at percentile
- */
-const percentile = (sorted: number[], p: number): number => {
-  if (sorted.length === 0) return 0
-  const idx = Math.floor(sorted.length * p)
-  return sorted[Math.min(idx, sorted.length - 1)] ?? 0
-}
 /**
  * Compute capability metrics from trial results.
  *
@@ -245,6 +235,72 @@ const computeFlakinessMetrics = (results: TrialResult[], maxTopFlaky: number = 1
   }
 }
+/**
+ * Result from quality metrics computation, including raw scores for CI reuse.
+ *
+ * @remarks
+ * The flattened per-trial scores travel next to the aggregate metrics so the
+ * confidence-interval step can bootstrap over them without walking the trial
+ * results a second time.
+ */
+type QualityComputeResult = {
+  metrics: TrialsQualityMetrics // aggregate score statistics for one run
+  rawScores: number[] // every defined trial score in traversal order (not sorted)
+}
+/**
+ * Compute quality (score) metrics from trial results.
+ *
+ * @remarks
+ * Flattens all trial scores across all prompts into a single distribution;
+ * trials whose `score` is `undefined` are skipped. The average is taken over
+ * that flattened list, so a prompt with more scored trials weighs more.
+ * Median, P25 and P75 are read from an ascending-sorted copy via `percentile`,
+ * which picks an element by index, so they are observed scores rather than
+ * interpolated values.
+ * Returns undefined if no scores are present (presumably no grader was used).
+ * Returns raw scores alongside metrics to avoid re-traversal for CI computation.
+ *
+ * @param results - Array of trial results
+ * @returns Quality metrics with the unsorted raw scores, or undefined if no scores
+ */
+const computeTrialsQualityMetrics = (results: TrialResult[]): QualityComputeResult | undefined => {
+  const rawScores = results.flatMap((r) => r.trials.filter((t) => t.score !== undefined).map((t) => t.score as number))
+  if (rawScores.length === 0) return undefined // nothing scored: caller omits quality for this run
+  const sorted = [...rawScores].sort((a, b) => a - b) // copy, so rawScores keeps traversal order
+  const sum = rawScores.reduce((a, b) => a + b, 0)
+  return {
+    metrics: {
+      type: 'trial',
+      avgScore: sum / rawScores.length,
+      medianScore: percentile(sorted, 0.5),
+      p25Score: percentile(sorted, 0.25),
+      p75Score: percentile(sorted, 0.75),
+    },
+    rawScores,
+  }
+}
+/**
+ * Result from performance metrics computation, including raw durations for CI reuse.
+ *
+ * @remarks
+ * The flattened per-trial durations travel next to the aggregate metrics so the
+ * confidence-interval step can bootstrap over them without walking the trial
+ * results a second time.
+ */
+type PerformanceComputeResult = {
+  metrics: TrialsPerformanceMetrics // latency statistics and total duration for one run
+  rawDurations: number[] // every trial duration in traversal order (not sorted)
+}
+/**
+ * Compute performance (latency) metrics from trial results.
+ *
+ * @remarks
+ * Flattens all trial durations across all prompts into latency statistics.
+ * Always returns a value since TrialEntry.duration is required.
+ * When there are no trials at all, `computeLatencyStats` yields all-zero
+ * latency statistics and `totalDuration` is 0.
+ * Returns raw durations alongside metrics to avoid re-traversal for CI computation.
+ *
+ * @param results - Array of trial results
+ * @returns Performance metrics with the unsorted raw durations
+ */
+const computeTrialsPerformanceMetrics = (results: TrialResult[]): PerformanceComputeResult => {
+  const rawDurations = results.flatMap((r) => r.trials.map((t) => t.duration))
+  return {
+    metrics: {
+      latency: computeLatencyStats(rawDurations),
+      totalDuration: rawDurations.reduce((a, b) => a + b, 0), // plain sum over every trial
+    },
+    rawDurations,
+  }
+}
 /**
  * Execute trials comparison and generate aggregate report.
  *
@@ -399,6 +455,12 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
   const capability: Record<string, TrialsCapabilityMetrics> = {}
   const reliability: Record<string, TrialsReliabilityMetrics> = {}
   const flakiness: Record<string, TrialsFlakinessMetrics> = {}
+  const quality: Record<string, TrialsQualityMetrics> = {}
+  const performance: Record<string, TrialsPerformanceMetrics> = {}
+  const rawScoresByRun: Record<string, number[]> = {}
+  const rawDurationsByRun: Record<string, number[]> = {}
+  let hasQuality = false
   for (const label of runLabels) {
     const resultsMap = runResults[label] ?? new Map()
@@ -407,6 +469,17 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
     capability[label] = computeCapabilityMetrics(results)
     reliability[label] = computeReliabilityMetrics(results)
     flakiness[label] = computeFlakinessMetrics(results)
+    const perfResult = computeTrialsPerformanceMetrics(results)
+    performance[label] = perfResult.metrics
+    rawDurationsByRun[label] = perfResult.rawDurations
+    const qualityResult = computeTrialsQualityMetrics(results)
+    if (qualityResult) {
+      quality[label] = qualityResult.metrics
+      rawScoresByRun[label] = qualityResult.rawScores
+      hasQuality = true
+    }
   }
   // Compute confidence intervals when using statistical strategy
@@ -415,9 +488,9 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
     for (const label of runLabels) {
       const resultsMap = runResults[label] ?? new Map()
-      const results = [...resultsMap.values()]
-      const passAtKValues = results.map((r) => r.passAtK ?? 0)
-      const passExpKValues = results.map((r) => r.passExpK ?? 0)
+      const resultsArr = [...resultsMap.values()]
+      const passAtKValues = resultsArr.map((r) => r.passAtK ?? 0)
+      const passExpKValues = resultsArr.map((r) => r.passExpK ?? 0)
       // Capability CIs
       const capabilityMetrics = capability[label]
@@ -434,6 +507,24 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
           avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
         }
       }
+      // Quality CIs (only when scores present)
+      const qualityMetrics = quality[label]
+      const scores = rawScoresByRun[label]
+      if (qualityMetrics && scores && scores.length > 0) {
+        qualityMetrics.confidenceIntervals = {
+          avgScore: bootstrap(scores, bootstrapConfig).ci,
+        }
+      }
+      // Performance CIs
+      const performanceMetrics = performance[label]
+      const durations = rawDurationsByRun[label]
+      if (performanceMetrics && durations && durations.length > 0) {
+        performanceMetrics.confidenceIntervals = {
+          latencyMean: bootstrap(durations, bootstrapConfig).ci,
+        }
+      }
     }
   }
@@ -505,6 +596,8 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
     capability,
     reliability,
     flakiness,
+    quality: hasQuality ? quality : undefined,
+    performance,
     headToHead: {
       capability: capabilityPairwise,
       reliability: reliabilityPairwise,
@@ -528,8 +621,12 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
   for (const [label, cap] of Object.entries(capability)) {
     const rel = reliability[label]
     const flak = flakiness[label]
+    const perf = performance[label]
+    const qual = quality[label]
+    const qualStr = qual ? ` avgScore=${qual.avgScore.toFixed(3)}` : ''
+    const perfStr = perf ? ` latencyP50=${perf.latency.p50.toFixed(0)}ms` : ''
     logProgress(
-      `  ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}`,
+      `  ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}${qualStr}${perfStr}`,
       progress,
     )
   }
@@ -620,6 +717,58 @@ const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string =>
   }
   lines.push('')
+  // Quality table (only when scores present)
+  if (report.quality && Object.keys(report.quality).length > 0) {
+    const hasQualityCIs = Object.values(report.quality).some((q) => q.confidenceIntervals)
+    lines.push('## Quality (Scores)')
+    lines.push('')
+    if (hasQualityCIs) {
+      lines.push('| Run | Avg Score | 95% CI | Median | P25 | P75 |')
+      lines.push('|-----|-----------|--------|--------|-----|-----|')
+      for (const [label, q] of Object.entries(report.quality)) {
+        const avgCI = formatCI(q.confidenceIntervals?.avgScore)
+        lines.push(
+          `| ${label} | ${q.avgScore.toFixed(3)} | ${avgCI} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`,
+        )
+      }
+    } else {
+      lines.push('| Run | Avg Score | Median | P25 | P75 |')
+      lines.push('|-----|-----------|--------|-----|-----|')
+      for (const [label, q] of Object.entries(report.quality)) {
+        lines.push(
+          `| ${label} | ${q.avgScore.toFixed(3)} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`,
+        )
+      }
+    }
+    lines.push('')
+  }
+  // Performance table (always present)
+  const hasPerfCIs = Object.values(report.performance).some((p) => p.confidenceIntervals)
+  lines.push('## Performance (Latency)')
+  lines.push('')
+  if (hasPerfCIs) {
+    lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI | Total (ms) |')
+    lines.push('|-----|----------|----------|----------|-----------|--------|------------|')
+    for (const [label, p] of Object.entries(report.performance)) {
+      const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0)
+      lines.push(
+        `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} | ${p.totalDuration.toFixed(0)} |`,
+      )
+    }
+  } else {
+    lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | Total (ms) |')
+    lines.push('|-----|----------|----------|----------|-----------|------------|')
+    for (const [label, p] of Object.entries(report.performance)) {
+      lines.push(
+        `| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${p.totalDuration.toFixed(0)} |`,
+      )
+    }
+  }
+  lines.push('')
   // Head-to-head
   lines.push('## Head-to-Head')
   lines.push('')

package/src/pipeline/compare-utils.ts ADDED Viewed

@@ -0,0 +1,85 @@
+/**
+ * Shared utility functions for comparison modules.
+ *
+ * @remarks
+ * Extracted from compare.ts and compare-trials.ts to avoid duplication.
+ * Contains statistical helpers used by both CaptureResult and TrialResult comparisons.
+ *
+ * @packageDocumentation
+ */
+import type { LatencyStats, ScoreDistribution } from '../schemas.ts'
+/**
+ * Pick a percentile from an already-sorted array by floor indexing.
+ *
+ * @remarks
+ * For an array of length N, returns the element at index `floor(N * p)`,
+ * clamped to the last element. This does not interpolate between ranks.
+ * Note that this is not the textbook nearest-rank rule (rank `ceil(N * p)`):
+ * for an even-length array and `p = 0.5` it yields the upper of the two
+ * middle elements, e.g. 60 for `[10, 20, ..., 100]`.
+ * The array is not sorted here; the caller must pass it in ascending order.
+ *
+ * @param sorted - Array of numbers, already sorted ascending
+ * @param p - Percentile as a fraction (0-1)
+ * @returns Value at the computed index, or 0 for an empty array
+ *
+ * @public
+ */
+export const percentile = (sorted: number[], p: number): number => {
+  if (sorted.length === 0) return 0
+  const idx = Math.floor(sorted.length * p)
+  return sorted[Math.min(idx, sorted.length - 1)] ?? 0 // clamp so p = 1 maps to the last element
+}
+/**
+ * Compute latency statistics from array of durations.
+ *
+ * @remarks
+ * Works on a sorted copy, so the caller's array is left untouched.
+ * P50, P90 and P99 come from `percentile` (floor indexing, no interpolation),
+ * so each of them is one of the input values.
+ * An empty input yields all-zero statistics.
+ *
+ * @param durations - Array of durations in milliseconds (any order)
+ * @returns Latency statistics: p50, p90, p99, mean, min and max
+ *
+ * @public
+ */
+export const computeLatencyStats = (durations: number[]): LatencyStats => {
+  if (durations.length === 0) {
+    return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
+  }
+  const sorted = [...durations].sort((a, b) => a - b) // numeric ascending sort on a copy
+  const sum = sorted.reduce((a, b) => a + b, 0)
+  return {
+    p50: percentile(sorted, 0.5),
+    p90: percentile(sorted, 0.9),
+    p99: percentile(sorted, 0.99),
+    mean: sum / sorted.length,
+    min: sorted[0] ?? 0,
+    max: sorted[sorted.length - 1] ?? 0,
+  }
+}
+/**
+ * Compute score distribution histogram.
+ *
+ * @remarks
+ * Counts scores into five fixed buckets. Each bucket includes its lower bound
+ * and excludes its upper bound (0.2 lands in '0.2-0.4'), except the last one,
+ * which also includes 1.0. Inputs are not range-checked: anything below 0.2,
+ * including negative values, is counted in the first bucket, and anything that
+ * fails every `<` comparison (values above 1, NaN) is counted in the last.
+ *
+ * @param scores - Array of scores (0-1)
+ * @returns Score distribution histogram (all counts 0 for an empty array)
+ *
+ * @public
+ */
+export const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
+  const dist: ScoreDistribution = {
+    '0.0-0.2': 0,
+    '0.2-0.4': 0,
+    '0.4-0.6': 0,
+    '0.6-0.8': 0,
+    '0.8-1.0': 0,
+  }
+  for (const score of scores) {
+    if (score < 0.2) dist['0.0-0.2']++
+    else if (score < 0.4) dist['0.2-0.4']++
+    else if (score < 0.6) dist['0.4-0.6']++
+    else if (score < 0.8) dist['0.6-0.8']++
+    else dist['0.8-1.0']++ // catch-all: >= 0.8, and also NaN
+  }
+  return dist
+}

package/src/pipeline/compare.ts CHANGED Viewed

@@ -33,18 +33,17 @@ import type {
   ComparisonMeta,
   ComparisonReport,
   HeadToHead,
-  LatencyStats,
   PairwiseComparison,
   PerformanceMetrics,
   PromptComparison,
   QualityMetrics,
   ReliabilityMetrics,
-  ScoreDistribution,
   TrajectoryInfo,
   TrajectoryRichness,
 } from '../schemas.ts'
 import { type CompareInputFormat, detectAndValidateFormat } from './compare-format-detection.ts'
 import { runTrialsCompare } from './compare-trials.ts'
+import { computeLatencyStats, computeScoreDistribution } from './compare-utils.ts'
 import type {
   CompareConfig,
   ComparisonGrader,
@@ -197,69 +196,6 @@ const getGrader = async (strategy: CompareStrategy, graderPath?: string): Promis
   }
 }
-/**
- * Compute percentile from sorted array.
- *
- * @param sorted - Sorted array of numbers
- * @param p - Percentile (0-1)
- * @returns Value at percentile
- */
-const percentile = (sorted: number[], p: number): number => {
-  if (sorted.length === 0) return 0
-  const idx = Math.floor(sorted.length * p)
-  return sorted[Math.min(idx, sorted.length - 1)] ?? 0
-}
-/**
- * Compute latency statistics from array of durations.
- *
- * @param durations - Array of durations in milliseconds
- * @returns Latency statistics
- */
-const computeLatencyStats = (durations: number[]): LatencyStats => {
-  if (durations.length === 0) {
-    return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
-  }
-  const sorted = [...durations].sort((a, b) => a - b)
-  const sum = sorted.reduce((a, b) => a + b, 0)
-  return {
-    p50: percentile(sorted, 0.5),
-    p90: percentile(sorted, 0.9),
-    p99: percentile(sorted, 0.99),
-    mean: sum / sorted.length,
-    min: sorted[0] ?? 0,
-    max: sorted[sorted.length - 1] ?? 0,
-  }
-}
-/**
- * Compute score distribution histogram.
- *
- * @param scores - Array of scores (0-1)
- * @returns Score distribution histogram
- */
-const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
-  const dist: ScoreDistribution = {
-    '0.0-0.2': 0,
-    '0.2-0.4': 0,
-    '0.4-0.6': 0,
-    '0.6-0.8': 0,
-    '0.8-1.0': 0,
-  }
-  for (const score of scores) {
-    if (score < 0.2) dist['0.0-0.2']++
-    else if (score < 0.4) dist['0.2-0.4']++
-    else if (score < 0.6) dist['0.4-0.6']++
-    else if (score < 0.8) dist['0.6-0.8']++
-    else dist['0.8-1.0']++
-  }
-  return dist
-}
 /**
  * Detect trajectory richness from capture results.
  *
@@ -429,6 +365,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
     const fails = results.length - passes
     quality[label] = {
+      type: 'run',
       avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
       passRate: results.length > 0 ? passes / results.length : 0,
       passCount: passes,

package/src/pipeline/tests/compare-statistical.spec.ts CHANGED Viewed

@@ -105,6 +105,10 @@ describe('runCompare statistical strategy', () => {
     // Verify reliability metrics include type discriminator
     expect(report.reliability.high?.type).toBe('run')
     expect(report.reliability.low?.type).toBe('run')
+    // Verify quality metrics include type discriminator
+    expect(report.quality.high?.type).toBe('run')
+    expect(report.quality.low?.type).toBe('run')
   })
   test('computes confidence intervals for performance metrics', async () => {

package/src/pipeline/tests/compare-trials.spec.ts CHANGED Viewed

@@ -14,20 +14,23 @@ import { buildTrialsIndex, runTrialsCompare } from '../compare-trials.ts'
 // Test Fixtures
 // ============================================================================
-const createTrialResult = (id: string, passAtK: number, passExpK: number, k: number = 3) => ({
+const createTrialResult = (
+  id: string,
+  passAtK: number,
+  passExpK: number,
+  k: number = 3,
+  includeScores: boolean = true,
+) => ({
   id,
   input: `Prompt for ${id}`,
   k,
-  passRate: passAtK,
-  passAtK,
-  passExpK,
+  ...(includeScores && { passRate: passAtK, passAtK, passExpK }),
   trials: Array.from({ length: k }, (_, i) => ({
     trialNum: i + 1,
     output: `Output ${i + 1}`,
     trajectory: [],
     duration: 100 + i * 10,
-    pass: Math.random() < passAtK,
-    score: passAtK,
+    ...(includeScores && { pass: Math.random() < passAtK, score: passAtK }),
   })),
 })
@@ -417,4 +420,173 @@ describe('runTrialsCompare', () => {
     const topFlakyIds = flak?.topFlakyPrompts.map((p) => p.id) ?? []
     expect(topFlakyIds).toContain('flaky')
   })
+  test('includes performance metrics with latency stats', async () => {
+    const run1Path = `${tempDir}/perf-run1.jsonl`
+    const run2Path = `${tempDir}/perf-run2.jsonl`
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.8, 0.6)
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'run1', path: run1Path },
+        { label: 'run2', path: run2Path },
+      ],
+      progress: false,
+    })
+    // Performance should always be present
+    expect(report.performance).toBeDefined()
+    expect(report.performance.run1).toBeDefined()
+    expect(report.performance.run2).toBeDefined()
+    const perf = report.performance.run1
+    expect(perf?.latency).toBeDefined()
+    expect(perf?.latency.p50).toBeGreaterThan(0)
+    expect(perf?.latency.mean).toBeGreaterThan(0)
+    expect(perf?.latency.min).toBeGreaterThan(0)
+    expect(perf?.latency.max).toBeGreaterThan(0)
+    expect(perf?.totalDuration).toBeGreaterThan(0)
+  })
+  test('includes quality metrics when scores are present', async () => {
+    const run1Path = `${tempDir}/qual-run1.jsonl`
+    const run2Path = `${tempDir}/qual-run2.jsonl`
+    // createTrialResult always includes score fields
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.8, 0.6)
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'run1', path: run1Path },
+        { label: 'run2', path: run2Path },
+      ],
+      progress: false,
+    })
+    // Quality should be present since trials have scores
+    expect(report.quality).toBeDefined()
+    expect(report.quality?.run1).toBeDefined()
+    const qual = report.quality?.run1
+    expect(qual?.type).toBe('trial')
+    expect(qual?.avgScore).toBeGreaterThan(0)
+    expect(qual?.medianScore).toBeGreaterThan(0)
+    expect(qual?.p25Score).toBeDefined()
+    expect(qual?.p75Score).toBeDefined()
+  })
+  test('omits quality metrics when scores are absent', async () => {
+    const run1Path = `${tempDir}/noqual-run1.jsonl`
+    const run2Path = `${tempDir}/noqual-run2.jsonl`
+    // Create trials without scores (includeScores=false)
+    const trial1 = createTrialResult('test-001', 0, 0, 3, false)
+    const trial2 = createTrialResult('test-001', 0, 0, 3, false)
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'run1', path: run1Path },
+        { label: 'run2', path: run2Path },
+      ],
+      progress: false,
+    })
+    // Quality should NOT be present since no trials have scores
+    expect(report.quality).toBeUndefined()
+    // Performance should still be present
+    expect(report.performance).toBeDefined()
+    expect(report.performance.run1?.latency.mean).toBeGreaterThan(0)
+  })
+  test('statistical strategy computes CIs for quality and performance', async () => {
+    const run1Path = `${tempDir}/ci-qp-run1.jsonl`
+    const run2Path = `${tempDir}/ci-qp-run2.jsonl`
+    const trials1 = [
+      createTrialResult('p1', 0.9, 0.8),
+      createTrialResult('p2', 0.85, 0.7),
+      createTrialResult('p3', 0.95, 0.9),
+    ]
+    const trials2 = [
+      createTrialResult('p1', 0.6, 0.4),
+      createTrialResult('p2', 0.5, 0.3),
+      createTrialResult('p3', 0.7, 0.5),
+    ]
+    await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
+    await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
+    const report = await runTrialsCompare({
+      runs: [
+        { label: 'high', path: run1Path },
+        { label: 'low', path: run2Path },
+      ],
+      strategy: 'statistical',
+      progress: false,
+    })
+    // Quality CIs
+    const highQual = report.quality?.high
+    expect(highQual).toBeDefined()
+    expect(highQual?.confidenceIntervals).toBeDefined()
+    expect(highQual?.confidenceIntervals?.avgScore).toBeDefined()
+    const qualCI = highQual?.confidenceIntervals?.avgScore
+    expect(qualCI).toHaveLength(2)
+    expect(qualCI?.[0]).toBeLessThanOrEqual(qualCI?.[1] ?? 0)
+    // Performance CIs
+    const highPerf = report.performance.high
+    expect(highPerf).toBeDefined()
+    expect(highPerf?.confidenceIntervals).toBeDefined()
+    expect(highPerf?.confidenceIntervals?.latencyMean).toBeDefined()
+    const perfCI = highPerf?.confidenceIntervals?.latencyMean
+    expect(perfCI).toHaveLength(2)
+    expect(perfCI?.[0]).toBeLessThanOrEqual(perfCI?.[1] ?? 0)
+  })
+  test('markdown output includes quality and performance tables', async () => {
+    const run1Path = `${tempDir}/md-qp-run1.jsonl`
+    const run2Path = `${tempDir}/md-qp-run2.jsonl`
+    const outputPath = `${tempDir}/qp-report.md`
+    const trial1 = createTrialResult('test-001', 0.9, 0.7)
+    const trial2 = createTrialResult('test-001', 0.8, 0.6)
+    await Bun.write(run1Path, JSON.stringify(trial1))
+    await Bun.write(run2Path, JSON.stringify(trial2))
+    await runTrialsCompare({
+      runs: [
+        { label: 'agent1', path: run1Path },
+        { label: 'agent2', path: run2Path },
+      ],
+      outputPath,
+      format: 'markdown',
+      progress: false,
+    })
+    const content = await Bun.file(outputPath).text()
+    // Should contain quality and performance sections
+    expect(content).toContain('## Quality (Scores)')
+    expect(content).toContain('## Performance (Latency)')
+    expect(content).toContain('Avg Score')
+    expect(content).toContain('P50 (ms)')
+    expect(content).toContain('Mean (ms)')
+  })
 })

package/src/pipeline/tests/compare-utils.spec.ts ADDED Viewed

@@ -0,0 +1,128 @@
+/**
+ * Unit tests for compare-utils shared utilities.
+ *
+ * @remarks
+ * Tests for percentile, computeLatencyStats, and computeScoreDistribution.
+ *
+ * @packageDocumentation
+ */
+import { describe, expect, test } from 'bun:test'
+import { computeLatencyStats, computeScoreDistribution, percentile } from '../compare-utils.ts'
+// ============================================================================
+// percentile Tests
+// ============================================================================
+describe('percentile', () => {
+  test('computes correct percentile values', () => {
+    const sorted = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
+    expect(percentile(sorted, 0.5)).toBe(60) // floor(10 * 0.5) = index 5
+    expect(percentile(sorted, 0.25)).toBe(30) // floor(10 * 0.25) = index 2
+    expect(percentile(sorted, 0.75)).toBe(80) // floor(10 * 0.75) = index 7
+    expect(percentile(sorted, 0.9)).toBe(100) // floor(10 * 0.9) = index 9 (last)
+  })
+  test('returns 0 for empty array', () => {
+    expect(percentile([], 0.5)).toBe(0)
+  })
+  test('handles single-element array', () => {
+    expect(percentile([42], 0.5)).toBe(42)
+    expect(percentile([42], 0.0)).toBe(42)
+    expect(percentile([42], 1.0)).toBe(42) // index 1 is clamped back to 0
+  })
+  test('handles p=0 and p=1 boundary values', () => {
+    const sorted = [10, 20, 30]
+    expect(percentile(sorted, 0)).toBe(10)
+    expect(percentile(sorted, 1)).toBe(30) // index 3 is clamped to the last element
+  })
+})
+// ============================================================================
+// computeLatencyStats Tests
+// ============================================================================
+describe('computeLatencyStats', () => {
+  test('returns correct stats for typical durations', () => {
+    const durations = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
+    const stats = computeLatencyStats(durations)
+    expect(stats.min).toBe(100)
+    expect(stats.max).toBe(1000)
+    expect(stats.mean).toBe(550) // 5500 / 10
+    expect(stats.p50).toBe(600) // floor indexing picks index 5, the upper-middle value
+    expect(stats.p90).toBe(1000) // floor(10 * 0.9) = index 9, the last value
+  })
+  test('returns zeros for empty array', () => {
+    const stats = computeLatencyStats([])
+    expect(stats.p50).toBe(0)
+    expect(stats.p90).toBe(0)
+    expect(stats.p99).toBe(0)
+    expect(stats.mean).toBe(0)
+    expect(stats.min).toBe(0)
+    expect(stats.max).toBe(0)
+  })
+  test('handles single-element array', () => {
+    const stats = computeLatencyStats([42])
+    expect(stats.p50).toBe(42)
+    expect(stats.p90).toBe(42)
+    expect(stats.mean).toBe(42)
+    expect(stats.min).toBe(42)
+    expect(stats.max).toBe(42)
+  })
+  test('sorts unsorted input', () => {
+    const stats = computeLatencyStats([500, 100, 300, 200, 400])
+    expect(stats.min).toBe(100) // min/max are read from the sorted copy's ends
+    expect(stats.max).toBe(500)
+    expect(stats.mean).toBe(300) // 1500 / 5
+  })
+})
+// ============================================================================
+// computeScoreDistribution Tests
+// ============================================================================
+describe('computeScoreDistribution', () => {
+  test('distributes scores into correct buckets', () => {
+    const scores = [0.1, 0.3, 0.5, 0.7, 0.9] // one value strictly inside each bucket
+    const dist = computeScoreDistribution(scores)
+    expect(dist['0.0-0.2']).toBe(1)
+    expect(dist['0.2-0.4']).toBe(1)
+    expect(dist['0.4-0.6']).toBe(1)
+    expect(dist['0.6-0.8']).toBe(1)
+    expect(dist['0.8-1.0']).toBe(1)
+  })
+  test('handles empty scores array', () => {
+    const dist = computeScoreDistribution([])
+    expect(dist['0.0-0.2']).toBe(0)
+    expect(dist['0.2-0.4']).toBe(0)
+    expect(dist['0.4-0.6']).toBe(0)
+    expect(dist['0.6-0.8']).toBe(0)
+    expect(dist['0.8-1.0']).toBe(0)
+  })
+  test('handles boundary values correctly', () => {
+    // Each bucket includes its lower bound: 0.0 → first bucket, 0.2 → second bucket (not first), 1.0 → last bucket
+    const scores = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
+    const dist = computeScoreDistribution(scores)
+    expect(dist['0.0-0.2']).toBe(1) // 0.0
+    expect(dist['0.2-0.4']).toBe(1) // 0.2
+    expect(dist['0.4-0.6']).toBe(1) // 0.4
+    expect(dist['0.6-0.8']).toBe(1) // 0.6
+    expect(dist['0.8-1.0']).toBe(2) // 0.8, 1.0
+  })
+})

package/src/schemas/schemas.ts CHANGED Viewed

@@ -620,6 +620,8 @@ export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceInterva
  * Quality metrics for a single run in comparison.
  */
 export const QualityMetricsSchema = z.object({
+  /** Discriminator for run-level quality metrics */
+  type: z.literal('run'),
   /** Mean grader score (0-1) */
   avgScore: z.number(),
   /** Percentage of pass=true results */
@@ -923,6 +925,72 @@ export const TrialsFlakinessMetricsSchema = z.object({
 /** Trials flakiness metrics type */
 export type TrialsFlakinessMetrics = z.infer<typeof TrialsFlakinessMetricsSchema>
+/**
+ * Confidence intervals for trials quality metrics.
+ */
+export const TrialsQualityConfidenceIntervalsSchema = z.object({
+  /** CI for avgScore */
+  avgScore: ConfidenceIntervalSchema.optional(),
+})
+/** Trials quality confidence intervals type */
+export type TrialsQualityConfidenceIntervals = z.infer<typeof TrialsQualityConfidenceIntervalsSchema>
+/**
+ * Quality metrics for trials comparison (score-based).
+ *
+ * @remarks
+ * Aggregates grader scores across all trials for each prompt.
+ * Only present when a grader was used during trials capture.
+ */
+export const TrialsQualityMetricsSchema = z.object({
+  /** Discriminator for trial-level quality metrics */
+  type: z.literal('trial'),
+  /** Average score across all trials */
+  avgScore: z.number(),
+  /** Median score */
+  medianScore: z.number(),
+  /** 25th percentile score */
+  p25Score: z.number(),
+  /** 75th percentile score */
+  p75Score: z.number(),
+  /** Confidence intervals (only with strategy=statistical) */
+  confidenceIntervals: TrialsQualityConfidenceIntervalsSchema.optional(),
+})
+/** Trials quality metrics type */
+export type TrialsQualityMetrics = z.infer<typeof TrialsQualityMetricsSchema>
+/**
+ * Confidence intervals for trials performance metrics.
+ */
+export const TrialsPerformanceConfidenceIntervalsSchema = z.object({
+  /** CI for latency mean */
+  latencyMean: ConfidenceIntervalSchema.optional(),
+})
+/** Trials performance confidence intervals type */
+export type TrialsPerformanceConfidenceIntervals = z.infer<typeof TrialsPerformanceConfidenceIntervalsSchema>
+/**
+ * Performance metrics for trials comparison (latency-based).
+ *
+ * @remarks
+ * Aggregates trial durations across all prompts.
+ * Always present since TrialEntry.duration is required.
+ */
+export const TrialsPerformanceMetricsSchema = z.object({
+  /** End-to-end latency statistics across all trials */
+  latency: LatencyStatsSchema,
+  /** Sum of all trial durations in milliseconds */
+  totalDuration: z.number(),
+  /** Confidence intervals (only with strategy=statistical) */
+  confidenceIntervals: TrialsPerformanceConfidenceIntervalsSchema.optional(),
+})
+/** Trials performance metrics type */
+export type TrialsPerformanceMetrics = z.infer<typeof TrialsPerformanceMetricsSchema>
 /**
  * Per-prompt metrics for trials comparison drill-down.
  */
@@ -984,6 +1052,10 @@ export const TrialsComparisonReportSchema = z.object({
   reliability: z.record(z.string(), TrialsReliabilityMetricsSchema),
   /** Flakiness metrics by run label */
   flakiness: z.record(z.string(), TrialsFlakinessMetricsSchema),
+  /** Quality metrics by run label (only when grader scores are present) */
+  quality: z.record(z.string(), TrialsQualityMetricsSchema).optional(),
+  /** Performance metrics by run label (always present, uses trial.duration) */
+  performance: z.record(z.string(), TrialsPerformanceMetricsSchema),
   /** Head-to-head comparison details */
   headToHead: z.object({
     /** Pairwise wins by capability */

package/src/schemas.ts CHANGED Viewed

@@ -113,8 +113,16 @@ export {
   TrialsComparisonReportSchema,
   type TrialsFlakinessMetrics,
   TrialsFlakinessMetricsSchema,
+  type TrialsPerformanceConfidenceIntervals,
+  TrialsPerformanceConfidenceIntervalsSchema,
+  type TrialsPerformanceMetrics,
+  TrialsPerformanceMetricsSchema,
   type TrialsPromptComparison,
   TrialsPromptComparisonSchema,
+  type TrialsQualityConfidenceIntervals,
+  TrialsQualityConfidenceIntervalsSchema,
+  type TrialsQualityMetrics,
+  TrialsQualityMetricsSchema,
   type TrialsReliabilityMetrics,
   TrialsReliabilityMetricsSchema,
   type ValidationResult,