@plaited/agent-eval-harness 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -4
- package/src/pipeline/compare-trials.ts +166 -17
- package/src/pipeline/compare-utils.ts +85 -0
- package/src/pipeline/compare.ts +2 -65
- package/src/pipeline/tests/compare-statistical.spec.ts +4 -0
- package/src/pipeline/tests/compare-trials.spec.ts +178 -6
- package/src/pipeline/tests/compare-utils.spec.ts +128 -0
- package/src/schemas/schemas.ts +72 -0
- package/src/schemas.ts +8 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@plaited/agent-eval-harness",
|
|
3
|
-
"version": "0.12.
|
|
3
|
+
"version": "0.12.2",
|
|
4
4
|
"description": "CLI tool for capturing agent trajectories from headless CLI agents",
|
|
5
5
|
"license": "ISC",
|
|
6
6
|
"engines": {
|
|
@@ -56,12 +56,12 @@
|
|
|
56
56
|
]
|
|
57
57
|
},
|
|
58
58
|
"dependencies": {
|
|
59
|
-
"@plaited/development-skills": "0.
|
|
59
|
+
"@plaited/development-skills": "0.8.0",
|
|
60
60
|
"zod": "^4.3.6"
|
|
61
61
|
},
|
|
62
62
|
"devDependencies": {
|
|
63
|
-
"@biomejs/biome": "2.3.
|
|
64
|
-
"@types/bun": "1.3.
|
|
63
|
+
"@biomejs/biome": "2.3.14",
|
|
64
|
+
"@types/bun": "1.3.9",
|
|
65
65
|
"format-package": "7.0.0",
|
|
66
66
|
"lint-staged": "16.2.7",
|
|
67
67
|
"typescript": "5.9.3"
|
|
@@ -26,10 +26,13 @@ import type {
|
|
|
26
26
|
TrialsComparisonMeta,
|
|
27
27
|
TrialsComparisonReport,
|
|
28
28
|
TrialsFlakinessMetrics,
|
|
29
|
+
TrialsPerformanceMetrics,
|
|
29
30
|
TrialsPromptComparison,
|
|
31
|
+
TrialsQualityMetrics,
|
|
30
32
|
TrialsReliabilityMetrics,
|
|
31
33
|
} from '../schemas.ts'
|
|
32
34
|
import { TrialResultSchema } from '../schemas.ts'
|
|
35
|
+
import { computeLatencyStats, percentile } from './compare-utils.ts'
|
|
33
36
|
import type {
|
|
34
37
|
ComparisonGraderResult,
|
|
35
38
|
LabeledRun,
|
|
@@ -148,19 +151,6 @@ const getTrialsGrader = async (
|
|
|
148
151
|
}
|
|
149
152
|
}
|
|
150
153
|
|
|
151
|
-
/**
|
|
152
|
-
* Compute percentile from sorted array.
|
|
153
|
-
*
|
|
154
|
-
* @param sorted - Sorted array of numbers
|
|
155
|
-
* @param p - Percentile (0-1)
|
|
156
|
-
* @returns Value at percentile
|
|
157
|
-
*/
|
|
158
|
-
const percentile = (sorted: number[], p: number): number => {
|
|
159
|
-
if (sorted.length === 0) return 0
|
|
160
|
-
const idx = Math.floor(sorted.length * p)
|
|
161
|
-
return sorted[Math.min(idx, sorted.length - 1)] ?? 0
|
|
162
|
-
}
|
|
163
|
-
|
|
164
154
|
/**
|
|
165
155
|
* Compute capability metrics from trial results.
|
|
166
156
|
*
|
|
@@ -245,6 +235,72 @@ const computeFlakinessMetrics = (results: TrialResult[], maxTopFlaky: number = 1
|
|
|
245
235
|
}
|
|
246
236
|
}
|
|
247
237
|
|
|
238
|
+
/** Result from quality metrics computation, including raw scores for CI reuse */
|
|
239
|
+
type QualityComputeResult = {
|
|
240
|
+
metrics: TrialsQualityMetrics
|
|
241
|
+
rawScores: number[]
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
/**
|
|
245
|
+
* Compute quality metrics from trial results.
|
|
246
|
+
*
|
|
247
|
+
* @remarks
|
|
248
|
+
* Flattens all trial scores across all prompts into a single distribution.
|
|
249
|
+
* Returns undefined if no scores are present (no grader was used).
|
|
250
|
+
* Returns raw scores alongside metrics to avoid re-traversal for CI computation.
|
|
251
|
+
*
|
|
252
|
+
* @param results - Array of trial results
|
|
253
|
+
* @returns Quality metrics with raw scores, or undefined if no scores
|
|
254
|
+
*/
|
|
255
|
+
const computeTrialsQualityMetrics = (results: TrialResult[]): QualityComputeResult | undefined => {
|
|
256
|
+
const rawScores = results.flatMap((r) => r.trials.filter((t) => t.score !== undefined).map((t) => t.score as number))
|
|
257
|
+
|
|
258
|
+
if (rawScores.length === 0) return undefined
|
|
259
|
+
|
|
260
|
+
const sorted = [...rawScores].sort((a, b) => a - b)
|
|
261
|
+
const sum = rawScores.reduce((a, b) => a + b, 0)
|
|
262
|
+
|
|
263
|
+
return {
|
|
264
|
+
metrics: {
|
|
265
|
+
type: 'trial',
|
|
266
|
+
avgScore: sum / rawScores.length,
|
|
267
|
+
medianScore: percentile(sorted, 0.5),
|
|
268
|
+
p25Score: percentile(sorted, 0.25),
|
|
269
|
+
p75Score: percentile(sorted, 0.75),
|
|
270
|
+
},
|
|
271
|
+
rawScores,
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
/** Result from performance metrics computation, including raw durations for CI reuse */
|
|
276
|
+
type PerformanceComputeResult = {
|
|
277
|
+
metrics: TrialsPerformanceMetrics
|
|
278
|
+
rawDurations: number[]
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
/**
|
|
282
|
+
* Compute performance metrics from trial results.
|
|
283
|
+
*
|
|
284
|
+
* @remarks
|
|
285
|
+
* Flattens all trial durations across all prompts into latency statistics.
|
|
286
|
+
* Always returns a value since TrialEntry.duration is required.
|
|
287
|
+
* Returns raw durations alongside metrics to avoid re-traversal for CI computation.
|
|
288
|
+
*
|
|
289
|
+
* @param results - Array of trial results
|
|
290
|
+
* @returns Performance metrics with raw durations
|
|
291
|
+
*/
|
|
292
|
+
const computeTrialsPerformanceMetrics = (results: TrialResult[]): PerformanceComputeResult => {
|
|
293
|
+
const rawDurations = results.flatMap((r) => r.trials.map((t) => t.duration))
|
|
294
|
+
|
|
295
|
+
return {
|
|
296
|
+
metrics: {
|
|
297
|
+
latency: computeLatencyStats(rawDurations),
|
|
298
|
+
totalDuration: rawDurations.reduce((a, b) => a + b, 0),
|
|
299
|
+
},
|
|
300
|
+
rawDurations,
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
|
|
248
304
|
/**
|
|
249
305
|
* Execute trials comparison and generate aggregate report.
|
|
250
306
|
*
|
|
@@ -399,6 +455,12 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
|
|
|
399
455
|
const capability: Record<string, TrialsCapabilityMetrics> = {}
|
|
400
456
|
const reliability: Record<string, TrialsReliabilityMetrics> = {}
|
|
401
457
|
const flakiness: Record<string, TrialsFlakinessMetrics> = {}
|
|
458
|
+
const quality: Record<string, TrialsQualityMetrics> = {}
|
|
459
|
+
const performance: Record<string, TrialsPerformanceMetrics> = {}
|
|
460
|
+
const rawScoresByRun: Record<string, number[]> = {}
|
|
461
|
+
const rawDurationsByRun: Record<string, number[]> = {}
|
|
462
|
+
|
|
463
|
+
let hasQuality = false
|
|
402
464
|
|
|
403
465
|
for (const label of runLabels) {
|
|
404
466
|
const resultsMap = runResults[label] ?? new Map()
|
|
@@ -407,6 +469,17 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
|
|
|
407
469
|
capability[label] = computeCapabilityMetrics(results)
|
|
408
470
|
reliability[label] = computeReliabilityMetrics(results)
|
|
409
471
|
flakiness[label] = computeFlakinessMetrics(results)
|
|
472
|
+
|
|
473
|
+
const perfResult = computeTrialsPerformanceMetrics(results)
|
|
474
|
+
performance[label] = perfResult.metrics
|
|
475
|
+
rawDurationsByRun[label] = perfResult.rawDurations
|
|
476
|
+
|
|
477
|
+
const qualityResult = computeTrialsQualityMetrics(results)
|
|
478
|
+
if (qualityResult) {
|
|
479
|
+
quality[label] = qualityResult.metrics
|
|
480
|
+
rawScoresByRun[label] = qualityResult.rawScores
|
|
481
|
+
hasQuality = true
|
|
482
|
+
}
|
|
410
483
|
}
|
|
411
484
|
|
|
412
485
|
// Compute confidence intervals when using statistical strategy
|
|
@@ -415,9 +488,9 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
|
|
|
415
488
|
|
|
416
489
|
for (const label of runLabels) {
|
|
417
490
|
const resultsMap = runResults[label] ?? new Map()
|
|
418
|
-
const
|
|
419
|
-
const passAtKValues =
|
|
420
|
-
const passExpKValues =
|
|
491
|
+
const resultsArr = [...resultsMap.values()]
|
|
492
|
+
const passAtKValues = resultsArr.map((r) => r.passAtK ?? 0)
|
|
493
|
+
const passExpKValues = resultsArr.map((r) => r.passExpK ?? 0)
|
|
421
494
|
|
|
422
495
|
// Capability CIs
|
|
423
496
|
const capabilityMetrics = capability[label]
|
|
@@ -434,6 +507,24 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
|
|
|
434
507
|
avgPassExpK: bootstrap(passExpKValues, bootstrapConfig).ci,
|
|
435
508
|
}
|
|
436
509
|
}
|
|
510
|
+
|
|
511
|
+
// Quality CIs (only when scores present)
|
|
512
|
+
const qualityMetrics = quality[label]
|
|
513
|
+
const scores = rawScoresByRun[label]
|
|
514
|
+
if (qualityMetrics && scores && scores.length > 0) {
|
|
515
|
+
qualityMetrics.confidenceIntervals = {
|
|
516
|
+
avgScore: bootstrap(scores, bootstrapConfig).ci,
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
// Performance CIs
|
|
521
|
+
const performanceMetrics = performance[label]
|
|
522
|
+
const durations = rawDurationsByRun[label]
|
|
523
|
+
if (performanceMetrics && durations && durations.length > 0) {
|
|
524
|
+
performanceMetrics.confidenceIntervals = {
|
|
525
|
+
latencyMean: bootstrap(durations, bootstrapConfig).ci,
|
|
526
|
+
}
|
|
527
|
+
}
|
|
437
528
|
}
|
|
438
529
|
}
|
|
439
530
|
|
|
@@ -505,6 +596,8 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
|
|
|
505
596
|
capability,
|
|
506
597
|
reliability,
|
|
507
598
|
flakiness,
|
|
599
|
+
quality: hasQuality ? quality : undefined,
|
|
600
|
+
performance,
|
|
508
601
|
headToHead: {
|
|
509
602
|
capability: capabilityPairwise,
|
|
510
603
|
reliability: reliabilityPairwise,
|
|
@@ -528,8 +621,12 @@ export const runTrialsCompare = async (config: TrialsCompareConfig): Promise<Tri
|
|
|
528
621
|
for (const [label, cap] of Object.entries(capability)) {
|
|
529
622
|
const rel = reliability[label]
|
|
530
623
|
const flak = flakiness[label]
|
|
624
|
+
const perf = performance[label]
|
|
625
|
+
const qual = quality[label]
|
|
626
|
+
const qualStr = qual ? ` avgScore=${qual.avgScore.toFixed(3)}` : ''
|
|
627
|
+
const perfStr = perf ? ` latencyP50=${perf.latency.p50.toFixed(0)}ms` : ''
|
|
531
628
|
logProgress(
|
|
532
|
-
` ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}`,
|
|
629
|
+
` ${label}: passAtK=${cap?.avgPassAtK.toFixed(3)} passExpK=${rel?.avgPassExpK.toFixed(3)} flakiness=${flak?.avgFlakiness.toFixed(3)}${qualStr}${perfStr}`,
|
|
533
630
|
progress,
|
|
534
631
|
)
|
|
535
632
|
}
|
|
@@ -620,6 +717,58 @@ const formatTrialsReportAsMarkdown = (report: TrialsComparisonReport): string =>
|
|
|
620
717
|
}
|
|
621
718
|
lines.push('')
|
|
622
719
|
|
|
720
|
+
// Quality table (only when scores present)
|
|
721
|
+
if (report.quality && Object.keys(report.quality).length > 0) {
|
|
722
|
+
const hasQualityCIs = Object.values(report.quality).some((q) => q.confidenceIntervals)
|
|
723
|
+
|
|
724
|
+
lines.push('## Quality (Scores)')
|
|
725
|
+
lines.push('')
|
|
726
|
+
if (hasQualityCIs) {
|
|
727
|
+
lines.push('| Run | Avg Score | 95% CI | Median | P25 | P75 |')
|
|
728
|
+
lines.push('|-----|-----------|--------|--------|-----|-----|')
|
|
729
|
+
for (const [label, q] of Object.entries(report.quality)) {
|
|
730
|
+
const avgCI = formatCI(q.confidenceIntervals?.avgScore)
|
|
731
|
+
lines.push(
|
|
732
|
+
`| ${label} | ${q.avgScore.toFixed(3)} | ${avgCI} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`,
|
|
733
|
+
)
|
|
734
|
+
}
|
|
735
|
+
} else {
|
|
736
|
+
lines.push('| Run | Avg Score | Median | P25 | P75 |')
|
|
737
|
+
lines.push('|-----|-----------|--------|-----|-----|')
|
|
738
|
+
for (const [label, q] of Object.entries(report.quality)) {
|
|
739
|
+
lines.push(
|
|
740
|
+
`| ${label} | ${q.avgScore.toFixed(3)} | ${q.medianScore.toFixed(3)} | ${q.p25Score.toFixed(3)} | ${q.p75Score.toFixed(3)} |`,
|
|
741
|
+
)
|
|
742
|
+
}
|
|
743
|
+
}
|
|
744
|
+
lines.push('')
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
// Performance table (always present)
|
|
748
|
+
const hasPerfCIs = Object.values(report.performance).some((p) => p.confidenceIntervals)
|
|
749
|
+
|
|
750
|
+
lines.push('## Performance (Latency)')
|
|
751
|
+
lines.push('')
|
|
752
|
+
if (hasPerfCIs) {
|
|
753
|
+
lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | 95% CI | Total (ms) |')
|
|
754
|
+
lines.push('|-----|----------|----------|----------|-----------|--------|------------|')
|
|
755
|
+
for (const [label, p] of Object.entries(report.performance)) {
|
|
756
|
+
const latencyCI = formatCI(p.confidenceIntervals?.latencyMean, 0)
|
|
757
|
+
lines.push(
|
|
758
|
+
`| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${latencyCI} | ${p.totalDuration.toFixed(0)} |`,
|
|
759
|
+
)
|
|
760
|
+
}
|
|
761
|
+
} else {
|
|
762
|
+
lines.push('| Run | P50 (ms) | P90 (ms) | P99 (ms) | Mean (ms) | Total (ms) |')
|
|
763
|
+
lines.push('|-----|----------|----------|----------|-----------|------------|')
|
|
764
|
+
for (const [label, p] of Object.entries(report.performance)) {
|
|
765
|
+
lines.push(
|
|
766
|
+
`| ${label} | ${p.latency.p50.toFixed(0)} | ${p.latency.p90.toFixed(0)} | ${p.latency.p99.toFixed(0)} | ${p.latency.mean.toFixed(0)} | ${p.totalDuration.toFixed(0)} |`,
|
|
767
|
+
)
|
|
768
|
+
}
|
|
769
|
+
}
|
|
770
|
+
lines.push('')
|
|
771
|
+
|
|
623
772
|
// Head-to-head
|
|
624
773
|
lines.push('## Head-to-Head')
|
|
625
774
|
lines.push('')
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared utility functions for comparison modules.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Extracted from compare.ts and compare-trials.ts to avoid duplication.
|
|
6
|
+
* Contains statistical helpers used by both CaptureResult and TrialResult comparisons.
|
|
7
|
+
*
|
|
8
|
+
* @packageDocumentation
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { LatencyStats, ScoreDistribution } from '../schemas.ts'
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Compute percentile from sorted array using nearest rank method.
|
|
15
|
+
*
|
|
16
|
+
* @remarks
|
|
17
|
+
* Uses floor indexing (nearest rank method). For an array of length N,
|
|
18
|
+
* returns the element at index `floor(N * p)`, clamped to the last element.
|
|
19
|
+
* This does not interpolate between ranks.
|
|
20
|
+
*
|
|
21
|
+
* @param sorted - Sorted array of numbers
|
|
22
|
+
* @param p - Percentile (0-1)
|
|
23
|
+
* @returns Value at percentile
|
|
24
|
+
*
|
|
25
|
+
* @public
|
|
26
|
+
*/
|
|
27
|
+
export const percentile = (sorted: number[], p: number): number => {
|
|
28
|
+
if (sorted.length === 0) return 0
|
|
29
|
+
const idx = Math.floor(sorted.length * p)
|
|
30
|
+
return sorted[Math.min(idx, sorted.length - 1)] ?? 0
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Compute latency statistics from array of durations.
|
|
35
|
+
*
|
|
36
|
+
* @param durations - Array of durations in milliseconds
|
|
37
|
+
* @returns Latency statistics
|
|
38
|
+
*
|
|
39
|
+
* @public
|
|
40
|
+
*/
|
|
41
|
+
export const computeLatencyStats = (durations: number[]): LatencyStats => {
|
|
42
|
+
if (durations.length === 0) {
|
|
43
|
+
return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
const sorted = [...durations].sort((a, b) => a - b)
|
|
47
|
+
const sum = sorted.reduce((a, b) => a + b, 0)
|
|
48
|
+
|
|
49
|
+
return {
|
|
50
|
+
p50: percentile(sorted, 0.5),
|
|
51
|
+
p90: percentile(sorted, 0.9),
|
|
52
|
+
p99: percentile(sorted, 0.99),
|
|
53
|
+
mean: sum / sorted.length,
|
|
54
|
+
min: sorted[0] ?? 0,
|
|
55
|
+
max: sorted[sorted.length - 1] ?? 0,
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Compute score distribution histogram.
|
|
61
|
+
*
|
|
62
|
+
* @param scores - Array of scores (0-1)
|
|
63
|
+
* @returns Score distribution histogram
|
|
64
|
+
*
|
|
65
|
+
* @public
|
|
66
|
+
*/
|
|
67
|
+
export const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
|
|
68
|
+
const dist: ScoreDistribution = {
|
|
69
|
+
'0.0-0.2': 0,
|
|
70
|
+
'0.2-0.4': 0,
|
|
71
|
+
'0.4-0.6': 0,
|
|
72
|
+
'0.6-0.8': 0,
|
|
73
|
+
'0.8-1.0': 0,
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
for (const score of scores) {
|
|
77
|
+
if (score < 0.2) dist['0.0-0.2']++
|
|
78
|
+
else if (score < 0.4) dist['0.2-0.4']++
|
|
79
|
+
else if (score < 0.6) dist['0.4-0.6']++
|
|
80
|
+
else if (score < 0.8) dist['0.6-0.8']++
|
|
81
|
+
else dist['0.8-1.0']++
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
return dist
|
|
85
|
+
}
|
package/src/pipeline/compare.ts
CHANGED
|
@@ -33,18 +33,17 @@ import type {
|
|
|
33
33
|
ComparisonMeta,
|
|
34
34
|
ComparisonReport,
|
|
35
35
|
HeadToHead,
|
|
36
|
-
LatencyStats,
|
|
37
36
|
PairwiseComparison,
|
|
38
37
|
PerformanceMetrics,
|
|
39
38
|
PromptComparison,
|
|
40
39
|
QualityMetrics,
|
|
41
40
|
ReliabilityMetrics,
|
|
42
|
-
ScoreDistribution,
|
|
43
41
|
TrajectoryInfo,
|
|
44
42
|
TrajectoryRichness,
|
|
45
43
|
} from '../schemas.ts'
|
|
46
44
|
import { type CompareInputFormat, detectAndValidateFormat } from './compare-format-detection.ts'
|
|
47
45
|
import { runTrialsCompare } from './compare-trials.ts'
|
|
46
|
+
import { computeLatencyStats, computeScoreDistribution } from './compare-utils.ts'
|
|
48
47
|
import type {
|
|
49
48
|
CompareConfig,
|
|
50
49
|
ComparisonGrader,
|
|
@@ -197,69 +196,6 @@ const getGrader = async (strategy: CompareStrategy, graderPath?: string): Promis
|
|
|
197
196
|
}
|
|
198
197
|
}
|
|
199
198
|
|
|
200
|
-
/**
|
|
201
|
-
* Compute percentile from sorted array.
|
|
202
|
-
*
|
|
203
|
-
* @param sorted - Sorted array of numbers
|
|
204
|
-
* @param p - Percentile (0-1)
|
|
205
|
-
* @returns Value at percentile
|
|
206
|
-
*/
|
|
207
|
-
const percentile = (sorted: number[], p: number): number => {
|
|
208
|
-
if (sorted.length === 0) return 0
|
|
209
|
-
const idx = Math.floor(sorted.length * p)
|
|
210
|
-
return sorted[Math.min(idx, sorted.length - 1)] ?? 0
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
/**
|
|
214
|
-
* Compute latency statistics from array of durations.
|
|
215
|
-
*
|
|
216
|
-
* @param durations - Array of durations in milliseconds
|
|
217
|
-
* @returns Latency statistics
|
|
218
|
-
*/
|
|
219
|
-
const computeLatencyStats = (durations: number[]): LatencyStats => {
|
|
220
|
-
if (durations.length === 0) {
|
|
221
|
-
return { p50: 0, p90: 0, p99: 0, mean: 0, min: 0, max: 0 }
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
const sorted = [...durations].sort((a, b) => a - b)
|
|
225
|
-
const sum = sorted.reduce((a, b) => a + b, 0)
|
|
226
|
-
|
|
227
|
-
return {
|
|
228
|
-
p50: percentile(sorted, 0.5),
|
|
229
|
-
p90: percentile(sorted, 0.9),
|
|
230
|
-
p99: percentile(sorted, 0.99),
|
|
231
|
-
mean: sum / sorted.length,
|
|
232
|
-
min: sorted[0] ?? 0,
|
|
233
|
-
max: sorted[sorted.length - 1] ?? 0,
|
|
234
|
-
}
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
/**
|
|
238
|
-
* Compute score distribution histogram.
|
|
239
|
-
*
|
|
240
|
-
* @param scores - Array of scores (0-1)
|
|
241
|
-
* @returns Score distribution histogram
|
|
242
|
-
*/
|
|
243
|
-
const computeScoreDistribution = (scores: number[]): ScoreDistribution => {
|
|
244
|
-
const dist: ScoreDistribution = {
|
|
245
|
-
'0.0-0.2': 0,
|
|
246
|
-
'0.2-0.4': 0,
|
|
247
|
-
'0.4-0.6': 0,
|
|
248
|
-
'0.6-0.8': 0,
|
|
249
|
-
'0.8-1.0': 0,
|
|
250
|
-
}
|
|
251
|
-
|
|
252
|
-
for (const score of scores) {
|
|
253
|
-
if (score < 0.2) dist['0.0-0.2']++
|
|
254
|
-
else if (score < 0.4) dist['0.2-0.4']++
|
|
255
|
-
else if (score < 0.6) dist['0.4-0.6']++
|
|
256
|
-
else if (score < 0.8) dist['0.6-0.8']++
|
|
257
|
-
else dist['0.8-1.0']++
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
return dist
|
|
261
|
-
}
|
|
262
|
-
|
|
263
199
|
/**
|
|
264
200
|
* Detect trajectory richness from capture results.
|
|
265
201
|
*
|
|
@@ -429,6 +365,7 @@ export const runCompare = async (config: ExtendedCompareConfig): Promise<Compari
|
|
|
429
365
|
const fails = results.length - passes
|
|
430
366
|
|
|
431
367
|
quality[label] = {
|
|
368
|
+
type: 'run',
|
|
432
369
|
avgScore: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
|
|
433
370
|
passRate: results.length > 0 ? passes / results.length : 0,
|
|
434
371
|
passCount: passes,
|
|
@@ -105,6 +105,10 @@ describe('runCompare statistical strategy', () => {
|
|
|
105
105
|
// Verify reliability metrics include type discriminator
|
|
106
106
|
expect(report.reliability.high?.type).toBe('run')
|
|
107
107
|
expect(report.reliability.low?.type).toBe('run')
|
|
108
|
+
|
|
109
|
+
// Verify quality metrics include type discriminator
|
|
110
|
+
expect(report.quality.high?.type).toBe('run')
|
|
111
|
+
expect(report.quality.low?.type).toBe('run')
|
|
108
112
|
})
|
|
109
113
|
|
|
110
114
|
test('computes confidence intervals for performance metrics', async () => {
|
|
@@ -14,20 +14,23 @@ import { buildTrialsIndex, runTrialsCompare } from '../compare-trials.ts'
|
|
|
14
14
|
// Test Fixtures
|
|
15
15
|
// ============================================================================
|
|
16
16
|
|
|
17
|
-
const createTrialResult = (
|
|
17
|
+
const createTrialResult = (
|
|
18
|
+
id: string,
|
|
19
|
+
passAtK: number,
|
|
20
|
+
passExpK: number,
|
|
21
|
+
k: number = 3,
|
|
22
|
+
includeScores: boolean = true,
|
|
23
|
+
) => ({
|
|
18
24
|
id,
|
|
19
25
|
input: `Prompt for ${id}`,
|
|
20
26
|
k,
|
|
21
|
-
passRate: passAtK,
|
|
22
|
-
passAtK,
|
|
23
|
-
passExpK,
|
|
27
|
+
...(includeScores && { passRate: passAtK, passAtK, passExpK }),
|
|
24
28
|
trials: Array.from({ length: k }, (_, i) => ({
|
|
25
29
|
trialNum: i + 1,
|
|
26
30
|
output: `Output ${i + 1}`,
|
|
27
31
|
trajectory: [],
|
|
28
32
|
duration: 100 + i * 10,
|
|
29
|
-
pass: Math.random() < passAtK,
|
|
30
|
-
score: passAtK,
|
|
33
|
+
...(includeScores && { pass: Math.random() < passAtK, score: passAtK }),
|
|
31
34
|
})),
|
|
32
35
|
})
|
|
33
36
|
|
|
@@ -417,4 +420,173 @@ describe('runTrialsCompare', () => {
|
|
|
417
420
|
const topFlakyIds = flak?.topFlakyPrompts.map((p) => p.id) ?? []
|
|
418
421
|
expect(topFlakyIds).toContain('flaky')
|
|
419
422
|
})
|
|
423
|
+
|
|
424
|
+
test('includes performance metrics with latency stats', async () => {
|
|
425
|
+
const run1Path = `${tempDir}/perf-run1.jsonl`
|
|
426
|
+
const run2Path = `${tempDir}/perf-run2.jsonl`
|
|
427
|
+
|
|
428
|
+
const trial1 = createTrialResult('test-001', 0.9, 0.7)
|
|
429
|
+
const trial2 = createTrialResult('test-001', 0.8, 0.6)
|
|
430
|
+
|
|
431
|
+
await Bun.write(run1Path, JSON.stringify(trial1))
|
|
432
|
+
await Bun.write(run2Path, JSON.stringify(trial2))
|
|
433
|
+
|
|
434
|
+
const report = await runTrialsCompare({
|
|
435
|
+
runs: [
|
|
436
|
+
{ label: 'run1', path: run1Path },
|
|
437
|
+
{ label: 'run2', path: run2Path },
|
|
438
|
+
],
|
|
439
|
+
progress: false,
|
|
440
|
+
})
|
|
441
|
+
|
|
442
|
+
// Performance should always be present
|
|
443
|
+
expect(report.performance).toBeDefined()
|
|
444
|
+
expect(report.performance.run1).toBeDefined()
|
|
445
|
+
expect(report.performance.run2).toBeDefined()
|
|
446
|
+
|
|
447
|
+
const perf = report.performance.run1
|
|
448
|
+
expect(perf?.latency).toBeDefined()
|
|
449
|
+
expect(perf?.latency.p50).toBeGreaterThan(0)
|
|
450
|
+
expect(perf?.latency.mean).toBeGreaterThan(0)
|
|
451
|
+
expect(perf?.latency.min).toBeGreaterThan(0)
|
|
452
|
+
expect(perf?.latency.max).toBeGreaterThan(0)
|
|
453
|
+
expect(perf?.totalDuration).toBeGreaterThan(0)
|
|
454
|
+
})
|
|
455
|
+
|
|
456
|
+
test('includes quality metrics when scores are present', async () => {
|
|
457
|
+
const run1Path = `${tempDir}/qual-run1.jsonl`
|
|
458
|
+
const run2Path = `${tempDir}/qual-run2.jsonl`
|
|
459
|
+
|
|
460
|
+
// createTrialResult always includes score fields
|
|
461
|
+
const trial1 = createTrialResult('test-001', 0.9, 0.7)
|
|
462
|
+
const trial2 = createTrialResult('test-001', 0.8, 0.6)
|
|
463
|
+
|
|
464
|
+
await Bun.write(run1Path, JSON.stringify(trial1))
|
|
465
|
+
await Bun.write(run2Path, JSON.stringify(trial2))
|
|
466
|
+
|
|
467
|
+
const report = await runTrialsCompare({
|
|
468
|
+
runs: [
|
|
469
|
+
{ label: 'run1', path: run1Path },
|
|
470
|
+
{ label: 'run2', path: run2Path },
|
|
471
|
+
],
|
|
472
|
+
progress: false,
|
|
473
|
+
})
|
|
474
|
+
|
|
475
|
+
// Quality should be present since trials have scores
|
|
476
|
+
expect(report.quality).toBeDefined()
|
|
477
|
+
expect(report.quality?.run1).toBeDefined()
|
|
478
|
+
|
|
479
|
+
const qual = report.quality?.run1
|
|
480
|
+
expect(qual?.type).toBe('trial')
|
|
481
|
+
expect(qual?.avgScore).toBeGreaterThan(0)
|
|
482
|
+
expect(qual?.medianScore).toBeGreaterThan(0)
|
|
483
|
+
expect(qual?.p25Score).toBeDefined()
|
|
484
|
+
expect(qual?.p75Score).toBeDefined()
|
|
485
|
+
})
|
|
486
|
+
|
|
487
|
+
test('omits quality metrics when scores are absent', async () => {
|
|
488
|
+
const run1Path = `${tempDir}/noqual-run1.jsonl`
|
|
489
|
+
const run2Path = `${tempDir}/noqual-run2.jsonl`
|
|
490
|
+
|
|
491
|
+
// Create trials without scores (includeScores=false)
|
|
492
|
+
const trial1 = createTrialResult('test-001', 0, 0, 3, false)
|
|
493
|
+
const trial2 = createTrialResult('test-001', 0, 0, 3, false)
|
|
494
|
+
|
|
495
|
+
await Bun.write(run1Path, JSON.stringify(trial1))
|
|
496
|
+
await Bun.write(run2Path, JSON.stringify(trial2))
|
|
497
|
+
|
|
498
|
+
const report = await runTrialsCompare({
|
|
499
|
+
runs: [
|
|
500
|
+
{ label: 'run1', path: run1Path },
|
|
501
|
+
{ label: 'run2', path: run2Path },
|
|
502
|
+
],
|
|
503
|
+
progress: false,
|
|
504
|
+
})
|
|
505
|
+
|
|
506
|
+
// Quality should NOT be present since no trials have scores
|
|
507
|
+
expect(report.quality).toBeUndefined()
|
|
508
|
+
|
|
509
|
+
// Performance should still be present
|
|
510
|
+
expect(report.performance).toBeDefined()
|
|
511
|
+
expect(report.performance.run1?.latency.mean).toBeGreaterThan(0)
|
|
512
|
+
})
|
|
513
|
+
|
|
514
|
+
test('statistical strategy computes CIs for quality and performance', async () => {
|
|
515
|
+
const run1Path = `${tempDir}/ci-qp-run1.jsonl`
|
|
516
|
+
const run2Path = `${tempDir}/ci-qp-run2.jsonl`
|
|
517
|
+
|
|
518
|
+
const trials1 = [
|
|
519
|
+
createTrialResult('p1', 0.9, 0.8),
|
|
520
|
+
createTrialResult('p2', 0.85, 0.7),
|
|
521
|
+
createTrialResult('p3', 0.95, 0.9),
|
|
522
|
+
]
|
|
523
|
+
const trials2 = [
|
|
524
|
+
createTrialResult('p1', 0.6, 0.4),
|
|
525
|
+
createTrialResult('p2', 0.5, 0.3),
|
|
526
|
+
createTrialResult('p3', 0.7, 0.5),
|
|
527
|
+
]
|
|
528
|
+
|
|
529
|
+
await Bun.write(run1Path, trials1.map((t) => JSON.stringify(t)).join('\n'))
|
|
530
|
+
await Bun.write(run2Path, trials2.map((t) => JSON.stringify(t)).join('\n'))
|
|
531
|
+
|
|
532
|
+
const report = await runTrialsCompare({
|
|
533
|
+
runs: [
|
|
534
|
+
{ label: 'high', path: run1Path },
|
|
535
|
+
{ label: 'low', path: run2Path },
|
|
536
|
+
],
|
|
537
|
+
strategy: 'statistical',
|
|
538
|
+
progress: false,
|
|
539
|
+
})
|
|
540
|
+
|
|
541
|
+
// Quality CIs
|
|
542
|
+
const highQual = report.quality?.high
|
|
543
|
+
expect(highQual).toBeDefined()
|
|
544
|
+
expect(highQual?.confidenceIntervals).toBeDefined()
|
|
545
|
+
expect(highQual?.confidenceIntervals?.avgScore).toBeDefined()
|
|
546
|
+
|
|
547
|
+
const qualCI = highQual?.confidenceIntervals?.avgScore
|
|
548
|
+
expect(qualCI).toHaveLength(2)
|
|
549
|
+
expect(qualCI?.[0]).toBeLessThanOrEqual(qualCI?.[1] ?? 0)
|
|
550
|
+
|
|
551
|
+
// Performance CIs
|
|
552
|
+
const highPerf = report.performance.high
|
|
553
|
+
expect(highPerf).toBeDefined()
|
|
554
|
+
expect(highPerf?.confidenceIntervals).toBeDefined()
|
|
555
|
+
expect(highPerf?.confidenceIntervals?.latencyMean).toBeDefined()
|
|
556
|
+
|
|
557
|
+
const perfCI = highPerf?.confidenceIntervals?.latencyMean
|
|
558
|
+
expect(perfCI).toHaveLength(2)
|
|
559
|
+
expect(perfCI?.[0]).toBeLessThanOrEqual(perfCI?.[1] ?? 0)
|
|
560
|
+
})
|
|
561
|
+
|
|
562
|
+
test('markdown output includes quality and performance tables', async () => {
|
|
563
|
+
const run1Path = `${tempDir}/md-qp-run1.jsonl`
|
|
564
|
+
const run2Path = `${tempDir}/md-qp-run2.jsonl`
|
|
565
|
+
const outputPath = `${tempDir}/qp-report.md`
|
|
566
|
+
|
|
567
|
+
const trial1 = createTrialResult('test-001', 0.9, 0.7)
|
|
568
|
+
const trial2 = createTrialResult('test-001', 0.8, 0.6)
|
|
569
|
+
|
|
570
|
+
await Bun.write(run1Path, JSON.stringify(trial1))
|
|
571
|
+
await Bun.write(run2Path, JSON.stringify(trial2))
|
|
572
|
+
|
|
573
|
+
await runTrialsCompare({
|
|
574
|
+
runs: [
|
|
575
|
+
{ label: 'agent1', path: run1Path },
|
|
576
|
+
{ label: 'agent2', path: run2Path },
|
|
577
|
+
],
|
|
578
|
+
outputPath,
|
|
579
|
+
format: 'markdown',
|
|
580
|
+
progress: false,
|
|
581
|
+
})
|
|
582
|
+
|
|
583
|
+
const content = await Bun.file(outputPath).text()
|
|
584
|
+
|
|
585
|
+
// Should contain quality and performance sections
|
|
586
|
+
expect(content).toContain('## Quality (Scores)')
|
|
587
|
+
expect(content).toContain('## Performance (Latency)')
|
|
588
|
+
expect(content).toContain('Avg Score')
|
|
589
|
+
expect(content).toContain('P50 (ms)')
|
|
590
|
+
expect(content).toContain('Mean (ms)')
|
|
591
|
+
})
|
|
420
592
|
})
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unit tests for compare-utils shared utilities.
|
|
3
|
+
*
|
|
4
|
+
* @remarks
|
|
5
|
+
* Tests for percentile, computeLatencyStats, and computeScoreDistribution.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { describe, expect, test } from 'bun:test'
|
|
11
|
+
import { computeLatencyStats, computeScoreDistribution, percentile } from '../compare-utils.ts'
|
|
12
|
+
|
|
13
|
+
// ============================================================================
|
|
14
|
+
// percentile Tests
|
|
15
|
+
// ============================================================================
|
|
16
|
+
|
|
17
|
+
describe('percentile', () => {
|
|
18
|
+
test('computes correct percentile values', () => {
|
|
19
|
+
const sorted = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
|
|
20
|
+
|
|
21
|
+
expect(percentile(sorted, 0.5)).toBe(60)
|
|
22
|
+
expect(percentile(sorted, 0.25)).toBe(30)
|
|
23
|
+
expect(percentile(sorted, 0.75)).toBe(80)
|
|
24
|
+
expect(percentile(sorted, 0.9)).toBe(100)
|
|
25
|
+
})
|
|
26
|
+
|
|
27
|
+
test('returns 0 for empty array', () => {
|
|
28
|
+
expect(percentile([], 0.5)).toBe(0)
|
|
29
|
+
})
|
|
30
|
+
|
|
31
|
+
test('handles single-element array', () => {
|
|
32
|
+
expect(percentile([42], 0.5)).toBe(42)
|
|
33
|
+
expect(percentile([42], 0.0)).toBe(42)
|
|
34
|
+
expect(percentile([42], 1.0)).toBe(42)
|
|
35
|
+
})
|
|
36
|
+
|
|
37
|
+
test('handles p=0 and p=1 boundary values', () => {
|
|
38
|
+
const sorted = [10, 20, 30]
|
|
39
|
+
|
|
40
|
+
expect(percentile(sorted, 0)).toBe(10)
|
|
41
|
+
expect(percentile(sorted, 1)).toBe(30)
|
|
42
|
+
})
|
|
43
|
+
})
|
|
44
|
+
|
|
45
|
+
// ============================================================================
|
|
46
|
+
// computeLatencyStats Tests
|
|
47
|
+
// ============================================================================
|
|
48
|
+
|
|
49
|
+
describe('computeLatencyStats', () => {
|
|
50
|
+
test('returns correct stats for typical durations', () => {
|
|
51
|
+
const durations = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
|
|
52
|
+
const stats = computeLatencyStats(durations)
|
|
53
|
+
|
|
54
|
+
expect(stats.min).toBe(100)
|
|
55
|
+
expect(stats.max).toBe(1000)
|
|
56
|
+
expect(stats.mean).toBe(550)
|
|
57
|
+
expect(stats.p50).toBe(600)
|
|
58
|
+
expect(stats.p90).toBe(1000)
|
|
59
|
+
})
|
|
60
|
+
|
|
61
|
+
test('returns zeros for empty array', () => {
|
|
62
|
+
const stats = computeLatencyStats([])
|
|
63
|
+
|
|
64
|
+
expect(stats.p50).toBe(0)
|
|
65
|
+
expect(stats.p90).toBe(0)
|
|
66
|
+
expect(stats.p99).toBe(0)
|
|
67
|
+
expect(stats.mean).toBe(0)
|
|
68
|
+
expect(stats.min).toBe(0)
|
|
69
|
+
expect(stats.max).toBe(0)
|
|
70
|
+
})
|
|
71
|
+
|
|
72
|
+
test('handles single-element array', () => {
|
|
73
|
+
const stats = computeLatencyStats([42])
|
|
74
|
+
|
|
75
|
+
expect(stats.p50).toBe(42)
|
|
76
|
+
expect(stats.p90).toBe(42)
|
|
77
|
+
expect(stats.mean).toBe(42)
|
|
78
|
+
expect(stats.min).toBe(42)
|
|
79
|
+
expect(stats.max).toBe(42)
|
|
80
|
+
})
|
|
81
|
+
|
|
82
|
+
test('sorts unsorted input', () => {
|
|
83
|
+
const stats = computeLatencyStats([500, 100, 300, 200, 400])
|
|
84
|
+
|
|
85
|
+
expect(stats.min).toBe(100)
|
|
86
|
+
expect(stats.max).toBe(500)
|
|
87
|
+
expect(stats.mean).toBe(300)
|
|
88
|
+
})
|
|
89
|
+
})
|
|
90
|
+
|
|
91
|
+
// ============================================================================
|
|
92
|
+
// computeScoreDistribution Tests
|
|
93
|
+
// ============================================================================
|
|
94
|
+
|
|
95
|
+
describe('computeScoreDistribution', () => {
|
|
96
|
+
test('distributes scores into correct buckets', () => {
|
|
97
|
+
const scores = [0.1, 0.3, 0.5, 0.7, 0.9]
|
|
98
|
+
const dist = computeScoreDistribution(scores)
|
|
99
|
+
|
|
100
|
+
expect(dist['0.0-0.2']).toBe(1)
|
|
101
|
+
expect(dist['0.2-0.4']).toBe(1)
|
|
102
|
+
expect(dist['0.4-0.6']).toBe(1)
|
|
103
|
+
expect(dist['0.6-0.8']).toBe(1)
|
|
104
|
+
expect(dist['0.8-1.0']).toBe(1)
|
|
105
|
+
})
|
|
106
|
+
|
|
107
|
+
test('handles empty scores array', () => {
|
|
108
|
+
const dist = computeScoreDistribution([])
|
|
109
|
+
|
|
110
|
+
expect(dist['0.0-0.2']).toBe(0)
|
|
111
|
+
expect(dist['0.2-0.4']).toBe(0)
|
|
112
|
+
expect(dist['0.4-0.6']).toBe(0)
|
|
113
|
+
expect(dist['0.6-0.8']).toBe(0)
|
|
114
|
+
expect(dist['0.8-1.0']).toBe(0)
|
|
115
|
+
})
|
|
116
|
+
|
|
117
|
+
test('handles boundary values correctly', () => {
|
|
118
|
+
// 0.0 → first bucket, 0.2 → second bucket (not first), 1.0 → last bucket
|
|
119
|
+
const scores = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
|
|
120
|
+
const dist = computeScoreDistribution(scores)
|
|
121
|
+
|
|
122
|
+
expect(dist['0.0-0.2']).toBe(1) // 0.0
|
|
123
|
+
expect(dist['0.2-0.4']).toBe(1) // 0.2
|
|
124
|
+
expect(dist['0.4-0.6']).toBe(1) // 0.4
|
|
125
|
+
expect(dist['0.6-0.8']).toBe(1) // 0.6
|
|
126
|
+
expect(dist['0.8-1.0']).toBe(2) // 0.8, 1.0
|
|
127
|
+
})
|
|
128
|
+
})
|
package/src/schemas/schemas.ts
CHANGED
|
@@ -620,6 +620,8 @@ export type QualityConfidenceIntervals = z.infer<typeof QualityConfidenceInterva
|
|
|
620
620
|
* Quality metrics for a single run in comparison.
|
|
621
621
|
*/
|
|
622
622
|
export const QualityMetricsSchema = z.object({
|
|
623
|
+
/** Discriminator for run-level quality metrics */
|
|
624
|
+
type: z.literal('run'),
|
|
623
625
|
/** Mean grader score (0-1) */
|
|
624
626
|
avgScore: z.number(),
|
|
625
627
|
/** Percentage of pass=true results */
|
|
@@ -923,6 +925,72 @@ export const TrialsFlakinessMetricsSchema = z.object({
|
|
|
923
925
|
/** Trials flakiness metrics type */
|
|
924
926
|
export type TrialsFlakinessMetrics = z.infer<typeof TrialsFlakinessMetricsSchema>
|
|
925
927
|
|
|
928
|
+
/**
|
|
929
|
+
* Confidence intervals for trials quality metrics.
|
|
930
|
+
*/
|
|
931
|
+
export const TrialsQualityConfidenceIntervalsSchema = z.object({
|
|
932
|
+
/** CI for avgScore */
|
|
933
|
+
avgScore: ConfidenceIntervalSchema.optional(),
|
|
934
|
+
})
|
|
935
|
+
|
|
936
|
+
/** Trials quality confidence intervals type */
|
|
937
|
+
export type TrialsQualityConfidenceIntervals = z.infer<typeof TrialsQualityConfidenceIntervalsSchema>
|
|
938
|
+
|
|
939
|
+
/**
|
|
940
|
+
* Quality metrics for trials comparison (score-based).
|
|
941
|
+
*
|
|
942
|
+
* @remarks
|
|
943
|
+
* Aggregates grader scores across all trials for each prompt.
|
|
944
|
+
* Only present when a grader was used during trials capture.
|
|
945
|
+
*/
|
|
946
|
+
export const TrialsQualityMetricsSchema = z.object({
|
|
947
|
+
/** Discriminator for trial-level quality metrics */
|
|
948
|
+
type: z.literal('trial'),
|
|
949
|
+
/** Average score across all trials */
|
|
950
|
+
avgScore: z.number(),
|
|
951
|
+
/** Median score */
|
|
952
|
+
medianScore: z.number(),
|
|
953
|
+
/** 25th percentile score */
|
|
954
|
+
p25Score: z.number(),
|
|
955
|
+
/** 75th percentile score */
|
|
956
|
+
p75Score: z.number(),
|
|
957
|
+
/** Confidence intervals (only with strategy=statistical) */
|
|
958
|
+
confidenceIntervals: TrialsQualityConfidenceIntervalsSchema.optional(),
|
|
959
|
+
})
|
|
960
|
+
|
|
961
|
+
/** Trials quality metrics type */
|
|
962
|
+
export type TrialsQualityMetrics = z.infer<typeof TrialsQualityMetricsSchema>
|
|
963
|
+
|
|
964
|
+
/**
|
|
965
|
+
* Confidence intervals for trials performance metrics.
|
|
966
|
+
*/
|
|
967
|
+
export const TrialsPerformanceConfidenceIntervalsSchema = z.object({
|
|
968
|
+
/** CI for latency mean */
|
|
969
|
+
latencyMean: ConfidenceIntervalSchema.optional(),
|
|
970
|
+
})
|
|
971
|
+
|
|
972
|
+
/** Trials performance confidence intervals type */
|
|
973
|
+
export type TrialsPerformanceConfidenceIntervals = z.infer<typeof TrialsPerformanceConfidenceIntervalsSchema>
|
|
974
|
+
|
|
975
|
+
/**
|
|
976
|
+
* Performance metrics for trials comparison (latency-based).
|
|
977
|
+
*
|
|
978
|
+
* @remarks
|
|
979
|
+
* Aggregates trial durations across all prompts.
|
|
980
|
+
* Always present since TrialEntry.duration is required.
|
|
981
|
+
*/
|
|
982
|
+
export const TrialsPerformanceMetricsSchema = z.object({
|
|
983
|
+
/** End-to-end latency statistics across all trials */
|
|
984
|
+
latency: LatencyStatsSchema,
|
|
985
|
+
/** Sum of all trial durations in milliseconds */
|
|
986
|
+
totalDuration: z.number(),
|
|
987
|
+
/** Confidence intervals (only with strategy=statistical) */
|
|
988
|
+
confidenceIntervals: TrialsPerformanceConfidenceIntervalsSchema.optional(),
|
|
989
|
+
})
|
|
990
|
+
|
|
991
|
+
/** Trials performance metrics type */
|
|
992
|
+
export type TrialsPerformanceMetrics = z.infer<typeof TrialsPerformanceMetricsSchema>
|
|
993
|
+
|
|
926
994
|
/**
|
|
927
995
|
* Per-prompt metrics for trials comparison drill-down.
|
|
928
996
|
*/
|
|
@@ -984,6 +1052,10 @@ export const TrialsComparisonReportSchema = z.object({
|
|
|
984
1052
|
reliability: z.record(z.string(), TrialsReliabilityMetricsSchema),
|
|
985
1053
|
/** Flakiness metrics by run label */
|
|
986
1054
|
flakiness: z.record(z.string(), TrialsFlakinessMetricsSchema),
|
|
1055
|
+
/** Quality metrics by run label (only when grader scores are present) */
|
|
1056
|
+
quality: z.record(z.string(), TrialsQualityMetricsSchema).optional(),
|
|
1057
|
+
/** Performance metrics by run label (always present, uses trial.duration) */
|
|
1058
|
+
performance: z.record(z.string(), TrialsPerformanceMetricsSchema),
|
|
987
1059
|
/** Head-to-head comparison details */
|
|
988
1060
|
headToHead: z.object({
|
|
989
1061
|
/** Pairwise wins by capability */
|
package/src/schemas.ts
CHANGED
|
@@ -113,8 +113,16 @@ export {
|
|
|
113
113
|
TrialsComparisonReportSchema,
|
|
114
114
|
type TrialsFlakinessMetrics,
|
|
115
115
|
TrialsFlakinessMetricsSchema,
|
|
116
|
+
type TrialsPerformanceConfidenceIntervals,
|
|
117
|
+
TrialsPerformanceConfidenceIntervalsSchema,
|
|
118
|
+
type TrialsPerformanceMetrics,
|
|
119
|
+
TrialsPerformanceMetricsSchema,
|
|
116
120
|
type TrialsPromptComparison,
|
|
117
121
|
TrialsPromptComparisonSchema,
|
|
122
|
+
type TrialsQualityConfidenceIntervals,
|
|
123
|
+
TrialsQualityConfidenceIntervalsSchema,
|
|
124
|
+
type TrialsQualityMetrics,
|
|
125
|
+
TrialsQualityMetricsSchema,
|
|
118
126
|
type TrialsReliabilityMetrics,
|
|
119
127
|
TrialsReliabilityMetricsSchema,
|
|
120
128
|
type ValidationResult,
|