@lythos/skill-arena 0.9.1 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/stats.ts ADDED
@@ -0,0 +1,117 @@
1
+ import type { JudgeVerdict } from '@lythos/test-utils/schema'
2
+
3
+ // ── Statistical aggregation for runs_per_side ─────────────────────────────
4
+ // All pure functions. Input: N verdicts from N runs. Output: aggregated stats.
5
+
6
/**
 * Summary statistics for one named criterion, collected across the runs of a
 * single side.
 */
export interface CriterionStats {
  /** Criterion name as reported in the verdicts. */
  name: string
  /** Average value of the criterion across the runs that reported it. */
  mean: number
  /** Spread of the criterion value across those runs. */
  variance: number
  /** Lower bound reported for the criterion's value. */
  min: number
  /** Upper bound reported for the criterion's value. */
  max: number
  /** Number of runs that reported this criterion. */
  count: number
}
14
+
15
/**
 * Aggregated results for one side of a comparison over all of its runs.
 */
export interface SideStats {
  /** Label identifying the side. */
  sideName: string
  /** Total number of verdicts (one per run) that were aggregated. */
  runs: number
  /** Fraction of runs whose verdict was PASS (PASS / total). */
  passRate: number
  /** Fraction of runs whose verdict was FAIL. */
  failRate: number
  /** Fraction of runs whose verdict was ERROR. */
  errorRate: number
  /** Mean judge confidence; null if no verdict had confidence. */
  meanConfidence: number | null
  /** Variance of judge confidence; null when it cannot be computed. */
  confidenceVariance: number | null
  /** Pass/fail statistics, one entry per criterion seen in the verdicts. */
  criteria: CriterionStats[]
  /** Numeric score statistics keyed by criterion name. */
  scoreByCriterion: Record<string, { mean: number; variance: number }>
}
26
+
27
+ // ── Helpers ────────────────────────────────────────────────────────────────
28
+
29
/**
 * Arithmetic mean of `values`.
 *
 * @param values - Numbers to average.
 * @returns The average, or 0 for an empty array (so callers never see NaN).
 */
function mean(values: number[]): number {
  const n = values.length
  if (n === 0) return 0

  let total = 0
  for (const v of values) {
    total += v
  }
  return total / n
}
33
+
34
/**
 * Sample variance of `values` (divides by n - 1).
 *
 * @param values - Numbers whose spread is measured.
 * @param m - Optional precomputed mean of `values`; computed via `mean` when omitted.
 * @returns The sample variance, or 0 when fewer than two values are given.
 */
function variance(values: number[], m?: number): number {
  const n = values.length
  if (n < 2) return 0

  const center = m ?? mean(values)
  let sumOfSquares = 0
  for (const v of values) {
    sumOfSquares += (v - center) ** 2
  }
  return sumOfSquares / (n - 1)
}
39
+
40
+ // ── Aggregator ────────────────────────────────────────────────────────────
41
+
42
/**
 * Aggregate the judge verdicts from repeated runs of one side into summary
 * statistics.
 *
 * @param sideName - Label of the side; copied to the result unchanged.
 * @param verdicts - One verdict per run. May be empty, in which case all rates
 *   are 0, both confidence fields are null, and no criteria or scores are
 *   reported.
 * @returns Outcome rates, confidence statistics, per-criterion pass statistics
 *   and per-criterion score statistics.
 */
export function aggregateSideStats(sideName: string, verdicts: JudgeVerdict[]): SideStats {
  const runs = verdicts.length

  // Tally outcomes in one pass. A verdict value other than these three still
  // counts toward `runs` but toward none of the rates.
  let passCount = 0
  let failCount = 0
  let errorCount = 0
  for (const v of verdicts) {
    if (v.verdict === 'PASS') passCount++
    else if (v.verdict === 'FAIL') failCount++
    else if (v.verdict === 'ERROR') errorCount++
  }

  // Confidence: only verdicts that actually carry a confidence value take part.
  const confidences = verdicts
    .map(v => v.confidence)
    .filter((c): c is number => c != null)
  const meanConfidence = confidences.length > 0 ? mean(confidences) : null
  // Reported as null (not 0) when fewer than two confidences exist, because a
  // sample variance is undefined for a single observation.
  const confidenceVariance =
    meanConfidence !== null && confidences.length > 1
      ? variance(confidences, meanConfidence)
      : null

  // Per-criterion pass/fail outcomes from verdict.criteria, grouped by name.
  const outcomesByCriterion = new Map<string, boolean[]>()
  for (const v of verdicts) {
    for (const c of v.criteria ?? []) {
      let outcomes = outcomesByCriterion.get(c.name)
      if (outcomes === undefined) {
        outcomes = []
        outcomesByCriterion.set(c.name, outcomes)
      }
      outcomes.push(c.passed)
    }
  }

  const criteria: CriterionStats[] = []
  for (const [name, outcomes] of outcomesByCriterion) {
    // `outcomes` is never empty: a list is only created when a value is pushed.
    const passed = outcomes.filter(p => p).length
    const passRate = passed / outcomes.length
    criteria.push({
      name,
      mean: passRate, // for criteria, "mean" = pass rate across runs
      // Bernoulli (population) variance p(1 - p); intentionally not the n - 1
      // sample variance used for confidences and scores.
      variance: passRate * (1 - passRate),
      // Observed extremes of the 0/1 outcomes: min is 1 only if every run
      // passed, max is 0 only if every run failed.
      min: passed === outcomes.length ? 1 : 0,
      max: passed > 0 ? 1 : 0,
      count: outcomes.length,
    })
  }

  // Per-criterion numeric scores from verdict.scores (described upstream as
  // 1-5; the range is not validated here).
  const scoresByCriterion = new Map<string, number[]>()
  for (const v of verdicts) {
    if (!v.scores) continue
    for (const [criterion, score] of Object.entries(v.scores)) {
      let scores = scoresByCriterion.get(criterion)
      if (scores === undefined) {
        scores = []
        scoresByCriterion.set(criterion, scores)
      }
      scores.push(score)
    }
  }

  // Object.fromEntries defines own properties, so a criterion literally named
  // "__proto__" is kept as data instead of silently replacing the prototype.
  const scoreByCriterion: Record<string, { mean: number; variance: number }> =
    Object.fromEntries(
      Array.from(
        scoresByCriterion,
        ([criterion, scores]): [string, { mean: number; variance: number }] => {
          const m = mean(scores)
          // variance() already yields 0 for fewer than two scores.
          return [criterion, { mean: m, variance: variance(scores, m) }]
        },
      ),
    )

  return {
    sideName,
    runs,
    // Guard against division by zero when no runs were supplied.
    passRate: runs > 0 ? passCount / runs : 0,
    failRate: runs > 0 ? failCount / runs : 0,
    errorRate: runs > 0 ? errorCount / runs : 0,
    meanConfidence,
    confidenceVariance,
    criteria,
    scoreByCriterion,
  }
}
107
+
108
/**
 * Aggregate stats for all sides.
 *
 * @param verdictsBySide - Map of sideName → verdicts[] (one verdict per run).
 * @returns One `SideStats` per map entry, in the map's iteration order.
 */
export function aggregateAllStats(
  verdictsBySide: Map<string, JudgeVerdict[]>
): SideStats[] {
  return Array.from(verdictsBySide, ([sideName, verdicts]) =>
    aggregateSideStats(sideName, verdicts)
  )
}