@lythos/skill-arena 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -1
- package/src/arena-toml.test.ts +164 -0
- package/src/arena-toml.ts +172 -0
- package/src/cli.ts +95 -10
- package/src/comparative-judge.test.ts +92 -0
- package/src/comparative-judge.ts +166 -0
- package/src/player.test.ts +95 -0
- package/src/player.ts +71 -0
- package/src/runner.ts +250 -0
- package/src/stats.test.ts +111 -0
- package/src/stats.ts +117 -0
package/src/stats.ts
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import type { JudgeVerdict } from '@lythos/test-utils/schema'
|
|
2
|
+
|
|
3
|
+
// ── Statistical aggregation for runs_per_side ─────────────────────────────
|
|
4
|
+
// All pure functions. Input: N verdicts from N runs. Output: aggregated stats.
|
|
5
|
+
|
|
6
|
+
/**
 * Summary statistics for a single named criterion across the runs of one side.
 */
export interface CriterionStats {
  /** Criterion name as reported in the verdicts. */
  name: string
  /** Average value across runs; `aggregateSideStats` stores the pass rate here. */
  mean: number
  /** Spread of the value across runs. */
  variance: number
  /** Lower end of the value range. */
  min: number
  /** Upper end of the value range. */
  max: number
  /** Number of runs that reported this criterion. */
  count: number
}
|
|
14
|
+
|
|
15
|
+
/**
 * Aggregated statistics for one side, computed from the verdicts of its runs.
 */
export interface SideStats {
  /** Label of the side these statistics belong to. */
  sideName: string
  /** Number of verdicts that were aggregated. */
  runs: number
  /** Share of runs whose verdict is PASS (PASS / total). */
  passRate: number
  /** Share of runs whose verdict is FAIL (FAIL / total). */
  failRate: number
  /** Share of runs whose verdict is ERROR (ERROR / total). */
  errorRate: number
  /** Mean of the reported confidences; null if no verdict had confidence. */
  meanConfidence: number | null
  /** Variance of the reported confidences; null when fewer than two verdicts had confidence. */
  confidenceVariance: number | null
  /** Per-criterion pass/fail statistics built from each verdict's `criteria`. */
  criteria: CriterionStats[]
  /** Mean and variance of the values in each verdict's `scores`, keyed by criterion. */
  scoreByCriterion: Record<string, { mean: number; variance: number }>
}
|
|
26
|
+
|
|
27
|
+
// ── Helpers ────────────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
/**
 * Arithmetic mean of `values`.
 *
 * @param values - Numbers to average.
 * @returns The sum divided by the element count, or 0 for an empty array.
 */
function mean(values: number[]): number {
  const n = values.length
  if (n === 0) return 0

  let total = 0
  values.forEach(v => {
    total += v
  })
  return total / n
}
|
|
33
|
+
|
|
34
|
+
/**
 * Sample variance of `values` (sum of squared deviations divided by n - 1).
 *
 * @param values - Numbers to measure.
 * @param m - Precomputed mean of `values`; when omitted it is computed with `mean(values)`.
 * @returns The sample variance, or 0 when there are fewer than two values.
 */
function variance(values: number[], m?: number): number {
  const n = values.length
  if (n < 2) return 0

  const center = m != null ? m : mean(values)
  let squaredDeviations = 0
  values.forEach(v => {
    squaredDeviations += (v - center) ** 2
  })
  return squaredDeviations / (n - 1)
}
|
|
39
|
+
|
|
40
|
+
// ── Aggregator ────────────────────────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
/**
 * Aggregate the verdicts of one side's runs into summary statistics.
 *
 * - `passRate` / `failRate` / `errorRate` are the share of verdicts whose
 *   `verdict` is `'PASS'`, `'FAIL'` or `'ERROR'`; all three are 0 when
 *   `verdicts` is empty.
 * - Confidence statistics skip verdicts whose `confidence` is null/undefined.
 *   `meanConfidence` is null when none remain; `confidenceVariance` is null
 *   when fewer than two remain.
 * - Each `criteria` entry treats a criterion's pass/fail results as 1/0
 *   samples: `mean` is the pass rate, `variance` is the Bernoulli variance
 *   p * (1 - p), and `min` / `max` are the smallest / largest sample observed
 *   (so a criterion that passed in every run reports min = max = 1).
 * - `scoreByCriterion` holds the mean and sample variance of the values found
 *   under each key of `verdict.scores`.
 *
 * @param sideName - Label copied into the result.
 * @param verdicts - Verdicts to aggregate, one per run.
 * @returns The aggregated statistics for this side.
 */
export function aggregateSideStats(sideName: string, verdicts: JudgeVerdict[]): SideStats {
  const runs = verdicts.length
  const passCount = verdicts.filter(v => v.verdict === 'PASS').length
  const failCount = verdicts.filter(v => v.verdict === 'FAIL').length
  const errorCount = verdicts.filter(v => v.verdict === 'ERROR').length

  // Confidence: only verdicts that actually reported one take part.
  const confidences = verdicts.map(v => v.confidence).filter((c): c is number => c != null)
  const meanConfidence = confidences.length > 0 ? mean(confidences) : null
  const confidenceVariance =
    meanConfidence !== null && confidences.length > 1
      ? variance(confidences, meanConfidence)
      : null

  // Per-criterion pass/fail outcomes from verdict.criteria, grouped by name
  // in first-seen order.
  const outcomesByCriterion = new Map<string, boolean[]>()
  for (const v of verdicts) {
    for (const c of v.criteria ?? []) {
      const outcomes = outcomesByCriterion.get(c.name)
      if (outcomes) outcomes.push(c.passed)
      else outcomesByCriterion.set(c.name, [c.passed])
    }
  }

  const criteria: CriterionStats[] = []
  for (const [name, outcomes] of outcomesByCriterion) {
    // `outcomes` always has at least one entry, so the division is safe.
    const passed = outcomes.filter(p => p).length
    const passRate = passed / outcomes.length
    criteria.push({
      name,
      mean: passRate, // for criteria, "mean" = pass rate across runs
      variance: passRate * (1 - passRate), // Bernoulli variance
      min: passed === outcomes.length ? 1 : 0, // 0 as soon as any run failed it
      max: passed > 0 ? 1 : 0, // 1 as soon as any run passed it
      count: outcomes.length,
    })
  }

  // Per-criterion scores from verdict.scores, grouped by key.
  const scoreMap = new Map<string, number[]>()
  for (const v of verdicts) {
    if (!v.scores) continue
    for (const [criterion, score] of Object.entries(v.scores)) {
      const bucket = scoreMap.get(criterion)
      if (bucket) bucket.push(score)
      else scoreMap.set(criterion, [score])
    }
  }

  // Object.fromEntries defines own properties, so a criterion that happens to
  // be called "__proto__" is kept instead of being swallowed by the prototype
  // setter that a plain `record[key] = value` assignment would trigger.
  const scoreByCriterion: Record<string, { mean: number; variance: number }> = Object.fromEntries(
    Array.from(scoreMap, ([criterion, scores]): [string, { mean: number; variance: number }] => {
      const m = mean(scores)
      // variance() already returns 0 for fewer than two samples.
      return [criterion, { mean: m, variance: variance(scores, m) }]
    })
  )

  return {
    sideName,
    runs,
    passRate: runs > 0 ? passCount / runs : 0,
    failRate: runs > 0 ? failCount / runs : 0,
    errorRate: runs > 0 ? errorCount / runs : 0,
    meanConfidence,
    confidenceVariance,
    criteria,
    scoreByCriterion,
  }
}
|
|
107
|
+
|
|
108
|
+
/**
 * Aggregate stats for all sides from a map of sideName → verdicts[].
 *
 * @param verdictsBySide - Verdicts of every side, keyed by side name.
 * @returns One `SideStats` per map entry, in the map's iteration order.
 */
export function aggregateAllStats(
  verdictsBySide: Map<string, JudgeVerdict[]>
): SideStats[] {
  return Array.from(verdictsBySide, ([sideName, verdicts]) =>
    aggregateSideStats(sideName, verdicts)
  )
}
|