@lythos/skill-arena 0.9.1 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,166 @@
1
+ import { zodToJsonSchema } from 'zod-to-json-schema'
2
+ import { ComparativeReport, ScoreCell, ParetoEntry } from '@lythos/test-utils/schema'
3
+ import type { AgentAdapter } from '@lythos/test-utils/agents'
4
+ import type { ArenaManifest } from '@lythos/test-utils/schema'
5
+
6
+ // ── Pareto Frontier (deterministic algorithm) ──────────────────────────────
7
+
8
/**
 * A participant's per-criterion scores together with dominance bookkeeping.
 * Field layout mirrors the entries produced by `computePareto`.
 */
export interface ScoreVector {
  /** Identifier of the participant these scores belong to. */
  participant_id: string
  /** Score keyed by criterion name. */
  scores: { [criterion: string]: number }
  /** Dominance flag for this participant. */
  dominated: boolean
  /** Participant ids recorded as dominating this one. */
  dominated_by: Array<string>
}
14
+
15
/**
 * Derive Pareto dominance for a set of score vectors.
 *
 * A rival dominates a candidate when it scores at least as high on every
 * criterion and strictly higher on at least one. Criteria are the union of
 * all keys seen across participants; a criterion a participant lacks counts
 * as 0. The computation is fully deterministic and is never delegated to an LLM.
 *
 * @param vectors One entry per participant with its per-criterion scores.
 * @returns One entry per input (same order) carrying a copy of the scores,
 *          the `dominated` flag and the ids of all dominating participants.
 */
export function computePareto(vectors: { participant_id: string; scores: Record<string, number> }[]): ParetoEntry[] {
  // Union of every criterion mentioned by any participant.
  const criteria = Array.from(new Set(vectors.flatMap(v => Object.keys(v.scores))))
  const scoreOf = (scores: Record<string, number>, criterion: string): number => scores[criterion] ?? 0

  return vectors.map((candidate, candidateIdx) => {
    const dominators: string[] = []

    // With no criteria at all nobody can dominate anybody.
    if (criteria.length > 0) {
      vectors.forEach((rival, rivalIdx) => {
        if (rivalIdx === candidateIdx) return

        let strictlyBetterSomewhere = false
        for (const criterion of criteria) {
          const theirs = scoreOf(rival.scores, criterion)
          const ours = scoreOf(candidate.scores, criterion)
          // Written as a negated >= so that NaN comparisons fail dominance.
          if (!(theirs >= ours)) return
          if (theirs > ours) strictlyBetterSomewhere = true
        }

        if (strictlyBetterSomewhere && !dominators.includes(rival.participant_id)) {
          dominators.push(rival.participant_id)
        }
      })
    }

    return {
      participant_id: candidate.participant_id,
      scores: { ...candidate.scores },
      dominated: dominators.length > 0,
      dominated_by: dominators,
    }
  })
}
54
+
55
+ // ── Comparative Judge Prompt ──────────────────────────────────────────────
56
+
57
/**
 * Build the natural-language brief for the comparative judge.
 *
 * The prompt lists the task, the participants, the shared criteria and the
 * per-participant verdicts collected by the runner, so that the judge has
 * concrete evidence to score against. (Previously `verdicts` was accepted
 * but never rendered into the prompt.)
 *
 * @param opts.manifest Arena manifest; `task`, `criteria` and `participants` are read.
 * @param opts.verdicts Verdict per participant; each is serialized as compact JSON.
 * @returns The full prompt text.
 */
function buildComparativePrompt(opts: {
  manifest: ArenaManifest
  verdicts: { participantId: string; verdict: unknown }[]
}): string {
  const criteriaDesc = opts.manifest.criteria.join(', ')
  const participants = opts.manifest.participants
    .map(p => `- ${p.id}: ${p.name} (${p.description || 'no description'})`)
    .join('\n')

  // One line of compact JSON per participant. No code fences are used on
  // purpose: the fallback output parser looks for the first fenced block.
  const evidence = opts.verdicts.length > 0
    ? opts.verdicts
        // JSON.stringify yields undefined for an undefined verdict; render that as null.
        .map(v => `- ${v.participantId}: ${JSON.stringify(v.verdict) ?? 'null'}`)
        .join('\n')
    : '(no verdicts were recorded)'

  return `You are a comparative judge evaluating ${opts.manifest.participants.length} participants against shared criteria.

## Task
${opts.manifest.task}

## Participants
${participants}

## Criteria
${criteriaDesc}

## Evidence
${evidence}

## Your Job
For each participant, score them 1-5 on each criterion. Provide a brief rationale.
Base your scores on the evidence above.
Score meanings: 1=poor, 3=acceptable, 5=excellent.

Use the submit_scores tool to return your structured evaluation.`
}
83
+
84
/**
 * Tool definition passed to judges that expose `invokeTool`. The input schema
 * is the JSON-schema rendering of the three report fields the judge must supply.
 */
const SCORE_TOOL = {
  name: 'submit_scores',
  description: 'Submit per-participant scores for each criterion with rationales',
  input_schema: zodToJsonSchema(
    ComparativeReport.pick({
      score_matrix: true,
      key_findings: true,
      recommendations: true,
    })
  ) as Record<string, unknown>,
}
89
+
90
/**
 * Run every raw score entry through `ScoreCell.parse` and collect the results
 * in input order. `manifest` is accepted but not consulted.
 */
function toScoreMatrix(
  manifest: ArenaManifest,
  scores: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]
): typeof ScoreCell._output[] {
  const cells: typeof ScoreCell._output[] = []
  for (const entry of scores) {
    cells.push(ScoreCell.parse(entry))
  }
  return cells
}
96
+
97
+ // ── Comparative Judge ─────────────────────────────────────────────────────
98
+
99
/**
 * Ask a judge agent to score all participants, then assemble the comparative report.
 *
 * The judge supplies only the score matrix, key findings and recommendations.
 * Pareto dominance and weighted totals are computed locally so they stay
 * deterministic.
 *
 * @param opts.manifest Arena manifest (participants, criteria, id).
 * @param opts.verdicts Per-participant verdicts forwarded to the prompt builder.
 * @param opts.judge    Agent adapter; `invokeTool` is used when present, otherwise `spawn`.
 * @param opts.workdir  Working directory handed to the judge.
 * @returns The report as returned by `ComparativeReport.parse`.
 * @throws SyntaxError when the spawn fallback yields output that is not JSON.
 *         The schema `parse` calls presumably throw on invalid data (zod convention).
 */
export async function runComparativeJudge(opts: {
  manifest: ArenaManifest
  verdicts: { participantId: string; verdict: unknown }[]
  judge: AgentAdapter
  workdir: string
}): Promise<typeof ComparativeReport._output> {
  const { manifest, verdicts, judge, workdir } = opts

  const prompt = buildComparativePrompt({ manifest, verdicts })

  let parsed: unknown

  if (judge.invokeTool) {
    // Structured path: the adapter returns the tool input directly.
    parsed = await judge.invokeTool({
      tool: SCORE_TOOL,
      prompt,
      cwd: workdir,
      timeoutMs: 120000,
    })
  } else {
    // Text path: take the first fenced block if there is one, else the whole stdout.
    const result = await judge.spawn({ cwd: workdir, brief: prompt, timeoutMs: 120000 })
    const raw = result.stdout
    const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
    const jsonStr = fenceMatch ? fenceMatch[1].trim() : raw.trim()
    try {
      parsed = JSON.parse(jsonStr)
    } catch (e) {
      // Keep the SyntaxError type but say where the bad JSON came from.
      const detail = e instanceof Error ? e.message : String(e)
      throw new SyntaxError(
        `Comparative judge output is not valid JSON (${detail}); output starts with: ${jsonStr.slice(0, 200)}`
      )
    }
  }

  // Validate LLM output
  const llmResult = ComparativeReport.pick({
    score_matrix: true,
    key_findings: true,
    recommendations: true,
  }).parse(parsed)

  const scoreMatrix = toScoreMatrix(manifest, llmResult.score_matrix)

  // Pareto: deterministic, never delegated to LLM
  const participantScores = manifest.participants.map(p => {
    const pScores: Record<string, number> = {}
    for (const cell of scoreMatrix) {
      if (cell.participant_id === p.id) {
        pScores[cell.criterion] = cell.score
      }
    }
    return { participant_id: p.id, scores: pScores }
  })

  const pareto = computePareto(participantScores)

  // Weighted mean per participant: sum(score * weight) / sum(weight).
  // Normalizing by total weight (not by cell count) gives the same result for
  // unit weights and stays on the score scale for fractional weights.
  const weightedTotals: Record<string, number> = {}
  for (const p of manifest.participants) {
    const pCells = scoreMatrix.filter(c => c.participant_id === p.id)
    const weightSum = pCells.reduce((sum, c) => sum + c.weight, 0)
    const weightedSum = pCells.reduce((sum, c) => sum + c.score * c.weight, 0)
    // No cells, or weights summing to zero: report 0 rather than NaN.
    weightedTotals[p.id] = weightSum > 0 ? weightedSum / weightSum : 0
  }

  return ComparativeReport.parse({
    arena_id: manifest.id,
    generated_at: new Date().toISOString(),
    score_matrix: scoreMatrix,
    weighted_totals: weightedTotals,
    pareto,
    key_findings: llmResult.key_findings ?? [],
    recommendations: llmResult.recommendations ?? [],
  })
}
@@ -0,0 +1,95 @@
1
+ import { describe, test, expect } from 'bun:test'
2
+ import { resolvePlayer, resolveSides, groupBySide, totalRuns } from './player'
3
+ import { parseArenaToml } from './arena-toml'
4
+
5
// Shared fixture: two sides, three runs per side.
const arenaFixtureSource = `
[arena]
task = "Test task"
criteria = ["a", "b"]
runs_per_side = 3

[[side]]
name = "minimal"
player = "claude-code"
deck = "./decks/minimal.toml"

[[side]]
name = "rich"
player = "expert-architect"
deck = "./decks/rich.toml"
`
const toml = parseArenaToml(arenaFixtureSource)
21
+
22
describe('resolvePlayer', () => {
  // [test title, input, expected platform]
  const cases: [string, string, string][] = [
    ['maps claude-code → claude', 'claude-code', 'claude'],
    ['maps Claude → claude (case insensitive)', 'Claude', 'claude'],
    ['maps kimi → kimi', 'kimi', 'kimi'],
    ['passes through unknown player names', 'expert-architect', 'expert-architect'],
    ['trims whitespace', ' claude-code ', 'claude'],
  ]

  for (const [title, input, expected] of cases) {
    test(title, () => {
      expect(resolvePlayer(input)).toBe(expected)
    })
  }
})
43
+
44
describe('resolveSides', () => {
  test('resolves all sides in arena.toml', () => {
    const resolved = resolveSides(toml)
    expect(resolved).toHaveLength(2)

    const [first, second] = resolved
    expect(first.platform).toBe('claude')
    expect(second.platform).toBe('expert-architect')
    expect(first.playerName).toBe('claude-code')
  })

  test('preserves side config', () => {
    const [first] = resolveSides(toml)
    expect(first.side.name).toBe('minimal')
    expect(first.side.deck).toBe('./decks/minimal.toml')
  })
})
59
+
60
describe('groupBySide', () => {
  test('groups by side name with run count', () => {
    const grouped = groupBySide(toml)
    expect(grouped).toHaveLength(2)

    const [first, second] = grouped
    expect(first.runs).toBe(3) // runs_per_side
    expect(second.runs).toBe(3)
    expect(first.platform).toBe('claude')
  })

  test('control flag preserved', () => {
    const controlSource = `
[arena]
task = "x"
criteria = ["a"]

[[side]]
name = "test"
player = "claude-code"
deck = "a.toml"

[[side]]
name = "baseline"
player = "claude-code"
deck = "b.toml"
control = true
`
    const grouped = groupBySide(parseArenaToml(controlSource))
    expect(grouped[1].control).toBe(true)
  })
})
90
+
91
describe('totalRuns', () => {
  test('calculates sides × runs_per_side', () => {
    // Fixture has 2 sides × 3 runs each.
    const total = totalRuns(toml)
    expect(total).toBe(6)
  })
})
package/src/player.ts ADDED
@@ -0,0 +1,71 @@
1
+ import type { Side, ArenaToml } from './arena-toml'
2
+
3
+ // ── Player reference resolution (pure function) ────────────────────────────
4
+ // Maps arena.toml player names → platform identifiers.
5
+ // AgentAdapter creation is the IO layer's job (T4), not ours.
6
+
7
/** A side from arena.toml paired with the platform its player resolves to. */
export interface ResolvedSide {
  /** The side entry exactly as it appears in the parsed arena.toml. */
  side: Side
  /** Resolved platform identifier, intended for `useAgent()`. */
  platform: string
  /** The player reference as originally written on the side. */
  playerName: string
}
12
+
13
/**
 * Built-in player registry. Player names that map directly to useAgent platforms.
 *
 * Kept as a Map rather than an object literal so that lookups never fall
 * through to `Object.prototype` (e.g. the names "constructor" or "__proto__").
 */
const BUILTIN_PLAYERS: ReadonlyMap<string, string> = new Map([
  ['claude', 'claude'],
  ['claude-code', 'claude'],
  ['kimi', 'kimi'],
  ['cursor', 'cursor'],
  ['gemini', 'gemini'],
])

/**
 * Resolve a player reference to its platform identifier.
 * - The name is lower-cased and trimmed before lookup
 * - Built-in names (claude, claude-code, kimi, cursor, gemini) map to their platform
 * - Unknown names are passed through in normalized form (assumed to be useAgent-compatible)
 * - Future: custom player.toml files will override built-in mappings
 *
 * @param name Player reference as written in arena.toml.
 * @returns The platform identifier; always a string.
 */
export function resolvePlayer(name: string): string {
  const normalized = name.toLowerCase().trim()
  return BUILTIN_PLAYERS.get(normalized) ?? normalized
}
32
+
33
/**
 * Pair every side of the arena.toml with its resolved platform.
 * Side order is preserved. No IO happens here and no agents are created.
 */
export function resolveSides(toml: ArenaToml): ResolvedSide[] {
  const resolved: ResolvedSide[] = []
  for (const side of toml.side) {
    const playerName = side.player
    resolved.push({ side, platform: resolvePlayer(playerName), playerName })
  }
  return resolved
}
44
+
45
+ // ── Side grouping (for per-side aggregation in T3) ─────────────────────────
46
+
47
/** Flattened per-side record built by `groupBySide`. */
export interface SideGroup {
  /** Name of the side. */
  sideName: string
  /** Player reference as written on the side. */
  player: string
  /** Deck path configured for the side. */
  deck: string
  /** The side's `control` flag. */
  control: boolean
  /** Number of runs for the side (the arena's `runs_per_side`). */
  runs: number
  /** Platform the player resolves to. */
  platform: string
}
55
+
56
/**
 * Produce one `SideGroup` per side of the arena.toml, in side order, for
 * per-side statistical aggregation. Every group carries the arena-wide
 * `runs_per_side` as its run count.
 */
export function groupBySide(toml: ArenaToml): SideGroup[] {
  const groups: SideGroup[] = []
  for (const { side, playerName, platform } of resolveSides(toml)) {
    groups.push({
      sideName: side.name,
      player: playerName,
      deck: side.deck,
      control: side.control,
      runs: toml.arena.runs_per_side,
      platform,
    })
  }
  return groups
}
67
+
68
/** Total number of runs the arena.toml describes: number of sides × `runs_per_side`. */
export function totalRuns(toml: ArenaToml): number {
  const { side: sides, arena } = toml
  const sideCount = sides.length
  return sideCount * arena.runs_per_side
}
package/src/runner.ts ADDED
@@ -0,0 +1,250 @@
1
+ import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
2
+ import { join, resolve } from 'node:path'
3
+ import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
4
+ import { useAgent } from '@lythos/test-utils/agents'
5
+ import { ArenaManifest, Player } from '@lythos/test-utils/schema'
6
+ import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
7
+ import { runComparativeJudge } from './comparative-judge'
8
+ import { parseArenaToml, buildExecutionPlan, type ArenaToml } from './arena-toml'
9
+ import { resolvePlayer, resolveSides } from './player'
10
+ import { aggregateAllStats } from './stats'
11
+ import type { SideStats } from './stats'
12
+
13
+ // ── Helpers ───────────────────────────────────────────────────────────────
14
+
15
/** Local-time timestamp of the form `YYYYMMDD-HHMMSS`, used to build arena ids. */
function stamp(): string {
  const now = new Date()
  const two = (n: number): string => String(n).padStart(2, '0')

  const datePart = `${now.getFullYear()}${two(now.getMonth() + 1)}${two(now.getDate())}`
  const timePart = [now.getHours(), now.getMinutes(), now.getSeconds()].map(n => two(n)).join('')

  return `${datePart}-${timePart}`
}
19
+
20
+ // ── Declarative runner (arena.toml → execute) ─────────────────────────────
21
+
22
/** Outcome of a full (non-dry-run) arena execution. */
export interface ArenaResult {
  /** The arena manifest as last written to `arena.json`. */
  manifest: ArenaManifestType
  /** Comparative judge report; left untyped for consumers. */
  report: unknown
  /** Aggregated statistics, one entry per side. */
  stats: Array<SideStats>
  /** Directory where the run's artifacts were written. */
  artifactsDir: string
}
28
+
29
/**
 * Execute an arena described by a parsed arena.toml.
 *
 * Steps: build the execution plan; on `dryRun` return just the plan.
 * Otherwise write the manifest, run one agent scenario per plan cell
 * (failures become ERROR verdicts instead of aborting), aggregate per-side
 * stats, run the comparative judge, write `report.md`, and rewrite the
 * manifest with status `completed`.
 *
 * @param opts.toml     Parsed arena configuration.
 * @param opts.taskPath Path to the task/scenario file.
 * @param opts.outDir   Artifact directory; defaults to `<cwd>/runs/<arena id>`.
 * @param opts.dryRun   When truthy, nothing is executed or written.
 */
export async function runArenaFromToml(opts: {
  toml: ArenaToml
  taskPath: string
  outDir?: string
  dryRun?: boolean
}): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
  const plan = buildExecutionPlan(opts.toml)

  // dry-run: return plan without executing
  if (opts.dryRun) return { plan }

  const { toml, taskPath } = opts
  const arenaId = `arena-${stamp()}`
  const artifactsDir = opts.outDir || join(process.cwd(), 'runs', arenaId)
  const resolved = resolveSides(toml)

  // One participant per distinct side name; a later duplicate replaces the
  // earlier value but keeps the earlier position (Map semantics).
  const bySideName = new Map<string, (typeof resolved)[number]>()
  for (const r of resolved) bySideName.set(r.side.name, r)

  const manifest = ArenaManifest.parse({
    id: arenaId,
    created_at: new Date().toISOString(),
    task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
    mode: 'decks',
    participants: Array.from(bySideName.values(), r => ({
      id: r.side.name,
      name: r.side.name,
      player: r.platform,
      deck: r.side.deck,
      description: `${r.playerName} × ${r.side.deck}`,
    })),
    criteria: toml.arena.criteria,
    status: 'running',
  })

  const manifestPath = join(artifactsDir, 'arena.json')
  mkdirSync(artifactsDir, { recursive: true })
  writeFileSync(manifestPath, JSON.stringify(manifest, null, 2) + '\n')

  // Verdicts accumulate per side, in plan order.
  const verdictsBySide = new Map<string, JudgeVerdict[]>()
  const record = (side: string, verdict: JudgeVerdict): void => {
    const bucket = verdictsBySide.get(side)
    if (bucket) bucket.push(verdict)
    else verdictsBySide.set(side, [verdict])
  }

  for (const cell of plan.cells) {
    const sideDir = join(artifactsDir, 'runs', cell.side)
    mkdirSync(join(sideDir, `run-${cell.run}`), { recursive: true })

    let verdict: JudgeVerdict
    try {
      const agent = useAgent(resolvePlayer(cell.player))
      const outcome = await runAgentScenario({
        scenarioPath: resolve(taskPath),
        agent,
        // Seed the scenario workdir with this cell's deck.
        setupWorkdir: (_scenario: AgentScenario, workdir: string) => {
          mkdirSync(workdir, { recursive: true })
          const deckContent = readFileSync(resolve(cell.deck), 'utf-8')
          writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
        },
        baseDir: sideDir,
      })

      verdict = (outcome.verdict ?? {
        verdict: 'ERROR' as const,
        reason: 'No verdict returned',
        criteria: [],
      }) as JudgeVerdict
    } catch (e) {
      verdict = {
        verdict: 'ERROR' as const,
        reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
        criteria: [],
      }
    }
    record(cell.side, verdict)
  }

  const stats = aggregateAllStats(verdictsBySide)

  // The comparative judge sees only the first run's verdict of each side.
  const flatVerdicts: { participantId: string; verdict: unknown }[] = [...verdictsBySide]
    .filter(([, sideVerdicts]) => sideVerdicts.length > 0)
    .map(([side, sideVerdicts]) => ({ participantId: side, verdict: sideVerdicts[0] }))

  // The judge runs on the first side's platform, or 'claude' when there are no sides.
  const judge = useAgent(resolved[0]?.platform ?? 'claude')
  const report = await runComparativeJudge({
    manifest,
    verdicts: flatVerdicts,
    judge,
    workdir: artifactsDir,
  })

  writeReport(artifactsDir, manifest, report, stats)

  const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
  writeFileSync(manifestPath, JSON.stringify(finalManifest, null, 2) + '\n')

  return { manifest: finalManifest, report, stats, artifactsDir }
}
135
+
136
+ // ── Backward compat: CLI-flag style runner ─────────────────────────────────
137
+
138
/**
 * Backward-compatible entry point driven by CLI-style flags.
 *
 * Builds an in-memory `ArenaToml` with one side per (player, deck) pair and
 * a single run per side, then delegates to `runArenaFromToml`.
 *
 * @param opts.taskPath    Path to the task file (its first 200 chars become the task text).
 * @param opts.playerPaths Paths to JSON player files; each is validated with `Player.parse`.
 * @param opts.deckPaths   Deck paths, combined with every player.
 * @param opts.criteria    Criteria names.
 * @param opts.outDir      Artifact directory.
 * @returns Manifest, report and artifact directory of the finished run.
 */
export async function runArena(opts: {
  taskPath: string
  playerPaths: string[]
  deckPaths: string[]
  criteria: string[]
  outDir: string
}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
  const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts

  const task = readFileSync(resolve(taskPath), 'utf-8').slice(0, 200)

  // Read and validate each player file once, instead of once per deck.
  // With no decks there are no sides, so the player files are not read at all.
  const platforms = deckPaths.length === 0
    ? []
    : playerPaths.map(playerPath => Player.parse(JSON.parse(readFileSync(resolve(playerPath), 'utf-8'))).platform)

  // Convert CLI flags to ArenaToml internally
  const toml: ArenaToml = {
    arena: {
      task,
      criteria,
      runs_per_side: 1,
      max_participants: Math.min(playerPaths.length, deckPaths.length),
    },
    side: platforms.flatMap((platform, pi) =>
      deckPaths.map((deckPath, di) => ({
        // Sequential, zero-padded side names: run-01, run-02, ...
        name: `run-${String(pi * deckPaths.length + di + 1).padStart(2, '0')}`,
        player: platform,
        deck: deckPath,
      }))
    ),
  }

  const result = await runArenaFromToml({ toml, taskPath, outDir })
  // dryRun is never requested here; narrow explicitly instead of casting.
  if ('plan' in result) {
    throw new Error('runArena: unexpected dry-run result from runArenaFromToml')
  }
  const { manifest, report, artifactsDir } = result
  return { manifest, report, artifactsDir }
}
168
+
169
+ // ── Report renderer ────────────────────────────────────────────────────────
170
+
171
/**
 * Render the markdown arena report and write it to `<dir>/report.md`.
 * Sections, in order: header, score matrix, per-side statistics, Pareto
 * frontier, key findings, recommendations.
 */
function writeReport(dir: string, manifest: ArenaManifestType, report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]; pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[]; key_findings?: string[]; recommendations?: { audience: string; recommendation: string }[] }, stats: SideStats[]): void {
  // A section is its heading, a blank line, then its body lines.
  const section = (title: string, body: string[]): string[] => [`## ${title}`, '', ...body]

  const findingLines = (report.key_findings ?? []).map((f: string) => `- ${f}`)
  const recommendationLines = (report.recommendations ?? []).map(
    (r: { audience: string; recommendation: string }) => `- **${r.audience}**: ${r.recommendation}`
  )

  const header: string[] = [
    `# Arena Report: ${manifest.id}`,
    '',
    `**Task**: ${manifest.task}`,
    `**Criteria**: ${manifest.criteria.join(', ')}`,
    `**Date**: ${new Date().toISOString()}`,
  ]

  const lines: string[] = [
    ...header,
    '',
    ...section('Score Matrix', [renderScoreMatrix(report)]),
    '',
    ...section('Per-Side Statistics', [renderStatsTable(stats)]),
    '',
    ...section('Pareto Frontier', [renderPareto(report)]),
    '',
    ...section('Key Findings', findingLines),
    '',
    ...section('Recommendations', recommendationLines),
  ]

  writeFileSync(join(dir, 'report.md'), lines.join('\n') + '\n')
}
202
+
203
/**
 * Render per-side statistics as a markdown table (one row per side), or a
 * placeholder line when there are no stats. Every line ends with a newline.
 */
function renderStatsTable(stats: SideStats[]): string {
  if (stats.length === 0) return 'No statistics available.\n'

  const percent = (fraction: number): string => `${(fraction * 100).toFixed(0)}%`

  const rows = stats.map(side => {
    // meanConfidence is printed as-is with a % sign (not multiplied by 100).
    const confidence = side.meanConfidence != null ? `${side.meanConfidence.toFixed(0)}%` : '-'
    const criteria = side.criteria.map(c => `${c.name}: ${percent(c.mean)}`).join(', ')
    return `| ${side.sideName} | ${side.runs} | ${percent(side.passRate)} | ${confidence} | ${criteria} |`
  })

  return [
    `| Side | Runs | Pass Rate | Mean Confidence | Criteria |`,
    `|------|------|-----------|-----------------|----------|`,
    ...rows,
  ]
    .map(line => `${line}\n`)
    .join('')
}
217
+
218
/**
 * Render the score matrix as a markdown table: one row per criterion, one
 * column per participant, plus a final weighted-total row.
 *
 * The Weight column shows each criterion's share of the total weight, taken
 * from the first cell seen for that criterion. The total row is the weighted
 * mean `sum(score * weight) / sum(weight)` per participant. With equal weights
 * and four criteria this matches the former hard-coded `25%` / plain-average
 * output. Non-finite or non-positive weights count as 0; if no usable weight
 * exists, criteria are treated as equally weighted.
 *
 * @returns The table text (each line newline-terminated), or a placeholder
 *          line when the report has no scores.
 */
function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[] }): string {
  const cells = report.score_matrix
  if (!cells?.length) return 'No scores available.\n'

  const participants = [...new Set(cells.map(s => s.participant_id))]
  const criteria = [...new Set(cells.map(s => s.criterion))]

  const usable = (w: number): number => (Number.isFinite(w) && w > 0 ? w : 0)

  // Criterion weight = weight of the first cell reported for that criterion.
  const criterionWeight = new Map<string, number>()
  for (const s of cells) {
    if (!criterionWeight.has(s.criterion)) criterionWeight.set(s.criterion, usable(s.weight))
  }
  let totalWeight = 0
  for (const w of criterionWeight.values()) totalWeight += w

  const share = (criterion: string): number =>
    totalWeight > 0 ? (criterionWeight.get(criterion) ?? 0) / totalWeight : 1 / criteria.length

  let table = `| Criterion | Weight | ${participants.join(' | ')} |\n`
  table += `|${'---|'.repeat(2 + participants.length)}\n`

  for (const c of criteria) {
    table += `| ${c} | ${(share(c) * 100).toFixed(0)}% | ${participants.map(p => {
      const cell = cells.find(s => s.participant_id === p && s.criterion === c)
      return `**${cell?.score ?? '?'}**`
    }).join(' | ')} |\n`
  }

  table += `| **Weighted Total** | 100% | ${participants.map(p => {
    const own = cells.filter(s => s.participant_id === p)
    const weightSum = own.reduce((sum, s) => sum + usable(s.weight), 0)
    let total = 0
    if (weightSum > 0) {
      total = own.reduce((sum, s) => sum + s.score * usable(s.weight), 0) / weightSum
    } else if (own.length > 0) {
      // No usable weights: fall back to the plain average.
      total = own.reduce((sum, s) => sum + s.score, 0) / own.length
    }
    return `**${total.toFixed(1)}**`
  }).join(' | ')} |\n`

  return table
}
242
+
243
/**
 * Render the Pareto analysis as a markdown bullet list, one bullet per
 * participant, or a placeholder line when the report has no Pareto entries.
 */
function renderPareto(report: unknown & { pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[] }): string {
  const entries = report.pareto
  if (!entries?.length) return 'No Pareto analysis.\n'

  const bullets: string[] = []
  for (const entry of entries) {
    if (entry.dominated) {
      bullets.push(`- **${entry.participant_id}**: dominated by ${entry.dominated_by.join(', ')}`)
    } else {
      bullets.push(`- **${entry.participant_id}**: Pareto-optimal (non-dominated)`)
    }
  }
  return bullets.join('\n')
}
@@ -0,0 +1,111 @@
1
+ import { describe, test, expect } from 'bun:test'
2
+ import { aggregateSideStats, aggregateAllStats } from './stats'
3
+ import type { JudgeVerdict } from '@lythos/test-utils/schema'
4
+
5
/** Build a passing verdict with one passed criterion; `overrides` replace any field. */
function makeVerdict(overrides?: Partial<JudgeVerdict>): JudgeVerdict {
  // Defaults are rebuilt on every call so tests never share the criteria array.
  const defaults: JudgeVerdict = {
    verdict: 'PASS',
    reason: 'OK',
    criteria: [{ name: 'correctness', passed: true }],
  }
  return { ...defaults, ...overrides }
}
13
+
14
+ // ── aggregateSideStats ─────────────────────────────────────────────────────
15
+
16
describe('aggregateSideStats', () => {
  // Every case except the zero-run one aggregates under the side name 'test'.
  const statsFor = (verdicts: JudgeVerdict[]) => aggregateSideStats('test', verdicts)

  test('single run: passRate=1, no variance', () => {
    const result = statsFor([makeVerdict()])
    expect(result.sideName).toBe('test')
    expect(result.runs).toBe(1)
    expect(result.passRate).toBe(1)
    expect(result.failRate).toBe(0)
    expect(result.errorRate).toBe(0)
  })

  test('3 runs: 2 PASS, 1 FAIL', () => {
    const result = statsFor([
      makeVerdict(),
      makeVerdict(),
      makeVerdict({ verdict: 'FAIL', reason: 'bad' }),
    ])
    expect(result.passRate).toBeCloseTo(2 / 3)
    expect(result.failRate).toBeCloseTo(1 / 3)
  })

  test('confidence: mean across runs', () => {
    const result = statsFor([90, 80, 70].map(confidence => makeVerdict({ confidence })))
    expect(result.meanConfidence).toBeCloseTo(80)
    expect(result.confidenceVariance).toBeCloseTo(100) // (100+0+100)/2 = 100
  })

  test('confidence: null when no verdict has it', () => {
    const result = statsFor([makeVerdict(), makeVerdict()])
    expect(result.meanConfidence).toBeNull()
    expect(result.confidenceVariance).toBeNull()
  })

  test('per-criterion pass rate', () => {
    const result = statsFor(
      [true, false, true].map(passed => makeVerdict({ criteria: [{ name: 'accuracy', passed }] }))
    )
    expect(result.criteria).toHaveLength(1)
    const [accuracy] = result.criteria
    expect(accuracy.name).toBe('accuracy')
    expect(accuracy.mean).toBeCloseTo(2 / 3)
  })

  test('per-criterion scores: mean and variance', () => {
    const result = statsFor(
      [5, 3, 4].map(coverage => makeVerdict({ scores: { coverage, relevance: 4 } }))
    )
    const { coverage, relevance } = result.scoreByCriterion
    expect(coverage.mean).toBeCloseTo(4)
    expect(relevance.mean).toBeCloseTo(4)
    expect(relevance.variance).toBe(0) // all 4s
  })

  test('zero runs: all zeros', () => {
    const result = aggregateSideStats('empty', [])
    expect(result.runs).toBe(0)
    expect(result.passRate).toBe(0)
    expect(result.meanConfidence).toBeNull()
  })

  test('handles ERROR verdicts correctly', () => {
    const result = statsFor([
      makeVerdict(),
      makeVerdict({ verdict: 'ERROR', reason: 'parse failed' }),
    ])
    expect(result.passRate).toBe(0.5)
    expect(result.errorRate).toBe(0.5)
  })
})
95
+
96
+ // ── aggregateAllStats ──────────────────────────────────────────────────────
97
+
98
describe('aggregateAllStats', () => {
  test('aggregates multiple sides', () => {
    // Insertion order matters: results are expected in the same side order.
    const verdictsBySide = new Map<string, JudgeVerdict[]>([
      ['side-a', [makeVerdict(), makeVerdict()]],
      ['side-b', [makeVerdict({ verdict: 'FAIL', reason: 'nope' })]],
    ])

    const all = aggregateAllStats(verdictsBySide)
    expect(all).toHaveLength(2)

    const [sideA, sideB] = all
    expect(sideA.sideName).toBe('side-a')
    expect(sideA.passRate).toBe(1)
    expect(sideB.sideName).toBe('side-b')
    expect(sideB.passRate).toBe(0)
  })
})