@lythos/skill-arena 0.9.2 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.2",
3
+ "version": "0.9.3",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -0,0 +1,164 @@
1
+ import { describe, test, expect } from 'bun:test'
2
+ import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
3
+
4
/**
 * Smallest document the tests treat as valid: one [arena] table with a task
 * and criteria, plus two [[side]] tables. Optional keys are left out so the
 * tests can observe the defaults applied during parsing.
 */
const minimalToml = `
[arena]
task = "Test task"
criteria = ["a", "b"]

[[side]]
name = "runner-a"
player = "claude-code"
deck = "./decks/a.toml"

[[side]]
name = "runner-b"
player = "claude-code"
deck = "./decks/b.toml"
`

/**
 * Fuller document: explicit runs_per_side, three sides, the last of which is
 * flagged as the control and carries a nested [side.env] table.
 */
const fullToml = `
[arena]
task = "Generate auth flow diagram"
criteria = ["syntax", "context", "logic", "token"]
runs_per_side = 3

[[side]]
name = "minimal"
player = "standard-coder"
deck = "./decks/minimal.toml"

[[side]]
name = "rich"
player = "expert-architect"
deck = "./decks/rich.toml"

[[side]]
name = "baseline"
player = "standard-coder"
deck = "./decks/baseline.toml"
control = true

[side.env]
container = "node:20-alpine"
pre_run = ["npm ci", "npm run build"]
working_dir = "/workspace"
`
47
+
48
+ // ── Schema + Parser ────────────────────────────────────────────────────────
49
+
50
describe('parseArenaToml', () => {
  // Two well-formed [[side]] tables reused by every inline document that
  // needs to get past the "at least two sides" rule.
  const twoSides = [
    '[[side]]',
    'name = "a"',
    'player = "c"',
    'deck = "a.toml"',
    '',
    '[[side]]',
    'name = "b"',
    'player = "c"',
    'deck = "b.toml"',
  ].join('\n')

  // Builds "[arena]\n<lines>\n\n<twoSides>".
  const arenaDoc = (...arenaLines: string[]): string =>
    ['[arena]', ...arenaLines, '', twoSides].join('\n')

  test('parses minimal two-side arena', () => {
    const { arena, side } = parseArenaToml(minimalToml)
    expect(arena.task).toBe('Test task')
    expect(arena.criteria).toEqual(['a', 'b'])
    expect(arena.runs_per_side).toBe(1) // default
    expect(side).toHaveLength(2)

    const first = side[0]
    expect(first.name).toBe('runner-a')
    expect(first.player).toBe('claude-code')
    expect(first.deck).toBe('./decks/a.toml')
    expect(first.control).toBe(false) // default
  })

  test('parses full arena with runs_per_side and control', () => {
    const { arena, side } = parseArenaToml(fullToml)
    expect(arena.runs_per_side).toBe(3)
    expect(side).toHaveLength(3)
    expect(side[2].name).toBe('baseline')
    expect(side[2].control).toBe(true)
  })

  test('parses side env block', () => {
    // [side.env] follows the third [[side]], so it belongs to that side.
    const { env } = parseArenaToml(fullToml).side[2]
    expect(env.container).toBe('node:20-alpine')
    expect(env.pre_run).toEqual(['npm ci', 'npm run build'])
    expect(env.working_dir).toBe('/workspace')
    expect(env.env_vars).toEqual({})
  })

  test('rejects fewer than 2 sides', () => {
    const bad = `[arena]\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "only"\nplayer = "c"\ndeck = "x.toml"`
    expect(() => parseArenaToml(bad)).toThrow()
  })

  test('rejects empty criteria', () => {
    const bad = arenaDoc('task = "x"', 'criteria = []')
    expect(() => parseArenaToml(bad)).toThrow()
  })

  test('rejects non-object input', () => {
    expect(() => ArenaToml.parse('not valid')).toThrow()
  })

  test('rejects missing arena section', () => {
    expect(() => parseArenaToml('[[side]]\nname = "a"')).toThrow()
  })

  test('rejects runs_per_side = 0', () => {
    const bad = arenaDoc('task = "x"', 'criteria = ["a"]', 'runs_per_side = 0')
    expect(() => parseArenaToml(bad)).toThrow()
  })

  test('parses integer and boolean values correctly', () => {
    const source = arenaDoc('task = "x"', 'criteria = ["a"]', 'runs_per_side = 2', 'max_participants = 5')
    const { arena } = parseArenaToml(source)
    expect(arena.runs_per_side).toBe(2)
    expect(arena.max_participants).toBe(5)
  })

  test('comments are stripped', () => {
    const source = arenaDoc('# this is a comment', 'task = "x"', 'criteria = ["a"]')
    expect(parseArenaToml(source).arena.task).toBe('x')
  })
})
116
+
117
+ // ── Execution Plan ─────────────────────────────────────────────────────────
118
+
119
describe('buildExecutionPlan', () => {
  // Parse a TOML source and expand it into an execution plan in one step.
  const planFor = (source: string) => buildExecutionPlan(parseArenaToml(source))

  test('generates plan: 2 sides × 1 run = 2 cells', () => {
    const plan = planFor(minimalToml)
    expect(plan.task).toBe('Test task')
    expect(plan.criteria).toEqual(['a', 'b'])
    expect(plan.cells).toHaveLength(2)
    expect(plan.total_runs).toBe(2)

    const [first, second] = plan.cells
    expect(first).toEqual({ side: 'runner-a', player: 'claude-code', deck: './decks/a.toml', run: 1, control: false })
    expect(second).toEqual({ side: 'runner-b', player: 'claude-code', deck: './decks/b.toml', run: 1, control: false })
  })

  test('generates plan: 3 sides × 3 runs = 9 cells', () => {
    const plan = planFor(fullToml)
    expect(plan.cells).toHaveLength(9)
    expect(plan.total_runs).toBe(9)

    // Cells are ordered side-major: side 0 runs 1..3, then side 1 runs 1..3, ...
    const minimalCell = { side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', control: false }
    expect(plan.cells[0]).toEqual({ ...minimalCell, run: 1 })
    expect(plan.cells[1]).toEqual({ ...minimalCell, run: 2 })
    expect(plan.cells[2]).toEqual({ ...minimalCell, run: 3 })
    expect(plan.cells[3]).toEqual({ side: 'rich', player: 'expert-architect', deck: './decks/rich.toml', run: 1, control: false })
    expect(plan.cells[8]).toEqual({ side: 'baseline', player: 'standard-coder', deck: './decks/baseline.toml', run: 3, control: true })
  })

  test('control flag preserved in plan cells', () => {
    const baselineCells = planFor(fullToml).cells.filter(cell => cell.side === 'baseline')
    expect(baselineCells).toHaveLength(3)
    expect(baselineCells.every(cell => cell.control)).toBe(true)
  })

  test('dry-run: plan is pure data, no side effects', () => {
    // Plan generation is a pure function, so a --dry-run only has to print it.
    const plan = planFor(fullToml)
    // Every cell must carry the fields a dry-run listing prints.
    expect(plan.total_runs).toBeGreaterThan(0)
    expect(plan.cells.every(cell => typeof cell.side === 'string')).toBe(true)
    expect(plan.cells.every(cell => typeof cell.player === 'string')).toBe(true)
    expect(plan.cells.every(cell => typeof cell.deck === 'string')).toBe(true)
    expect(plan.cells.every(cell => typeof cell.run === 'number')).toBe(true)
  })
})
@@ -0,0 +1,172 @@
1
+ import { z } from 'zod'
2
+ import type { ArenaManifest } from '@lythos/test-utils/schema'
3
+
4
+ // ── arena.toml Zod schema (declarative input, k8s-manifest style) ──────────
5
+ // Anchored on: ADR-20260502110308316
6
+
7
/**
 * Optional per-side environment block (`[side.env]` in arena.toml).
 * `pre_run` and `env_vars` default to empty collections when omitted;
 * `container` and `working_dir` stay undefined when omitted.
 */
export const SideEnv = z.object({
  container: z.string().optional(),
  pre_run: z.array(z.string()).default([]),
  working_dir: z.string().optional(),
  env_vars: z.record(z.string()).default({}),
})

/** Parsed shape of a `[side.env]` table. */
export type SideEnv = z.infer<typeof SideEnv>
14
+
15
/**
 * One `[[side]]` entry of arena.toml.
 * `control` defaults to false and `env` to an empty SideEnv when omitted.
 */
export const Side = z.object({
  name: z.string(),
  // Reference to a player config (useAgent resolves it).
  player: z.string(),
  // Path to the side's deck.toml.
  deck: z.string(),
  control: z.boolean().default(false),
  env: SideEnv.default({}),
})

/** Parsed shape of a `[[side]]` table. */
export type Side = z.infer<typeof Side>
23
+
24
/** The `[arena]` table: what is being compared and how often each side runs. */
const ArenaSection = z.object({
  // Task description, or a path to a TASK-arena.md file.
  task: z.string(),
  // At least one evaluation criterion is required.
  criteria: z.array(z.string()).min(1),
  // Positive integer; defaults to a single run per side.
  runs_per_side: z.number().int().positive().default(1),
  // Integer in 2..5; defaults to 5.
  // NOTE(review): nothing in this schema ties this value to the number of
  // sides; the side array has its own fixed 2..5 bound below.
  max_participants: z.number().int().min(2).max(5).default(5),
})

/** Root schema for arena.toml: one `[arena]` table plus two to five `[[side]]` tables. */
export const ArenaToml = z.object({
  arena: ArenaSection,
  side: z.array(Side).min(2).max(5),
})

/** Parsed and defaulted arena.toml document. */
export type ArenaToml = z.infer<typeof ArenaToml>
34
+
35
+ // ── Parser ─────────────────────────────────────────────────────────────────
36
+
37
/**
 * Parse arena.toml text and validate it against the ArenaToml schema.
 *
 * The text is first read by the in-file TOML subset parser (no external
 * dependency), then handed to `ArenaToml.parse`, which applies defaults and
 * throws when the document does not satisfy the schema.
 *
 * @param content Raw arena.toml text.
 * @returns The validated document with defaults filled in.
 */
export function parseArenaToml(content: string): ArenaToml {
  return ArenaToml.parse(parseToml(content))
}
42
+
43
+ // ── Plan generation (pure function, dry-run visible) ───────────────────────
44
+
45
/** A single scheduled run: one side executed once. */
export interface ExecutionCell {
  /** Side name. */
  side: string
  /** Player reference, exactly as written on the side. */
  player: string
  /** Deck path, exactly as written on the side. */
  deck: string
  /** 1-indexed run number within the side. */
  run: number
  /** Whether the side is flagged as the control. */
  control: boolean
}

/** Flat, ordered list of everything an arena would execute. */
export interface ExecutionPlan {
  task: string
  criteria: string[]
  cells: ExecutionCell[]
  /** Always equal to `cells.length`. */
  total_runs: number
}

/**
 * Expand a parsed arena.toml into its execution plan.
 *
 * Pure function: each side contributes `runs_per_side` cells, emitted
 * side-major (all runs of the first side, then all runs of the second, ...),
 * with run numbers starting at 1.
 *
 * @param toml Parsed arena.toml document.
 * @returns The plan; `criteria` is the same array instance as `toml.arena.criteria`.
 */
export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
  const { task, criteria, runs_per_side: runsPerSide } = toml.arena

  const cells = toml.side.flatMap(({ name, player, deck, control }) =>
    Array.from(
      { length: runsPerSide },
      (_, index): ExecutionCell => ({ side: name, player, deck, run: index + 1, control }),
    ),
  )

  return { task, criteria, cells, total_runs: cells.length }
}
80
+
81
+ // ── Minimal TOML parser (handles the arena.toml subset without external dep) ──
82
+
83
/**
 * Minimal TOML reader covering the subset arena.toml uses, so no external
 * TOML dependency is needed.
 *
 * Supported:
 * - `# comments` (a `#` inside a quoted string is kept as part of the value)
 * - `[table]` and dotted `[parent.child]` headers; when the first segment
 *   names an array table, the child is attached to that array's latest element
 * - `[[array-of-tables]]` headers
 * - `key = value` where value is a quoted string, `true`/`false`, a decimal
 *   number, a single-line array of those, or a bare word (kept as a string)
 *
 * Not supported: multi-line strings or arrays, nested arrays, inline tables,
 * and escape sequences (quoted strings are taken verbatim, backslashes are
 * not interpreted).
 *
 * Quoted values always stay strings: `"42"` is the string "42", not a number.
 *
 * @param text Raw TOML text.
 * @returns Plain nested objects/arrays; array tables appear as arrays of objects.
 */
function parseToml(text: string): Record<string, unknown> {
  type Table = Record<string, unknown>

  const ARRAY_HEADER = /^\[\[(.+?)\]\]$/
  const TABLE_HEADER = /^\[(.+?)\]$/
  const NUMBER = /^-?\d+(\.\d+)?$/

  // Positions of `target` that are outside single- or double-quoted spans.
  const unquotedIndexes = (source: string, target: string): number[] => {
    const hits: number[] = []
    let quote = ''
    for (let i = 0; i < source.length; i++) {
      const ch = source.charAt(i)
      if (quote !== '') {
        if (ch === quote) quote = ''
      } else if (ch === '"' || ch === "'") {
        quote = ch
      } else if (ch === target) {
        hits.push(i)
      }
    }
    return hits
  }

  // Drop a trailing comment without touching `#` characters inside strings.
  const stripComment = (raw: string): string => {
    const [hash] = unquotedIndexes(raw, '#')
    return hash === undefined ? raw : raw.slice(0, hash)
  }

  // Split array contents on commas that are not inside a quoted element.
  const splitElements = (inner: string): string[] => {
    const parts: string[] = []
    let start = 0
    for (const comma of unquotedIndexes(inner, ',')) {
      parts.push(inner.slice(start, comma).trim())
      start = comma + 1
    }
    parts.push(inner.slice(start).trim())
    return parts
  }

  const isQuoted = (raw: string): boolean => {
    if (raw.length < 2) return false
    const first = raw.charAt(0)
    return (first === '"' || first === "'") && raw.charAt(raw.length - 1) === first
  }

  // Type detection runs on the raw token, so quoting decides string-ness.
  const parseScalar = (raw: string): unknown => {
    if (isQuoted(raw)) return raw.slice(1, -1)
    if (raw === 'true') return true
    if (raw === 'false') return false
    if (NUMBER.test(raw)) return Number(raw)
    return raw
  }

  const parseArray = (raw: string): unknown[] => {
    const inner = raw.slice(1, -1).trim()
    if (!inner) return []
    const parts = splitElements(inner)
    // TOML permits a trailing comma; it must not produce an empty element.
    if (parts[parts.length - 1] === '') parts.pop()
    return parts.map(parseScalar)
  }

  const isTable = (value: unknown): value is Table =>
    typeof value === 'object' && value !== null && !Array.isArray(value)

  const root: Table = {}
  const arrayTables = new Map<string, Table[]>()
  let current: Table = root

  for (const rawLine of text.split('\n')) {
    const line = stripComment(rawLine).trim()
    if (!line) continue

    // [[array]] — start a new element of the named array table.
    const arrayHeader = ARRAY_HEADER.exec(line)
    if (arrayHeader) {
      const name = (arrayHeader[1] ?? '').trim()
      const elements = arrayTables.get(name) ?? []
      arrayTables.set(name, elements)
      current = {}
      elements.push(current)
      continue
    }

    // [table] or [parent.child]
    const tableHeader = TABLE_HEADER.exec(line)
    if (tableHeader) {
      const segments = (tableHeader[1] ?? '').split('.').map(s => s.trim())
      const leaf = segments.pop() ?? ''
      let container: Table = root
      segments.forEach((segment, depth) => {
        // A leading segment naming an array table refers to its latest element.
        const elements = depth === 0 ? arrayTables.get(segment) : undefined
        const latest = elements?.[elements.length - 1]
        if (latest) {
          container = latest
          return
        }
        const existing = container[segment]
        if (isTable(existing)) {
          container = existing
        } else {
          const created: Table = {}
          container[segment] = created
          container = created
        }
      })
      // Re-declaring a table starts it afresh.
      current = {}
      container[leaf] = current
      continue
    }

    // key = value (lines without '=' are ignored)
    const eq = line.indexOf('=')
    if (eq === -1) continue
    const key = line.slice(0, eq).trim()
    const rawValue = line.slice(eq + 1).trim()
    current[key] =
      rawValue.startsWith('[') && rawValue.endsWith(']')
        ? parseArray(rawValue)
        : parseScalar(rawValue)
  }

  // Array tables are collected separately and attached to the result last.
  for (const [name, elements] of arrayTables) {
    root[name] = elements
  }

  return root
}
package/src/cli.ts CHANGED
@@ -35,7 +35,7 @@ Usage:
35
35
  lythoskill-arena viz <arena-dir>
36
36
 
37
37
  Commands:
38
- run Run arena programmatically (cartesian player × deck → judge → report)
38
+ run Run arena programmatically (declarative arena.toml or CLI flags)
39
39
  scaffold Create arena directory structure (legacy, manual subagent execution)
40
40
  viz Visualize arena report (ASCII charts)
41
41
 
@@ -44,14 +44,23 @@ Options:
44
44
  -s, --skills <list> Comma-separated skill names (scaffold only)
45
45
  --decks <list> Comma-separated deck paths
46
46
  -c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
47
- --players <list> Comma-separated player.toml paths (run only)
47
+ --players <list> Comma-separated player.toml paths (CLI run only)
48
+ --config <path> Path to arena.toml (declarative mode, k8s-style)
49
+ --dry-run Print execution plan without running (with --config)
48
50
  --control <skill> Control skill for comparison (scaffold only)
49
51
  --out <dir> Output directory (run: defaults to runs/arena-<id>)
50
52
  -d, --dir <dir> Output directory (scaffold: defaults to tmp)
51
53
  -p, --project <dir> Project directory (default: .)
52
54
 
53
55
  Examples:
54
- lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml,./players/kimi.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
56
+ # Declarative mode (k8s-style)
57
+ lythoskill-arena run --config ./arena.toml
58
+ lythoskill-arena run --config ./arena.toml --dry-run
59
+
60
+ # CLI-flag mode (backward compat)
61
+ lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
62
+
63
+ # Legacy scaffolding
55
64
  lythoskill-arena scaffold --task "Refactor auth module" --skills skill-a,skill-b
56
65
  lythoskill-arena viz runs/arena-20260504
57
66
  `)
@@ -563,9 +572,45 @@ function runViz(argv: string[]) {
563
572
 
564
573
  async function runProgrammaticArena(argv: string[]) {
565
574
  const { options } = parseArgs(argv)
575
+ const { readFileSync } = await import('node:fs')
576
+
577
+ const hasConfig = !!(options as Record<string, string | undefined>).config
578
+ const dryRun = argv.includes('--dry-run')
579
+
580
+ if (hasConfig) {
581
+ // arena.toml declarative mode
582
+ const { parseArenaToml } = await import('./arena-toml')
583
+ const { runArenaFromToml } = await import('./runner')
584
+ const configPath = (options as Record<string, string | undefined>).config!
585
+
586
+ const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
587
+ const result = await runArenaFromToml({
588
+ toml,
589
+ taskPath: toml.arena.task.startsWith('/') || toml.arena.task.startsWith('./')
590
+ ? toml.arena.task
591
+ : (options as Record<string, string | undefined>).task ?? toml.arena.task,
592
+ outDir: (options as Record<string, string | undefined>).out,
593
+ dryRun,
594
+ })
595
+
596
+ if ('plan' in result) {
597
+ // dry-run
598
+ console.log(`\n📋 Dry-run: ${result.plan.total_runs} cells across ${result.plan.cells.length / Math.max(1, toml.arena.runs_per_side)} sides × ${toml.arena.runs_per_side} runs`)
599
+ for (const cell of result.plan.cells) {
600
+ console.log(` ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
601
+ }
602
+ return
603
+ }
604
+
605
+ console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
606
+ console.log(`📁 Artifacts: ${result.artifactsDir}`)
607
+ console.log(`📊 Report: ${result.artifactsDir}/report.md`)
608
+ return
609
+ }
566
610
 
611
+ // CLI-flag mode (backward compat)
567
612
  if (!options.task || !options.decks) {
568
- console.error('❌ --task <path> and --decks <list> are required for "run"')
613
+ console.error('❌ --task <path> and --decks <list> are required for "run" (or use --config <arena.toml>)')
569
614
  process.exit(1)
570
615
  }
571
616
 
@@ -577,7 +622,6 @@ async function runProgrammaticArena(argv: string[]) {
577
622
  deckPaths: options.decks.split(',').map(s => s.trim()).filter(Boolean),
578
623
  criteria: (options.criteria ?? 'syntax,context,logic,token').split(',').map(s => s.trim()).filter(Boolean),
579
624
  outDir: options.out ?? `runs/arena-${timestamp()}`,
580
- projectDir: options.project,
581
625
  })
582
626
 
583
627
  console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
@@ -0,0 +1,95 @@
1
+ import { describe, test, expect } from 'bun:test'
2
+ import { resolvePlayer, resolveSides, groupBySide, totalRuns } from './player'
3
+ import { parseArenaToml } from './arena-toml'
4
+
5
// Shared fixture: two sides, three runs each. "claude-code" is a built-in
// player name; "expert-architect" is not, so it exercises the pass-through path.
const toml = parseArenaToml(`
[arena]
task = "Test task"
criteria = ["a", "b"]
runs_per_side = 3

[[side]]
name = "minimal"
player = "claude-code"
deck = "./decks/minimal.toml"

[[side]]
name = "rich"
player = "expert-architect"
deck = "./decks/rich.toml"
`)
21
+
22
describe('resolvePlayer', () => {
  // Each case is a player reference and the platform it must resolve to.
  const expectPlatform = (reference: string, platform: string): void => {
    expect(resolvePlayer(reference)).toBe(platform)
  }

  test('maps claude-code → claude', () => {
    expectPlatform('claude-code', 'claude')
  })

  test('maps Claude → claude (case insensitive)', () => {
    expectPlatform('Claude', 'claude')
  })

  test('maps kimi → kimi', () => {
    expectPlatform('kimi', 'kimi')
  })

  test('passes through unknown player names', () => {
    expectPlatform('expert-architect', 'expert-architect')
  })

  test('trims whitespace', () => {
    expectPlatform(' claude-code ', 'claude')
  })
})
43
+
44
describe('resolveSides', () => {
  test('resolves all sides in arena.toml', () => {
    const resolved = resolveSides(toml)
    expect(resolved).toHaveLength(2)

    const [first, second] = resolved
    expect(first.platform).toBe('claude')
    expect(second.platform).toBe('expert-architect')
    // The original reference is kept next to the resolved platform.
    expect(first.playerName).toBe('claude-code')
  })

  test('preserves side config', () => {
    const { side } = resolveSides(toml)[0]
    expect(side.name).toBe('minimal')
    expect(side.deck).toBe('./decks/minimal.toml')
  })
})
59
+
60
describe('groupBySide', () => {
  test('groups by side name with run count', () => {
    const groups = groupBySide(toml)
    expect(groups).toHaveLength(2)

    const [first, second] = groups
    expect(first.runs).toBe(3) // runs_per_side
    expect(second.runs).toBe(3)
    expect(first.platform).toBe('claude')
  })

  test('control flag preserved', () => {
    // Only the second side sets control = true.
    const controlToml = parseArenaToml(`
[arena]
task = "x"
criteria = ["a"]

[[side]]
name = "test"
player = "claude-code"
deck = "a.toml"

[[side]]
name = "baseline"
player = "claude-code"
deck = "b.toml"
control = true
`)
    const baseline = groupBySide(controlToml)[1]
    expect(baseline.control).toBe(true)
  })
})
90
+
91
describe('totalRuns', () => {
  test('calculates sides × runs_per_side', () => {
    // The shared fixture has 2 sides × 3 runs.
    const total = totalRuns(toml)
    expect(total).toBe(6)
  })
})
package/src/player.ts ADDED
@@ -0,0 +1,71 @@
1
+ import type { Side, ArenaToml } from './arena-toml'
2
+
3
+ // ── Player reference resolution (pure function) ────────────────────────────
4
+ // Maps arena.toml player names → platform identifiers.
5
+ // AgentAdapter creation is the IO layer's job (T4), not ours.
6
+
7
/** A side from arena.toml paired with the platform its player reference resolves to. */
export interface ResolvedSide {
  /** The side entry, unchanged. */
  side: Side
  /** Resolved platform identifier, intended for useAgent(). */
  platform: string
  /** The player reference exactly as written in arena.toml. */
  playerName: string
}
12
+
13
/**
 * Built-in player registry: player names that map directly to useAgent platforms.
 * Kept in a Map so lookups never fall through to Object.prototype members
 * such as "constructor" or "__proto__".
 */
const BUILTIN_PLAYERS: ReadonlyMap<string, string> = new Map([
  ['claude', 'claude'],
  ['claude-code', 'claude'],
  ['kimi', 'kimi'],
  ['cursor', 'cursor'],
  ['gemini', 'gemini'],
])

/**
 * Resolve a player reference to its platform identifier.
 * - The reference is trimmed and lower-cased before lookup
 * - Built-in names (claude, claude-code, kimi, cursor, gemini) map to their platform
 * - Unknown names are passed through in normalized form (assumed to be useAgent-compatible)
 * - Future: custom player.toml files will override built-in mappings
 *
 * @param name Player reference as written in arena.toml.
 * @returns The platform identifier; always a string.
 */
export function resolvePlayer(name: string): string {
  const normalized = name.trim().toLowerCase()
  return BUILTIN_PLAYERS.get(normalized) ?? normalized
}
32
+
33
/**
 * Pair every side of an arena.toml with its resolved platform.
 * Pure function: performs no IO and creates no agents.
 *
 * @param toml Parsed arena.toml document.
 * @returns One entry per side, in declaration order.
 */
export function resolveSides(toml: ArenaToml): ResolvedSide[] {
  const resolved: ResolvedSide[] = []
  for (const side of toml.side) {
    const playerName = side.player
    resolved.push({ side, platform: resolvePlayer(playerName), playerName })
  }
  return resolved
}
44
+
45
+ // ── Side grouping (for per-side aggregation in T3) ─────────────────────────
46
+
47
/** Flat per-side summary used for per-side aggregation. */
export interface SideGroup {
  sideName: string
  /** Player reference as written in arena.toml. */
  player: string
  deck: string
  control: boolean
  /** Number of runs scheduled for the side (the arena's runs_per_side). */
  runs: number
  /** Platform the player reference resolved to. */
  platform: string
}

/**
 * Summarize each side of an arena.toml as a SideGroup for per-side
 * statistical aggregation. Every group carries the same `runs` value,
 * taken from `toml.arena.runs_per_side`.
 */
export function groupBySide(toml: ArenaToml): SideGroup[] {
  const groups: SideGroup[] = []
  for (const { side, playerName, platform } of resolveSides(toml)) {
    groups.push({
      sideName: side.name,
      player: playerName,
      deck: side.deck,
      control: side.control,
      runs: toml.arena.runs_per_side,
      platform,
    })
  }
  return groups
}
67
+
68
/**
 * Total number of runs an arena.toml describes.
 *
 * @returns Number of sides multiplied by `runs_per_side`.
 */
export function totalRuns(toml: ArenaToml): number {
  const { side, arena } = toml
  const sideCount = side.length
  return sideCount * arena.runs_per_side
}
package/src/runner.ts CHANGED
@@ -1,9 +1,14 @@
1
- import { mkdirSync, writeFileSync, existsSync, readFileSync } from 'node:fs'
1
+ import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
2
2
  import { join, resolve } from 'node:path'
3
3
  import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
4
4
  import { useAgent } from '@lythos/test-utils/agents'
5
- import { ArenaManifest, Player, type ArenaManifest as ArenaManifestType } from '@lythos/test-utils/schema'
5
+ import { ArenaManifest, Player } from '@lythos/test-utils/schema'
6
+ import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
6
7
  import { runComparativeJudge } from './comparative-judge'
8
+ import { parseArenaToml, buildExecutionPlan, type ArenaToml } from './arena-toml'
9
+ import { resolvePlayer, resolveSides } from './player'
10
+ import { aggregateAllStats } from './stats'
11
+ import type { SideStats } from './stats'
7
12
 
8
13
  // ── Helpers ───────────────────────────────────────────────────────────────
9
14
 
@@ -12,150 +17,211 @@ function stamp(): string {
12
17
  return `${d.getFullYear()}${String(d.getMonth() + 1).padStart(2, '0')}${String(d.getDate()).padStart(2, '0')}-${String(d.getHours()).padStart(2, '0')}${String(d.getMinutes()).padStart(2, '0')}${String(d.getSeconds()).padStart(2, '0')}`
13
18
  }
14
19
 
15
- function cartesian<T>(arrays: T[][]): T[][] {
16
- if (arrays.length === 0) return [[]]
17
- const [first, ...rest] = arrays
18
- const restProd = cartesian(rest)
19
- return first.flatMap(a => restProd.map(r => [a, ...r]))
20
- }
20
+ // ── Declarative runner (arena.toml → execute) ─────────────────────────────
21
21
 
22
- function slugify(input: string): string {
23
- return input.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '').slice(0, 40)
22
/** Everything a completed (non-dry-run) arena produces. */
export interface ArenaResult {
  /** Final manifest, as written to arena.json. */
  manifest: ArenaManifestType
  /** Raw output of the comparative judge. */
  report: unknown
  /** Aggregated statistics derived from the per-side verdicts. */
  stats: SideStats[]
  /** Directory holding arena.json, report.md and the per-run folders. */
  artifactsDir: string
}
25
28
 
26
- // ── Runner ────────────────────────────────────────────────────────────────
27
-
28
- export async function runArena(opts: {
29
/**
 * Execute (or just plan) an arena described by a parsed arena.toml.
 *
 * With `dryRun` set, only the execution plan is built and returned; nothing
 * is read from or written to disk. Otherwise the function:
 *  1. writes an initial `arena.json` (status "running") into the artifacts dir,
 *  2. runs every plan cell sequentially through `runAgentScenario`, turning a
 *     missing verdict or a thrown exception into an ERROR verdict,
 *  3. aggregates the verdicts per side,
 *  4. hands the first verdict of each side to the comparative judge,
 *  5. writes `report.md` and rewrites `arena.json` with status "completed".
 *
 * @param opts.toml     Parsed arena.toml document.
 * @param opts.taskPath Path to the task file; its first 200 characters become the manifest task.
 * @param opts.outDir   Artifacts directory; defaults to `<cwd>/runs/<arena id>`.
 * @param opts.dryRun   When truthy, return `{ plan }` without executing.
 */
export async function runArenaFromToml(opts: {
  toml: ArenaToml
  taskPath: string
  outDir?: string
  dryRun?: boolean
}): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
  const plan = buildExecutionPlan(opts.toml)

  // Dry-run stops here: the plan is pure data.
  if (opts.dryRun) {
    return { plan }
  }

  const { toml, taskPath, outDir } = opts
  const arenaId = `arena-${stamp()}`
  const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
  const resolved = resolveSides(toml)

  // One participant per distinct side name; a later duplicate replaces the
  // earlier entry while keeping its position.
  const bySideName = new Map<string, (typeof resolved)[number]>()
  for (const entry of resolved) {
    bySideName.set(entry.side.name, entry)
  }

  const manifest = ArenaManifest.parse({
    id: arenaId,
    created_at: new Date().toISOString(),
    task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
    mode: 'decks',
    participants: Array.from(bySideName.values(), entry => ({
      id: entry.side.name,
      name: entry.side.name,
      player: entry.platform,
      deck: entry.side.deck,
      description: `${entry.playerName} × ${entry.side.deck}`,
    })),
    criteria: toml.arena.criteria,
    status: 'running',
  })

  mkdirSync(artifactsDir, { recursive: true })
  writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(manifest, null, 2) + '\n')

  // Verdicts are collected per side, in run order.
  const verdictsBySide = new Map<string, JudgeVerdict[]>()
  const record = (sideName: string, verdict: JudgeVerdict): void => {
    const bucket = verdictsBySide.get(sideName)
    if (bucket) {
      bucket.push(verdict)
    } else {
      verdictsBySide.set(sideName, [verdict])
    }
  }

  for (const cell of plan.cells) {
    const sideDir = join(artifactsDir, 'runs', cell.side)
    mkdirSync(join(sideDir, `run-${cell.run}`), { recursive: true })

    try {
      const agent = useAgent(resolvePlayer(cell.player))
      const outcome = await runAgentScenario({
        scenarioPath: resolve(taskPath),
        agent,
        setupWorkdir(_scenario: AgentScenario, workdir: string) {
          // Every run sees its side's deck as skill-deck.toml.
          mkdirSync(workdir, { recursive: true })
          const deckContent = readFileSync(resolve(cell.deck), 'utf-8')
          writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
        },
        baseDir: sideDir,
      })

      const verdict = (outcome.verdict ?? {
        verdict: 'ERROR' as const,
        reason: 'No verdict returned',
        criteria: [],
      }) as JudgeVerdict
      record(cell.side, verdict)
    } catch (e) {
      // A failing cell must not abort the arena; it is recorded as ERROR.
      record(cell.side, {
        verdict: 'ERROR' as const,
        reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
        criteria: [],
      })
    }
  }

  const stats = aggregateAllStats(verdictsBySide)

  // The comparative judge sees one verdict per side: the first run's.
  const flatVerdicts: { participantId: string; verdict: unknown }[] = [...verdictsBySide]
    .filter(([, runs]) => runs.length > 0)
    .map(([sideName, runs]) => ({ participantId: sideName, verdict: runs[0] }))

  const judge = useAgent(resolved[0]?.platform ?? 'claude')
  const report = await runComparativeJudge({
    manifest,
    verdicts: flatVerdicts,
    judge,
    workdir: artifactsDir,
  })

  writeReport(artifactsDir, manifest, report, stats)

  // Mark the arena as finished on disk.
  const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
  writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(finalManifest, null, 2) + '\n')

  return { manifest: finalManifest, report, stats, artifactsDir }
}
131
135
 
132
- ## Pareto Frontier
133
- ${renderPareto(report)}
136
+ // ── Backward compat: CLI-flag style runner ─────────────────────────────────
134
137
 
135
- ## Key Findings
136
- ${(report.key_findings ?? []).map((f: string) => `- ${f}`).join('\n')}
138
/**
 * CLI-flag style entry point, kept for backward compatibility.
 *
 * Converts the flag-style options into an in-memory ArenaToml (one side per
 * player × deck combination, named `run-01`, `run-02`, ... in player-major
 * order, one run each) and delegates to `runArenaFromToml`.
 *
 * @param opts.taskPath    Path to the task file; its first 200 characters become the arena task text.
 * @param opts.playerPaths Player config files. Each is read once and parsed as JSON.
 *                         NOTE(review): the CLI help calls these "player.toml paths" — confirm the expected format.
 * @param opts.deckPaths   Deck files, one side per (player, deck) pair.
 * @param opts.criteria    Evaluation criteria passed through to the arena.
 * @param opts.outDir      Artifacts directory.
 * @returns The final manifest, the judge report and the artifacts directory.
 * @throws If the task file or a player file cannot be read or parsed, or if
 *         the delegate unexpectedly returns a dry-run plan.
 */
export async function runArena(opts: {
  taskPath: string
  playerPaths: string[]
  deckPaths: string[]
  criteria: string[]
  outDir: string
}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
  const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts

  const taskText = readFileSync(resolve(taskPath), 'utf-8').slice(0, 200)

  // Resolve each player's platform once, not once per deck.
  const platforms = playerPaths.map(
    playerPath => Player.parse(JSON.parse(readFileSync(resolve(playerPath), 'utf-8'))).platform,
  )

  // Convert CLI flags to ArenaToml internally. This object is not run through
  // the schema, so the fields the schema would default are spelled out here.
  const toml: ArenaToml = {
    arena: {
      task: taskText,
      criteria,
      runs_per_side: 1,
      max_participants: Math.min(playerPaths.length, deckPaths.length),
    },
    side: platforms.flatMap((platform, pi) =>
      deckPaths.map((deckPath, di) => ({
        name: `run-${String(pi * deckPaths.length + di + 1).padStart(2, '0')}`,
        player: platform,
        deck: deckPath,
        control: false,
        env: { pre_run: [], env_vars: {} },
      })),
    ),
  }

  const result = await runArenaFromToml({ toml, taskPath, outDir })
  // dryRun is never requested here, so a plan-only result is a programming error.
  if ('plan' in result) {
    throw new Error('runArena: expected an executed arena but received a dry-run plan')
  }

  const { manifest, report, artifactsDir } = result
  return { manifest, report, artifactsDir }
}
145
168
 
146
- return { manifest: finalManifest, report, artifactsDir }
169
+ // ── Report renderer ────────────────────────────────────────────────────────
170
+
171
/**
 * Render the arena report as Markdown and write it to `<dir>/report.md`.
 *
 * Sections, in order: title and metadata, Score Matrix, Per-Side Statistics,
 * Pareto Frontier, Key Findings, Recommendations. Missing `key_findings` or
 * `recommendations` produce empty sections.
 *
 * @param dir      Directory that receives report.md.
 * @param manifest Supplies the id, task and criteria shown in the header.
 * @param report   Comparative-judge output.
 * @param stats    Per-side statistics for the statistics table.
 */
function writeReport(dir: string, manifest: ArenaManifestType, report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]; pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[]; key_findings?: string[]; recommendations?: { audience: string; recommendation: string }[] }, stats: SideStats[]): void {
  // A section is its heading, a blank line, then its body lines.
  const section = (title: string, ...body: string[]): string[] => [`## ${title}`, '', ...body]

  const header = [
    `# Arena Report: ${manifest.id}`,
    '',
    `**Task**: ${manifest.task}`,
    `**Criteria**: ${manifest.criteria.join(', ')}`,
    `**Date**: ${new Date().toISOString()}`,
  ]

  const findings = (report.key_findings ?? []).map((f: string) => `- ${f}`)
  const recommendations = (report.recommendations ?? []).map(
    (r: { audience: string; recommendation: string }) => `- **${r.audience}**: ${r.recommendation}`,
  )

  const lines: string[] = [
    ...header,
    '',
    ...section('Score Matrix', renderScoreMatrix(report)),
    '',
    ...section('Per-Side Statistics', renderStatsTable(stats)),
    '',
    ...section('Pareto Frontier', renderPareto(report)),
    '',
    ...section('Key Findings', ...findings),
    '',
    ...section('Recommendations', ...recommendations),
  ]

  writeFileSync(join(dir, 'report.md'), lines.join('\n') + '\n')
}
148
202
 
149
- // ── Markdown Renderers ────────────────────────────────────────────────────
203
/**
 * Render per-side statistics as a Markdown table.
 *
 * `passRate` and each criterion `mean` are multiplied by 100 and shown as
 * whole percentages; `meanConfidence` is printed as given (rounded to a whole
 * number) with a `%` suffix, or `-` when it is null/undefined.
 *
 * @returns The table (every row newline-terminated), or a placeholder line when `stats` is empty.
 */
function renderStatsTable(stats: SideStats[]): string {
  if (stats.length === 0) return 'No statistics available.\n'

  const asPercent = (fraction: number): string => `${(fraction * 100).toFixed(0)}%`

  const header =
    `| Side | Runs | Pass Rate | Mean Confidence | Criteria |\n` +
    `|------|------|-----------|-----------------|----------|\n`

  const rows = stats.map(side => {
    const confidence = side.meanConfidence != null ? `${side.meanConfidence.toFixed(0)}%` : '-'
    const criteria = side.criteria.map(c => `${c.name}: ${asPercent(c.mean)}`).join(', ')
    return `| ${side.sideName} | ${side.runs} | ${asPercent(side.passRate)} | ${confidence} | ${criteria} |\n`
  })

  return header + rows.join('')
}
150
217
 
151
218
  function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[] }): string {
152
219
  if (!report.score_matrix?.length) return 'No scores available.\n'
153
220
 
154
- // Build participant × criterion matrix
155
221
  const participants = [...new Set(report.score_matrix.map(s => s.participant_id))]
156
222
  const criteria = [...new Set(report.score_matrix.map(s => s.criterion))]
157
223
 
158
- let table = `| Criterion | Weight | ${participants.map(p => `${p}`).join(' | ')} |\n`
224
+ let table = `| Criterion | Weight | ${participants.join(' | ')} |\n`
159
225
  table += `|${'---|'.repeat(2 + participants.length)}\n`
160
226
 
161
227
  for (const c of criteria) {
@@ -165,7 +231,6 @@ function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id:
165
231
  }).join(' | ')} |\n`
166
232
  }
167
233
 
168
- // Weighted totals
169
234
  table += `| **Weighted Total** | 100% | ${participants.map(p => {
170
235
  const pScores = report.score_matrix!.filter(s => s.participant_id === p)
171
236
  const avg = pScores.length ? pScores.reduce((sum, s) => sum + s.score, 0) / pScores.length : 0
@@ -177,11 +242,9 @@ function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id:
177
242
 
178
243
  function renderPareto(report: unknown & { pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[] }): string {
179
244
  if (!report.pareto?.length) return 'No Pareto analysis.\n'
180
-
181
- return report.pareto.map((p: { participant_id: string; dominated: boolean; dominated_by: string[] }) => {
182
- if (p.dominated) {
183
- return `- **${p.participant_id}**: dominated by ${p.dominated_by.join(', ')}`
184
- }
185
- return `- **${p.participant_id}**: Pareto-optimal (non-dominated)`
186
- }).join('\n')
245
+ return report.pareto.map(p =>
246
+ p.dominated
247
+ ? `- **${p.participant_id}**: dominated by ${p.dominated_by.join(', ')}`
248
+ : `- **${p.participant_id}**: Pareto-optimal (non-dominated)`
249
+ ).join('\n')
187
250
  }
@@ -0,0 +1,111 @@
1
+ import { describe, test, expect } from 'bun:test'
2
+ import { aggregateSideStats, aggregateAllStats } from './stats'
3
+ import type { JudgeVerdict } from '@lythos/test-utils/schema'
4
+
5
/**
 * Build a verdict fixture for the tests below.
 *
 * The default is a passing verdict with a single passed `correctness`
 * criterion; any field present in `overrides` replaces the default.
 */
function makeVerdict(overrides?: Partial<JudgeVerdict>): JudgeVerdict {
  const passing: JudgeVerdict = {
    verdict: 'PASS',
    reason: 'OK',
    criteria: [{ name: 'correctness', passed: true }],
  }
  return { ...passing, ...overrides }
}
13
+
14
+ // ── aggregateSideStats ─────────────────────────────────────────────────────
15
+
16
// Unit tests for aggregateSideStats: outcome rates, confidence statistics,
// per-criterion pass rates and per-criterion score statistics.
describe('aggregateSideStats', () => {
  test('single run: passRate=1, no variance', () => {
    const stats = aggregateSideStats('test', [makeVerdict()])
    expect(stats.sideName).toBe('test')
    expect(stats.runs).toBe(1)
    expect(stats.passRate).toBe(1)
    expect(stats.failRate).toBe(0)
    expect(stats.errorRate).toBe(0)
    // "no variance": one passing observation has zero spread, and the default
    // fixture carries no confidence to compute a variance from.
    expect(stats.criteria[0].variance).toBe(0)
    expect(stats.confidenceVariance).toBeNull()
  })

  test('3 runs: 2 PASS, 1 FAIL', () => {
    const verdicts = [
      makeVerdict(),
      makeVerdict(),
      makeVerdict({ verdict: 'FAIL', reason: 'bad' }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.runs).toBe(3)
    expect(stats.passRate).toBeCloseTo(2 / 3)
    expect(stats.failRate).toBeCloseTo(1 / 3)
    expect(stats.errorRate).toBe(0)
  })

  test('confidence: mean across runs', () => {
    const verdicts = [
      makeVerdict({ confidence: 90 }),
      makeVerdict({ confidence: 80 }),
      makeVerdict({ confidence: 70 }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.meanConfidence).toBeCloseTo(80)
    // Sample variance: (10² + 0² + 10²) / (3 - 1) = 100
    expect(stats.confidenceVariance).toBeCloseTo(100)
  })

  test('confidence: null when no verdict has it', () => {
    const stats = aggregateSideStats('test', [makeVerdict(), makeVerdict()])
    expect(stats.meanConfidence).toBeNull()
    expect(stats.confidenceVariance).toBeNull()
  })

  test('per-criterion pass rate', () => {
    const verdicts = [
      makeVerdict({ criteria: [{ name: 'accuracy', passed: true }] }),
      makeVerdict({ criteria: [{ name: 'accuracy', passed: false }] }),
      makeVerdict({ criteria: [{ name: 'accuracy', passed: true }] }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.criteria).toHaveLength(1)
    expect(stats.criteria[0].name).toBe('accuracy')
    expect(stats.criteria[0].mean).toBeCloseTo(2 / 3)
    expect(stats.criteria[0].count).toBe(3)
  })

  test('per-criterion scores: mean and variance', () => {
    const verdicts = [
      makeVerdict({ scores: { coverage: 5, relevance: 4 } }),
      makeVerdict({ scores: { coverage: 3, relevance: 4 } }),
      makeVerdict({ scores: { coverage: 4, relevance: 4 } }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.scoreByCriterion.coverage.mean).toBeCloseTo(4)
    // Sample variance of 5, 3, 4: (1 + 1 + 0) / 2 = 1
    expect(stats.scoreByCriterion.coverage.variance).toBeCloseTo(1)
    expect(stats.scoreByCriterion.relevance.mean).toBeCloseTo(4)
    expect(stats.scoreByCriterion.relevance.variance).toBe(0) // all 4s
  })

  test('zero runs: all zeros', () => {
    const stats = aggregateSideStats('empty', [])
    expect(stats.runs).toBe(0)
    expect(stats.passRate).toBe(0)
    expect(stats.failRate).toBe(0)
    expect(stats.errorRate).toBe(0)
    expect(stats.meanConfidence).toBeNull()
    expect(stats.confidenceVariance).toBeNull()
    expect(stats.criteria).toEqual([])
    expect(stats.scoreByCriterion).toEqual({})
  })

  test('handles ERROR verdicts correctly', () => {
    const verdicts = [
      makeVerdict(),
      makeVerdict({ verdict: 'ERROR', reason: 'parse failed' }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.passRate).toBe(0.5)
    expect(stats.failRate).toBe(0)
    expect(stats.errorRate).toBe(0.5)
  })
})
95
+
96
+ // ── aggregateAllStats ──────────────────────────────────────────────────────
97
+
98
// aggregateAllStats should yield one entry per side, in map insertion order.
describe('aggregateAllStats', () => {
  test('aggregates multiple sides', () => {
    const bySide = new Map<string, JudgeVerdict[]>([
      ['side-a', [makeVerdict(), makeVerdict()]],
      ['side-b', [makeVerdict({ verdict: 'FAIL', reason: 'nope' })]],
    ])

    const stats = aggregateAllStats(bySide)
    expect(stats).toHaveLength(2)

    const [first, second] = stats
    expect(first.sideName).toBe('side-a')
    expect(first.passRate).toBe(1)
    expect(second.sideName).toBe('side-b')
    expect(second.passRate).toBe(0)
  })
})
package/src/stats.ts ADDED
@@ -0,0 +1,117 @@
1
+ import type { JudgeVerdict } from '@lythos/test-utils/schema'
2
+
3
+ // ── Statistical aggregation for runs_per_side ─────────────────────────────
4
+ // All pure functions. Input: N verdicts from N runs. Output: aggregated stats.
5
+
6
/** Summary statistics for a single named criterion across the runs of one side. */
export interface CriterionStats {
  /** Criterion name as reported in the verdicts. */
  name: string
  /** Average value across the runs that reported the criterion. */
  mean: number
  /** Spread of the value across those runs. */
  variance: number
  /** Low end of the criterion's values. */
  min: number
  /** High end of the criterion's values. */
  max: number
  /** Number of runs that reported this criterion. */
  count: number
}
14
+
15
/** Aggregated outcome of every run executed for one side. */
export interface SideStats {
  /** Name of the side these statistics belong to. */
  sideName: string
  /** Total number of runs (verdicts) aggregated. */
  runs: number
  /** PASS verdicts divided by total runs. */
  passRate: number
  /** FAIL verdicts divided by total runs. */
  failRate: number
  /** ERROR verdicts divided by total runs. */
  errorRate: number
  /** Mean confidence, or null if no verdict had a confidence. */
  meanConfidence: number | null
  /** Variance of the confidences, or null when it is not available. */
  confidenceVariance: number | null
  /** Per-criterion statistics. */
  criteria: CriterionStats[]
  /** Mean and variance of the numeric scores, keyed by criterion name. */
  scoreByCriterion: Record<string, { mean: number; variance: number }>
}
26
+
27
+ // ── Helpers ────────────────────────────────────────────────────────────────
28
+
29
/** Arithmetic mean of `values`; an empty list yields 0 instead of NaN. */
function mean(values: number[]): number {
  const n = values.length
  if (n === 0) return 0

  let total = 0
  for (const v of values) total += v
  return total / n
}
33
+
34
/**
 * Sample variance of `values` (sum of squared deviations divided by n - 1).
 *
 * @param values Observations; fewer than two yields 0.
 * @param m Optional precomputed mean, used as the centre when provided;
 *          otherwise the mean is computed from `values`.
 */
function variance(values: number[], m?: number): number {
  const n = values.length
  if (n < 2) return 0

  const centre = m ?? mean(values)
  let squaredDeviations = 0
  for (const v of values) squaredDeviations += (v - centre) ** 2
  return squaredDeviations / (n - 1)
}
39
+
40
+ // ── Aggregator ────────────────────────────────────────────────────────────
41
+
42
/**
 * Aggregate the judge verdicts from repeated runs of one side.
 *
 * - `passRate` / `failRate` / `errorRate` are the share of verdicts whose
 *   `verdict` is 'PASS' / 'FAIL' / 'ERROR'; all three are 0 when there are no runs.
 * - `meanConfidence` is null when no verdict carried a confidence, and
 *   `confidenceVariance` is null unless at least two did.
 * - `criteria` treats each `verdict.criteria` entry as a 0/1 outcome: `mean`
 *   is the pass rate, `variance` is the Bernoulli variance p(1 - p), and
 *   `min` / `max` are the observed extremes (0 = failed, 1 = passed).
 * - `scoreByCriterion` holds the mean and variance of `verdict.scores` per key.
 *
 * @param sideName Label copied into the result.
 * @param verdicts One verdict per run; may be empty.
 * @returns Aggregated statistics for the side.
 */
export function aggregateSideStats(sideName: string, verdicts: JudgeVerdict[]): SideStats {
  const runs = verdicts.length

  // Tally the three outcomes in a single pass over the verdicts.
  let passCount = 0
  let failCount = 0
  let errorCount = 0
  for (const v of verdicts) {
    if (v.verdict === 'PASS') passCount++
    else if (v.verdict === 'FAIL') failCount++
    else if (v.verdict === 'ERROR') errorCount++
  }

  // Confidence: only verdicts that actually carry one take part.
  const confidences = verdicts.map(v => v.confidence).filter((c): c is number => c != null)
  const meanConf = confidences.length > 0 ? mean(confidences) : null
  const confVar = meanConf !== null && confidences.length > 1 ? variance(confidences, meanConf) : null

  // Per-criterion pass/fail outcomes from verdict.criteria, in first-seen order.
  const outcomesByCriterion = new Map<string, boolean[]>()
  for (const v of verdicts) {
    for (const c of v.criteria ?? []) {
      const outcomes = outcomesByCriterion.get(c.name)
      if (outcomes) outcomes.push(c.passed)
      else outcomesByCriterion.set(c.name, [c.passed])
    }
  }

  const criteria: CriterionStats[] = []
  for (const [name, outcomes] of outcomesByCriterion) {
    // `outcomes` is never empty: an entry is only created together with its first value.
    const passed = outcomes.filter(Boolean).length
    const passRate = passed / outcomes.length
    criteria.push({
      name,
      mean: passRate, // for criteria, "mean" = pass rate across runs
      variance: passRate * (1 - passRate), // Bernoulli variance
      min: passed === outcomes.length ? 1 : 0, // 0 as soon as any run failed
      max: passed > 0 ? 1 : 0, // 1 as soon as any run passed
      count: outcomes.length,
    })
  }

  // Per-criterion numeric scores from verdict.scores.
  const scoreMap = new Map<string, number[]>()
  for (const v of verdicts) {
    for (const [criterion, score] of Object.entries(v.scores ?? {})) {
      const scores = scoreMap.get(criterion)
      if (scores) scores.push(score)
      else scoreMap.set(criterion, [score])
    }
  }

  const scoreByCriterion: Record<string, { mean: number; variance: number }> = {}
  for (const [criterion, scores] of scoreMap) {
    const m = mean(scores)
    // variance() already yields 0 for fewer than two observations.
    scoreByCriterion[criterion] = { mean: m, variance: variance(scores, m) }
  }

  return {
    sideName,
    runs,
    passRate: runs > 0 ? passCount / runs : 0,
    failRate: runs > 0 ? failCount / runs : 0,
    errorRate: runs > 0 ? errorCount / runs : 0,
    meanConfidence: meanConf,
    confidenceVariance: confVar,
    criteria,
    scoreByCriterion,
  }
}
107
+
108
/**
 * Aggregate every side in one call.
 *
 * @param verdictsBySide Map of sideName → verdicts[] (one verdict per run).
 * @returns One SideStats per map entry, in the map's iteration order.
 */
export function aggregateAllStats(
  verdictsBySide: Map<string, JudgeVerdict[]>
): SideStats[] {
  return Array.from(verdictsBySide, ([sideName, verdicts]) => aggregateSideStats(sideName, verdicts))
}