npm - @lythos/skill-arena - Versions diffs - 0.9.2 → 0.9.6 - Mend

@lythos/skill-arena 0.9.2 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/README.md CHANGED Viewed

@@ -1,6 +1,8 @@
 # @lythos/skill-arena
-> Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis.
+![CI](https://img.shields.io/badge/CI-41%20unit%20tests-brightgreen) ![Intent/Plan](https://img.shields.io/badge/arch-intent%2Fplan%2Fexecute-8A2BE2)
+> Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis. Now with declarative `arena.toml` (k8s-manifest style) and deterministic Pareto frontier.
 ## Why
@@ -40,25 +42,36 @@ bunx @lythos/skill-arena viz tmp/arena-<id>/
 ## Commands
+### Declarative mode (k8s-style, recommended)
+```bash
+# Print execution plan without running
+bunx @lythos/skill-arena run --config arena.toml --dry-run
+# Execute every side `runs_per_side` times, then aggregate the results statistically
+bunx @lythos/skill-arena run --config arena.toml
+```
+### CLI-flag mode (backward compat)
+```
+bunx @lythos/skill-arena run \
+  --task ./TASK-arena.md \
+  --players ./players/claude.toml \
+  --decks ./decks/run-01.toml,./decks/run-02.toml \
+  --criteria coverage,relevance,actionability,depth
 ```
-Usage: bunx @lythos/skill-arena <options> | bunx @lythos/skill-arena viz <dir>
-Mode 1 — Single-Skill Comparison:
-  --task, -t <desc>       Task description (required)
-  --skills, -s <list>     Comma-separated skills, 2–5 (Mode 1)
-  --criteria, -c <list>   Evaluation dimensions (default: syntax,context,logic,token)
-  --control <skill>      Control skill (default: lythoskill-project-scribe)
+### Scaffold mode (legacy, manual execution)
-Mode 2 — Full-Deck Comparison:
-  --decks <paths>        Comma-separated deck toml paths, 2–5 (Mode 2)
-  --criteria, -c <list>   Evaluation dimensions
+```
+bunx @lythos/skill-arena scaffold --task "..." --skills a,b
+```
-Common:
-  --dir, -d <path>       Arena parent directory (default: tmp)
-  --project, -p <path>   Project root (default: .)
+### Viz
-Viz:
-  viz <dir>               Render ASCII charts from report.md
+```bash
+bunx @lythos/skill-arena viz runs/arena-<id>/
 ```
 ## Skill Documentation
@@ -77,6 +90,31 @@ Skill   (packages/<name>/skill/)     → build → SKILL.md + thin scripts
 Output  (skills/<name>/)             → git commit → agent-visible skill
 ```
+### Runtime architecture (intent/plan/execute)
+```
+arena.toml  →  ArenaToml (Zod)  →  ExecutionPlan (pure)  →  per-cell agent spawn (IO)
+                                    ↓
+                aggregateAllStats (pure)  ←  verdicts[]
+                                    ↓
+                runComparativeJudge (IO)  →  report.md + Pareto frontier
+```
+- **Intent**: `arena.toml` declarative config (k8s-manifest style)
+- **Plan**: `buildExecutionPlan()`, `aggregateSideStats()`, `computePareto()` — pure functions
+- **Execute**: `runAgentScenario` per cell, `runComparativeJudge` — IO via `AgentAdapter`
+Built on `@lythos/test-utils` shared infrastructure.
+## Test Coverage
+| Layer | Count | CI | Notes |
+|-------|-------|----|-------|
+| Unit tests | 41 | ✅ | TOML parser, player resolution, Pareto, stats |
+| Agent BDD | — | ❌ | Requires `claude` CLI; run locally |
+The Pareto frontier is computed by a **deterministic algorithm** — it is never delegated to an LLM. 8 unit tests cover dominance, cross-dominance, transitive chains, partial criteria, and empty scores.
 ## License
 MIT

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lythos/skill-arena",
-  "version": "0.9.2",
+  "version": "0.9.6",
   "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
   "keywords": [
     "ai-agent",

package/src/arena-toml.test.ts ADDED Viewed

@@ -0,0 +1,191 @@
+import { describe, test, expect } from 'bun:test'
+import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
+import { formatPlanOutput } from './runner'
+const minimalToml = `
+[arena]
+task = "Test task"
+criteria = ["a", "b"]
+[[side]]
+name = "runner-a"
+player = "claude-code"
+deck = "./decks/a.toml"
+[[side]]
+name = "runner-b"
+player = "claude-code"
+deck = "./decks/b.toml"
+`
+const fullToml = `
+[arena]
+task = "Generate auth flow diagram"
+criteria = ["syntax", "context", "logic", "token"]
+runs_per_side = 3
+[[side]]
+name = "minimal"
+player = "standard-coder"
+deck = "./decks/minimal.toml"
+[[side]]
+name = "rich"
+player = "expert-architect"
+deck = "./decks/rich.toml"
+[[side]]
+name = "baseline"
+player = "standard-coder"
+deck = "./decks/baseline.toml"
+control = true
+[side.env]
+container = "node:20-alpine"
+pre_run = ["npm ci", "npm run build"]
+working_dir = "/workspace"
+`
+// ── Schema + Parser ────────────────────────────────────────────────────────
+describe('parseArenaToml', () => {
+  test('parses minimal two-side arena', () => {
+    const result = parseArenaToml(minimalToml)
+    expect(result.arena.task).toBe('Test task')
+    expect(result.arena.criteria).toEqual(['a', 'b'])
+    expect(result.arena.runs_per_side).toBe(1)       // default
+    expect(result.side).toHaveLength(2)
+    expect(result.side[0].name).toBe('runner-a')
+    expect(result.side[0].player).toBe('claude-code')
+    expect(result.side[0].deck).toBe('./decks/a.toml')
+    expect(result.side[0].control).toBe(false)         // default
+  })
+  test('parses full arena with runs_per_side and control', () => {
+    const result = parseArenaToml(fullToml)
+    expect(result.arena.runs_per_side).toBe(3)
+    expect(result.side).toHaveLength(3)
+    expect(result.side[2].name).toBe('baseline')
+    expect(result.side[2].control).toBe(true)
+  })
+  test('parses side env block', () => {
+    const result = parseArenaToml(fullToml)
+    const env = result.side[2].env
+    expect(env.container).toBe('node:20-alpine')
+    expect(env.pre_run).toEqual(['npm ci', 'npm run build'])
+    expect(env.working_dir).toBe('/workspace')
+    expect(env.env_vars).toEqual({})
+  })
+  test('rejects fewer than 2 sides', () => {
+    const bad = `[arena]\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "only"\nplayer = "c"\ndeck = "x.toml"`
+    expect(() => parseArenaToml(bad)).toThrow()
+  })
+  test('rejects empty criteria', () => {
+    const bad = `[arena]\ntask = "x"\ncriteria = []\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
+    expect(() => parseArenaToml(bad)).toThrow()
+  })
+  test('rejects non-object input', () => {
+    expect(() => ArenaToml.parse('not valid')).toThrow()
+  })
+  test('rejects missing arena section', () => {
+    expect(() => parseArenaToml('[[side]]\nname = "a"')).toThrow()
+  })
+  test('rejects runs_per_side = 0', () => {
+    const bad = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 0\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
+    expect(() => parseArenaToml(bad)).toThrow()
+  })
+  test('parses integer and boolean values correctly', () => {
+    // Side "b" sets control explicitly so the boolean branch of the parser is exercised too.
+    const toml = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 2\nmax_participants = 5\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"\ncontrol = true`
+    const result = parseArenaToml(toml)
+    // Integers arrive as numbers, not numeric strings
+    expect(result.arena.runs_per_side).toBe(2)
+    expect(result.arena.max_participants).toBe(5)
+    // Booleans arrive as booleans; an omitted flag falls back to the schema default
+    expect(result.side[0].control).toBe(false)
+    expect(result.side[1].control).toBe(true)
+  })
+  test('comments are stripped', () => {
+    // Covers both a full-line comment and a comment trailing a key/value pair.
+    const toml = `[arena]\n# this is a comment\ntask = "x"  # trailing comment\ncriteria = ["a"]\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
+    const result = parseArenaToml(toml)
+    expect(result.arena.task).toBe('x')
+    expect(result.arena.criteria).toEqual(['a'])
+  })
+})
+// ── Execution Plan ─────────────────────────────────────────────────────────
+describe('buildExecutionPlan', () => {
+  test('generates plan: 2 sides × 1 run = 2 cells', () => {
+    const toml = parseArenaToml(minimalToml)
+    const plan = buildExecutionPlan(toml)
+    expect(plan.task).toBe('Test task')
+    expect(plan.criteria).toEqual(['a', 'b'])
+    expect(plan.cells).toHaveLength(2)
+    expect(plan.total_runs).toBe(2)
+    expect(plan.cells[0]).toEqual({ side: 'runner-a', player: 'claude-code', deck: './decks/a.toml', run: 1, control: false })
+    expect(plan.cells[1]).toEqual({ side: 'runner-b', player: 'claude-code', deck: './decks/b.toml', run: 1, control: false })
+  })
+  test('generates plan: 3 sides × 3 runs = 9 cells', () => {
+    const toml = parseArenaToml(fullToml)
+    const plan = buildExecutionPlan(toml)
+    expect(plan.cells).toHaveLength(9)
+    expect(plan.total_runs).toBe(9)
+    // Cells are ordered: side 0 run 1, side 0 run 2, side 0 run 3, side 1 run 1, ...
+    expect(plan.cells[0]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 1, control: false })
+    expect(plan.cells[1]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 2, control: false })
+    expect(plan.cells[2]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 3, control: false })
+    expect(plan.cells[3]).toEqual({ side: 'rich', player: 'expert-architect', deck: './decks/rich.toml', run: 1, control: false })
+    expect(plan.cells[8]).toEqual({ side: 'baseline', player: 'standard-coder', deck: './decks/baseline.toml', run: 3, control: true })
+  })
+  test('control flag preserved in plan cells', () => {
+    const toml = parseArenaToml(fullToml)
+    const plan = buildExecutionPlan(toml)
+    const baselineCells = plan.cells.filter(c => c.side === 'baseline')
+    expect(baselineCells).toHaveLength(3)
+    expect(baselineCells.every(c => c.control)).toBe(true)
+  })
+  test('dry-run output format matches expected log', () => {
+    const toml = parseArenaToml(minimalToml)
+    const plan = buildExecutionPlan(toml)
+    // Simulate what --dry-run would log
+    const logs: string[] = []
+    for (const line of formatPlanOutput(plan)) {
+      logs.push(line)
+    }
+    expect(logs.some(l => l.includes('2 cells'))).toBe(true)
+    expect(logs.some(l => l.includes('runner-a'))).toBe(true)
+    expect(logs.some(l => l.includes('runner-b'))).toBe(true)
+    expect(logs.some(l => l.includes('claude-code'))).toBe(true)
+    expect(logs.every(l => !l.includes('control'))).toBe(true) // no control flags in minimal
+  })
+  test('dry-run output shows control flag for control sides', () => {
+    const toml = parseArenaToml(fullToml)
+    const plan = buildExecutionPlan(toml)
+    const lines = formatPlanOutput(plan)
+    const baselineLines = lines.filter(l => l.includes('baseline'))
+    // All baseline cells should have [control] flag
+    expect(baselineLines.every(l => l.includes('[control]'))).toBe(true)
+  })
+  test('dry-run: plan is pure data, no side effects', () => {
+    // The entire plan generation is a pure function — dry-run is just printing it
+    const toml = parseArenaToml(fullToml)
+    const before = JSON.parse(JSON.stringify(toml))
+    const plan = buildExecutionPlan(toml)
+    // Purity: the input is left untouched and a second call yields an equal plan
+    expect(toml).toEqual(before)
+    expect(buildExecutionPlan(toml)).toEqual(plan)
+    // Verify plan is self-describing for a --dry-run output
+    expect(plan.total_runs).toBeGreaterThan(0)
+    expect(plan.cells.every(c => typeof c.side === 'string')).toBe(true)
+    expect(plan.cells.every(c => typeof c.player === 'string')).toBe(true)
+    expect(plan.cells.every(c => typeof c.deck === 'string')).toBe(true)
+    expect(plan.cells.every(c => typeof c.run === 'number')).toBe(true)
+  })
+})

package/src/arena-toml.ts ADDED Viewed

@@ -0,0 +1,172 @@
+import { z } from 'zod'
+import type { ArenaManifest } from '@lythos/test-utils/schema'
+// ── arena.toml Zod schema (declarative input, k8s-manifest style) ──────────
+// Anchored on: ADR-20260502110308316
+/**
+ * Schema for a side's `env` table (`[side.env]` in arena.toml).
+ * `container` and `working_dir` may be omitted; `pre_run` defaults to an empty
+ * list and `env_vars` to an empty string-to-string map.
+ */
+export const SideEnv = z.object({
+  container: z.string().optional(),
+  pre_run: z.array(z.string()).default([]),
+  working_dir: z.string().optional(),
+  env_vars: z.record(z.string()).default({}),
+})
+/** Parsed shape produced by the {@link SideEnv} schema. */
+export type SideEnv = z.infer<typeof SideEnv>
+/**
+ * Schema for one `[[side]]` entry of arena.toml.
+ * `name`, `player` and `deck` are required strings; `control` defaults to
+ * `false` and `env` defaults to an empty {@link SideEnv}.
+ */
+export const Side = z.object({
+  name: z.string(),
+  player: z.string(),              // reference to player config (useAgent resolves)
+  deck: z.string(),                // path to deck.toml
+  control: z.boolean().default(false),
+  env: SideEnv.default({}),
+})
+/** Parsed shape produced by the {@link Side} schema. */
+export type Side = z.infer<typeof Side>
+/**
+ * Root schema for arena.toml: one `[arena]` table plus two to five `[[side]]` entries.
+ * - `criteria` needs at least one entry
+ * - `runs_per_side` is a positive integer, default 1
+ * - `max_participants` is an integer from 2 to 5, default 5
+ *
+ * NOTE(review): `max_participants` is validated on its own but is not compared
+ * with the number of sides in this schema; the side limit is the fixed `.max(5)`.
+ */
+export const ArenaToml = z.object({
+  arena: z.object({
+    task: z.string(),              // task description or path to TASK-arena.md
+    criteria: z.array(z.string()).min(1),
+    runs_per_side: z.number().int().positive().default(1),
+    max_participants: z.number().int().min(2).max(5).default(5),
+  }),
+  side: z.array(Side).min(2).max(5),
+})
+/** Parsed shape produced by the {@link ArenaToml} schema. */
+export type ArenaToml = z.infer<typeof ArenaToml>
+// ── Parser ─────────────────────────────────────────────────────────────────
+/**
+ * Turn arena.toml source text into a validated {@link ArenaToml} value.
+ * The text is read by the in-file `parseToml` subset parser (no external TOML
+ * dependency) and the resulting tree is checked with `ArenaToml.parse`, which
+ * throws when the content does not satisfy the schema.
+ */
+export function parseArenaToml(content: string): ArenaToml {
+  return ArenaToml.parse(parseToml(content))
+}
+// ── Plan generation (pure function, dry-run visible) ───────────────────────
+/** One unit of work in an execution plan: a single run of a single side. */
+export interface ExecutionCell {
+  side: string                     // side name
+  player: string                   // player reference
+  deck: string                     // deck path
+  run: number                      // 1-indexed run number
+  control: boolean                 // copied from the side's `control` flag
+}
+/** Everything needed to describe an arena run before any cell is executed. */
+export interface ExecutionPlan {
+  task: string                     // taken from `arena.task`
+  criteria: string[]               // taken from `arena.criteria`
+  cells: ExecutionCell[]           // grouped by side, runs in ascending order within each side
+  total_runs: number               // always equal to `cells.length`
+}
+/**
+ * Expand a parsed arena.toml into its execution plan.
+ * Every side contributes `runs_per_side` cells, numbered from 1, and sides keep
+ * the order in which they appear in the file. No IO is performed.
+ */
+export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
+  const { task, criteria, runs_per_side: runsPerSide } = toml.arena
+  const cells = toml.side.flatMap(({ name, player, deck, control }): ExecutionCell[] =>
+    Array.from({ length: runsPerSide }, (_unused, index) => ({
+      side: name,
+      player,
+      deck,
+      run: index + 1,
+      control,
+    })),
+  )
+  return { task, criteria, cells, total_runs: cells.length }
+}
+// ── Minimal TOML parser (handles the arena.toml subset without external dep) ──
+/**
+ * Parse the TOML subset used by arena.toml into a plain object tree.
+ *
+ * Supported syntax:
+ * - `#` comments, full-line or trailing; a `#` inside a quoted string is kept
+ * - `[table]` and `[[array-of-tables]]` headers
+ * - `[parent.child]` headers: attached to the latest `[[parent]]` entry when one
+ *   exists, otherwise to the plain table `parent` (created on demand)
+ * - `key = value` where the value is a quoted string, `true`/`false`, a decimal
+ *   number, or a single-line array of strings
+ *
+ * Quoted values always stay strings, so `"true"`, `"42"` and `"[x]"` are not
+ * coerced. Unquoted text that matches none of the forms above is kept verbatim.
+ *
+ * Not supported: escape sequences, multi-line strings or arrays, inline tables,
+ * dotted keys, nested arrays.
+ *
+ * @param text Raw TOML source.
+ * @returns The root table; each `[[name]]` group is stored under `name` as an array.
+ */
+function parseToml(text: string): Record<string, unknown> {
+  type Table = Record<string, unknown>
+  const result: Table = {}
+  const arrayTables = new Map<string, Table[]>()
+  let currentTable: Table = result
+  // True when `s` is wrapped in one matching pair of single or double quotes.
+  const isQuoted = (s: string): boolean =>
+    s.length >= 2 &&
+    ((s.startsWith('"') && s.endsWith('"')) || (s.startsWith("'") && s.endsWith("'")))
+  // Split on `sep`, ignoring separators that sit inside a quoted string.
+  const splitOutsideQuotes = (input: string, sep: string): string[] => {
+    const parts: string[] = []
+    let quote = ''
+    let start = 0
+    for (let i = 0; i < input.length; i++) {
+      const ch = input[i]
+      if (quote) {
+        if (ch === quote) quote = ''
+      } else if (ch === '"' || ch === "'") {
+        quote = ch
+      } else if (ch === sep) {
+        parts.push(input.slice(start, i))
+        start = i + 1
+      }
+    }
+    parts.push(input.slice(start))
+    return parts
+  }
+  // Convert the text to the right of `=` into a JS value.
+  const parseValue = (raw: string): unknown => {
+    // Decide "string" first so quoted text is never re-interpreted below.
+    if (isQuoted(raw)) return raw.slice(1, -1)
+    if (raw.startsWith('[') && raw.endsWith(']')) {
+      const inner = raw.slice(1, -1).trim()
+      if (!inner) return []
+      const items = splitOutsideQuotes(inner, ',').map(item => item.trim())
+      if (items[items.length - 1] === '') items.pop() // tolerate a trailing comma
+      return items.map(item => (isQuoted(item) ? item.slice(1, -1) : item))
+    }
+    if (raw === 'true') return true
+    if (raw === 'false') return false
+    if (/^-?\d+(\.\d+)?$/.test(raw)) return Number(raw)
+    return raw
+  }
+  for (const rawLine of text.split('\n')) {
+    const line = (splitOutsideQuotes(rawLine, '#')[0] ?? '').trim()
+    if (!line) continue
+    // [[array]]: open a fresh entry and make it the current table
+    const arrayMatch = line.match(/^\[\[(.+?)\]\]$/)
+    if (arrayMatch) {
+      const key = (arrayMatch[1] ?? '').trim() // e.g. "side"
+      const entries = arrayTables.get(key) ?? []
+      const entry: Table = {}
+      entries.push(entry)
+      arrayTables.set(key, entries)
+      currentTable = entry
+      continue
+    }
+    // [section] or nested [parent.child]
+    const sectionMatch = line.match(/^\[(.+?)\]$/)
+    if (sectionMatch) {
+      const path = (sectionMatch[1] ?? '').split('.').map(part => part.trim())
+      const leaf = path.pop() ?? ''
+      let parent: Table = result
+      for (const part of path) {
+        // Array tables are registered at root level only, e.g. "side" for [side.env]
+        const entries = parent === result ? arrayTables.get(part) : undefined
+        const latest = entries?.[entries.length - 1]
+        const existing = parent[part]
+        if (latest) {
+          parent = latest
+        } else if (typeof existing === 'object' && existing !== null && !Array.isArray(existing)) {
+          parent = existing as Table
+        } else {
+          // No such parent yet: create it instead of writing into an unrelated table
+          const created: Table = {}
+          parent[part] = created
+          parent = created
+        }
+      }
+      // A header always opens a fresh table
+      currentTable = {}
+      parent[leaf] = currentTable
+      continue
+    }
+    // key = value (split on the first "=", so "=" inside the value is fine)
+    const eqIdx = line.indexOf('=')
+    if (eqIdx === -1) continue
+    const key = line.slice(0, eqIdx).trim()
+    currentTable[key] = parseValue(line.slice(eqIdx + 1).trim())
+  }
+  // Materialize array tables into result
+  for (const [key, entries] of arrayTables) {
+    result[key] = entries
+  }
+  return result
+}

package/src/cli.ts CHANGED Viewed

@@ -35,7 +35,7 @@ Usage:
   lythoskill-arena viz <arena-dir>
 Commands:
-  run       Run arena programmatically (cartesian player × deck → judge → report)
+  run       Run arena programmatically (declarative arena.toml or CLI flags)
   scaffold  Create arena directory structure (legacy, manual subagent execution)
   viz       Visualize arena report (ASCII charts)
@@ -44,14 +44,23 @@ Options:
   -s, --skills <list>    Comma-separated skill names (scaffold only)
       --decks <list>     Comma-separated deck paths
   -c, --criteria <list>  Evaluation criteria (default: syntax,context,logic,token)
-      --players <list>   Comma-separated player.toml paths (run only)
+      --players <list>   Comma-separated player.toml paths (CLI run only)
+      --config <path>    Path to arena.toml (declarative mode, k8s-style)
+      --dry-run          Print execution plan without running (with --config)
       --control <skill>  Control skill for comparison (scaffold only)
       --out <dir>        Output directory (run: defaults to runs/arena-<id>)
   -d, --dir <dir>        Output directory (scaffold: defaults to tmp)
   -p, --project <dir>    Project directory (default: .)
 Examples:
-  lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml,./players/kimi.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
+  # Declarative mode (k8s-style)
+  lythoskill-arena run --config ./arena.toml
+  lythoskill-arena run --config ./arena.toml --dry-run
+  # CLI-flag mode (backward compat)
+  lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
+  # Legacy scaffolding
   lythoskill-arena scaffold --task "Refactor auth module" --skills skill-a,skill-b
   lythoskill-arena viz runs/arena-20260504
 `)
@@ -71,6 +80,9 @@ function parseArgs(argv: string[]) {
     control: 'lythoskill-project-scribe',
     dir: 'tmp',
     project: '.',
+    config: undefined,
+    out: undefined,
+    players: undefined,
   }
   const positionals: string[] = []
@@ -90,6 +102,12 @@ function parseArgs(argv: string[]) {
       options.dir = argv[++i]
     } else if (arg === '--project' || arg === '-p') {
       options.project = argv[++i]
+    } else if (arg === '--config') {
+      options.config = argv[++i]
+    } else if (arg === '--out') {
+      options.out = argv[++i]
+    } else if (arg === '--players') {
+      options.players = argv[++i]
     } else if (!arg.startsWith('-')) {
       positionals.push(arg)
     }
@@ -563,9 +581,45 @@ function runViz(argv: string[]) {
 async function runProgrammaticArena(argv: string[]) {
   const { options } = parseArgs(argv)
+  const { readFileSync } = await import('node:fs')
+  const hasConfig = !!(options as Record<string, string | undefined>).config
+  const dryRun = argv.includes('--dry-run')
+  if (hasConfig) {
+    // arena.toml declarative mode
+    const { parseArenaToml } = await import('./arena-toml')
+    const { runArenaFromToml } = await import('./runner')
+    const configPath = (options as Record<string, string | undefined>).config!
+    const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
+    const result = await runArenaFromToml({
+      toml,
+      taskPath: toml.arena.task.startsWith('/') || toml.arena.task.startsWith('./')
+        ? toml.arena.task
+        : (options as Record<string, string | undefined>).task ?? toml.arena.task,
+      outDir: (options as Record<string, string | undefined>).out,
+      dryRun,
+    })
+    if ('plan' in result) {
+      // dry-run
+      console.log(`\n📋 Dry-run: ${result.plan.total_runs} cells across ${result.plan.cells.length / Math.max(1, toml.arena.runs_per_side)} sides × ${toml.arena.runs_per_side} runs`)
+      for (const cell of result.plan.cells) {
+        console.log(`   ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
+      }
+      return
+    }
+    console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
+    console.log(`📁 Artifacts: ${result.artifactsDir}`)
+    console.log(`📊 Report: ${result.artifactsDir}/report.md`)
+    return
+  }
+  // CLI-flag mode (backward compat)
   if (!options.task || !options.decks) {
-    console.error('❌ --task <path> and --decks <list> are required for "run"')
+    console.error('❌ --task <path> and --decks <list> are required for "run" (or use --config <arena.toml>)')
     process.exit(1)
   }
@@ -577,7 +631,6 @@ async function runProgrammaticArena(argv: string[]) {
     deckPaths: options.decks.split(',').map(s => s.trim()).filter(Boolean),
     criteria: (options.criteria ?? 'syntax,context,logic,token').split(',').map(s => s.trim()).filter(Boolean),
     outDir: options.out ?? `runs/arena-${timestamp()}`,
-    projectDir: options.project,
   })
   console.log(`\n🎮 Arena complete: ${result.manifest.id}`)

package/src/player.test.ts ADDED Viewed

@@ -0,0 +1,95 @@
+import { describe, test, expect } from 'bun:test'
+import { resolvePlayer, resolveSides, groupBySide, totalRuns } from './player'
+import { parseArenaToml } from './arena-toml'
+const toml = parseArenaToml(`
+[arena]
+task = "Test task"
+criteria = ["a", "b"]
+runs_per_side = 3
+[[side]]
+name = "minimal"
+player = "claude-code"
+deck = "./decks/minimal.toml"
+[[side]]
+name = "rich"
+player = "expert-architect"
+deck = "./decks/rich.toml"
+`)
+describe('resolvePlayer', () => {
+  test('maps claude-code → claude', () => {
+    expect(resolvePlayer('claude-code')).toBe('claude')
+  })
+  test('maps Claude → claude (case insensitive)', () => {
+    expect(resolvePlayer('Claude')).toBe('claude')
+  })
+  test('maps kimi → kimi', () => {
+    expect(resolvePlayer('kimi')).toBe('kimi')
+  })
+  test('passes through unknown player names', () => {
+    expect(resolvePlayer('expert-architect')).toBe('expert-architect')
+  })
+  test('trims whitespace', () => {
+    expect(resolvePlayer('  claude-code  ')).toBe('claude')
+  })
+})
+describe('resolveSides', () => {
+  test('resolves all sides in arena.toml', () => {
+    const sides = resolveSides(toml)
+    expect(sides).toHaveLength(2)
+    expect(sides[0].platform).toBe('claude')
+    expect(sides[1].platform).toBe('expert-architect')
+    expect(sides[0].playerName).toBe('claude-code')
+  })
+  test('preserves side config', () => {
+    const sides = resolveSides(toml)
+    expect(sides[0].side.name).toBe('minimal')
+    expect(sides[0].side.deck).toBe('./decks/minimal.toml')
+  })
+})
+describe('groupBySide', () => {
+  test('groups by side name with run count', () => {
+    const groups = groupBySide(toml)
+    expect(groups).toHaveLength(2)
+    expect(groups[0].runs).toBe(3) // runs_per_side
+    expect(groups[1].runs).toBe(3)
+    expect(groups[0].platform).toBe('claude')
+  })
+  test('control flag preserved', () => {
+    const controlToml = parseArenaToml(`
+[arena]
+task = "x"
+criteria = ["a"]
+[[side]]
+name = "test"
+player = "claude-code"
+deck = "a.toml"
+[[side]]
+name = "baseline"
+player = "claude-code"
+deck = "b.toml"
+control = true
+`)
+    const groups = groupBySide(controlToml)
+    expect(groups[1].control).toBe(true)
+  })
+})
+describe('totalRuns', () => {
+  test('calculates sides × runs_per_side', () => {
+    expect(totalRuns(toml)).toBe(6) // 2 sides × 3 runs
+  })
+})

package/src/player.ts ADDED Viewed

@@ -0,0 +1,71 @@
+import type { Side, ArenaToml } from './arena-toml'
+// ── Player reference resolution (pure function) ────────────────────────────
+// Maps arena.toml player names → platform identifiers.
+// AgentAdapter creation is the IO layer's job (T4), not ours.
+/** A side from arena.toml paired with the platform its player reference resolves to. */
+export interface ResolvedSide {
+  side: Side                        // the side entry, passed through unchanged
+  platform: string                  // resolved platform for useAgent()
+  playerName: string                // original player reference
+}
+/**
+ * Built-in player registry. Player names that map directly to useAgent platforms.
+ * Keys are lower-case because `resolvePlayer` lower-cases and trims a name before
+ * looking it up; `claude-code` is an alias for the `claude` platform.
+ */
+const BUILTIN_PLAYERS: Record<string, string> = {
+  'claude': 'claude',
+  'claude-code': 'claude',
+  'kimi': 'kimi',
+  'cursor': 'cursor',
+  'gemini': 'gemini',
+}
+/**
+ * Resolve a player reference to its platform identifier.
+ * - The name is trimmed and lower-cased before lookup
+ * - Built-in names (claude, claude-code, kimi, cursor, gemini) map to their platform
+ * - Unknown names are passed through in normalized form (assumed to be useAgent-compatible)
+ * - Future: custom player.toml files will override built-in mappings
+ *
+ * @param name Player reference as written in arena.toml.
+ * @returns The platform identifier; always a string.
+ */
+export function resolvePlayer(name: string): string {
+  const normalized = name.toLowerCase().trim()
+  // Own-property check: a bare index lookup would also find inherited members,
+  // e.g. a player named "constructor" would resolve to Object.prototype.constructor.
+  const builtin = Object.prototype.hasOwnProperty.call(BUILTIN_PLAYERS, normalized)
+    ? BUILTIN_PLAYERS[normalized]
+    : undefined
+  return builtin ?? normalized
+}
+/**
+ * Resolve the player reference of every side in an arena.toml.
+ * Returns one entry per side, in file order, carrying the untouched side, the
+ * resolved platform and the original player name. Pure — no IO, no agent creation.
+ */
+export function resolveSides(toml: ArenaToml): ResolvedSide[] {
+  const resolved: ResolvedSide[] = []
+  for (const side of toml.side) {
+    const playerName = side.player
+    resolved.push({ side, platform: resolvePlayer(playerName), playerName })
+  }
+  return resolved
+}
+// ── Side grouping (for per-side aggregation in T3) ─────────────────────────
+/** Flat per-side summary produced by `groupBySide`. */
+export interface SideGroup {
+  sideName: string   // the side's `name`
+  player: string     // original player reference from arena.toml
+  deck: string       // the side's `deck` path
+  control: boolean   // the side's `control` flag
+  runs: number       // `arena.runs_per_side`, identical for every side
+  platform: string   // platform the player reference resolves to
+}
+/**
+ * Build one {@link SideGroup} per side for per-side statistical aggregation.
+ * Each group carries the side's identity, its resolved platform and the
+ * arena-wide `runs_per_side` count.
+ */
+export function groupBySide(toml: ArenaToml): SideGroup[] {
+  const toGroup = ({ side, playerName, platform }: ResolvedSide): SideGroup => {
+    const { name: sideName, deck, control } = side
+    return {
+      sideName,
+      player: playerName,
+      deck,
+      control,
+      runs: toml.arena.runs_per_side,
+      platform,
+    }
+  }
+  return resolveSides(toml).map(toGroup)
+}
+/** Total number of runs an arena.toml describes: number of sides times `runs_per_side`. */
+export function totalRuns(toml: ArenaToml): number {
+  const { side, arena } = toml
+  return side.length * arena.runs_per_side
+}

package/src/runner.ts CHANGED Viewed

@@ -1,9 +1,14 @@
-import { mkdirSync, writeFileSync, existsSync, readFileSync } from 'node:fs'
+import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
 import { join, resolve } from 'node:path'
 import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
 import { useAgent } from '@lythos/test-utils/agents'
-import { ArenaManifest, Player, type ArenaManifest as ArenaManifestType } from '@lythos/test-utils/schema'
+import { ArenaManifest, Player } from '@lythos/test-utils/schema'
+import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
 import { runComparativeJudge } from './comparative-judge'
+import { parseArenaToml, buildExecutionPlan, type ArenaToml, type ExecutionPlan } from './arena-toml'
+import { resolvePlayer, resolveSides } from './player'
+import { aggregateAllStats } from './stats'
+import type { SideStats } from './stats'
 // ── Helpers ───────────────────────────────────────────────────────────────
@@ -12,150 +17,226 @@ function stamp(): string {
   return `${d.getFullYear()}${String(d.getMonth() + 1).padStart(2, '0')}${String(d.getDate()).padStart(2, '0')}-${String(d.getHours()).padStart(2, '0')}${String(d.getMinutes()).padStart(2, '0')}${String(d.getSeconds()).padStart(2, '0')}`
 }
-function cartesian<T>(arrays: T[][]): T[][] {
-  if (arrays.length === 0) return [[]]
-  const [first, ...rest] = arrays
-  const restProd = cartesian(rest)
-  return first.flatMap(a => restProd.map(r => [a, ...r]))
-}
+// ── Declarative runner (arena.toml → execute) ─────────────────────────────
-function slugify(input: string): string {
-  return input.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '').slice(0, 40)
+export interface ArenaResult {
+  manifest: ArenaManifestType // manifest re-parsed with status 'completed'
+  report: unknown // value returned by runComparativeJudge
+  stats: SideStats[] // per-side aggregates produced by aggregateAllStats
+  artifactsDir: string // directory that receives arena.json and report.md
 }
-// ── Runner ────────────────────────────────────────────────────────────────
+/**
+ * Render an execution plan as CLI-ready text lines; performs no I/O.
+ *
+ * The first line summarises the plan (total runs, number of distinct sides,
+ * cells per side). Each following line describes one cell, and cells whose
+ * `control` flag is truthy are tagged with ` [control]`.
+ */
+export function formatPlanOutput(plan: ExecutionPlan): string[] {
+  const sideCount = new Set(plan.cells.map(cell => cell.side)).size
+  // Divisor is clamped to 1 so a plan without cells reports 0 rather than NaN.
+  const cellsPerSide = plan.cells.length / Math.max(1, sideCount)
+  const summary = `\n📋 Dry-run: ${plan.total_runs} cells across ${sideCount} sides × ${cellsPerSide} runs`
+  const rows = plan.cells.map(cell => {
+    const controlTag = cell.control ? ' [control]' : ''
+    return `   ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${controlTag}`
+  })
+  return [summary, ...rows]
+}
-export async function runArena(opts: {
+export async function runArenaFromToml(opts: { // Declarative runner: build the plan from arena.toml, run every cell, aggregate per-side stats, run the comparative judge, write artifacts
+  toml: ArenaToml
   taskPath: string
-  playerPaths: string[]
-  deckPaths: string[]
-  criteria: string[]
-  outDir: string
-  projectDir?: string
-}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
-  const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts
-  // Load players
-  const players = playerPaths.map(p => {
-    const content = readFileSync(resolve(p), 'utf-8')
-    const parsed = Player.parse(JSON.parse(content))
-    return { path: p, ...parsed }
-  })
-  // Load deck labels from deck paths
-  const decks = deckPaths.map(p => ({ path: resolve(p) }))
-  // Build (player × deck) variant matrix
-  const variants = cartesian([players, decks]).map(([player, deck], i) => ({
-    participant_id: `run-${String(i + 1).padStart(2, '0')}`,
-    player,
-    deck_path: deck.path,
-  }))
+  outDir?: string // artifact directory; falls back to <cwd>/runs/<arenaId> when empty or omitted
+  dryRun?: boolean // when truthy, return the plan without executing any cell
+  log?: (msg: string) => void // optional sink for the dry-run plan lines
+}): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
+  const { toml, taskPath, outDir, dryRun, log } = opts
+  const plan = buildExecutionPlan(toml)
+  // dry-run: emit the formatted plan through log (if provided) and return it without executing
+  if (dryRun) {
+    for (const line of formatPlanOutput(plan)) {
+      log?.(line)
+    }
+    return { plan }
+  }
-  // Build arena manifest
   const arenaId = `arena-${stamp()}`
   const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
+  const resolved = resolveSides(toml)
+  // Build manifest: one participant per distinct side name (a repeated name keeps only its last entry)
   const manifest = ArenaManifest.parse({
     id: arenaId,
     created_at: new Date().toISOString(),
     task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200), // only the first 200 characters of the task file are kept
     mode: 'decks',
-    participants: variants.map(v => ({
-      id: v.participant_id,
-      name: v.player.path.split('/').pop()?.replace('.toml', '') ?? v.player.platform,
-      player: v.player.platform,
-      deck: v.deck_path,
-      description: `${v.player.platform} × ${v.deck_path.split('/').pop()?.replace('.toml', '')}`,
+    participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
+      id: r.side.name,
+      name: r.side.name,
+      player: r.platform,
+      deck: r.side.deck,
+      description: `${r.playerName} × ${r.side.deck}`,
     })),
-    criteria,
+    criteria: toml.arena.criteria,
     status: 'running',
   })
   mkdirSync(artifactsDir, { recursive: true })
   writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(manifest, null, 2) + '\n')
-  // Run each variant
-  const verdicts: { participantId: string; verdict: unknown }[] = []
+  // Execute plan: cells run sequentially, one agent scenario each; verdicts are collected per side
+  const verdictsBySide = new Map<string, JudgeVerdict[]>()
-  for (const variant of variants) {
-    const cellDir = join(artifactsDir, 'runs', variant.participant_id)
+  for (const cell of plan.cells) {
+    const cellDir = join(artifactsDir, 'runs', cell.side, `run-${cell.run}`)
     mkdirSync(cellDir, { recursive: true })
     try {
+      const agent = useAgent(resolvePlayer(cell.player)) // platform is re-resolved from the cell's player name
       const result = await runAgentScenario({
         scenarioPath: resolve(taskPath),
-        agent: useAgent(variant.player.platform),
+        agent,
         setupWorkdir(_scenario: AgentScenario, workdir: string) {
           mkdirSync(workdir, { recursive: true })
-          // Write deck.toml as skill-deck.toml
-          const deckContent = readFileSync(variant.deck_path, 'utf-8')
+          const deckContent = readFileSync(resolve(cell.deck), 'utf-8') // the cell's deck file becomes skill-deck.toml in the workdir
           writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
         },
-        baseDir: artifactsDir,
+        baseDir: join(artifactsDir, 'runs', cell.side),
       })
-      verdicts.push({
-        participantId: variant.participant_id,
-        verdict: result.verdict,
-      })
+      const v = (result.verdict ?? { // a missing verdict is recorded as ERROR
+        verdict: 'ERROR' as const,
+        reason: 'No verdict returned',
+        criteria: [],
+      }) as JudgeVerdict
+      if (!verdictsBySide.has(cell.side)) verdictsBySide.set(cell.side, [])
+      verdictsBySide.get(cell.side)!.push(v)
     } catch (e) { // a thrown run is recorded as an ERROR verdict instead of aborting the arena
-      verdicts.push({
-        participantId: variant.participant_id,
-        verdict: {
-          verdict: 'ERROR' as const,
-          reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
-        },
+      if (!verdictsBySide.has(cell.side)) verdictsBySide.set(cell.side, [])
+      verdictsBySide.get(cell.side)!.push({
+        verdict: 'ERROR' as const,
+        reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
+        criteria: [],
       })
     }
   }
-  // Run comparative judge
-  const judge = useAgent(players[0]?.platform ?? 'claude')
+  // Aggregate stats per side over all of its collected verdicts
+  const stats = aggregateAllStats(verdictsBySide)
+  // Comparative judge
+  const flatVerdicts: { participantId: string; verdict: unknown }[] = []
+  for (const [side, verdicts] of verdictsBySide) {
+    // Only the first run's verdict per side is passed to the comparative judge (aggregating all runs into one is not done here)
+    if (verdicts.length > 0) {
+      flatVerdicts.push({ participantId: side, verdict: verdicts[0] })
+    }
+  }
+  const judge = useAgent(resolved[0]?.platform ?? 'claude') // judge uses the first side's platform, or 'claude' when no side exists
   const report = await runComparativeJudge({
     manifest,
-    verdicts,
+    verdicts: flatVerdicts,
     judge,
     workdir: artifactsDir,
   })
   // Write report
-  writeFileSync(join(artifactsDir, 'report.md'), `# Arena Report: ${manifest.id}
+  writeReport(artifactsDir, manifest, report, stats)
-**Task**: ${manifest.task}
-**Criteria**: ${manifest.criteria.join(', ')}
-**Date**: ${new Date().toISOString()}
+  // Update manifest
+  const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
+  writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(finalManifest, null, 2) + '\n')
-## Score Matrix
-${renderScoreMatrix(report)}
+  return { manifest: finalManifest, report, stats, artifactsDir }
+}
-## Pareto Frontier
-${renderPareto(report)}
+// ── Backward compat: CLI-flag style runner ─────────────────────────────────
-## Key Findings
-${(report.key_findings ?? []).map((f: string) => `- ${f}`).join('\n')}
+/**
+ * Backward-compatible CLI-flag runner.
+ *
+ * Builds an in-memory `ArenaToml` with one side per (player, deck) pair,
+ * named `run-01`, `run-02`, … in player-major order and run once each, then
+ * delegates to `runArenaFromToml`.
+ *
+ * @param opts.taskPath - Task file; its first 200 characters become `arena.task`.
+ * @param opts.playerPaths - Player files, read as JSON and validated with `Player.parse`.
+ * @param opts.deckPaths - Deck paths, passed through unchanged as each side's deck.
+ * @param opts.criteria - Evaluation criteria forwarded to the arena config.
+ * @param opts.outDir - Artifact directory forwarded to `runArenaFromToml`.
+ * @returns The manifest, report and artifact directory of the finished arena.
+ */
+export async function runArena(opts: {
+  taskPath: string
+  playerPaths: string[]
+  deckPaths: string[]
+  criteria: string[]
+  outDir: string
+}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
+  const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts
-## Recommendations
-${(report.recommendations ?? []).map((r: { audience: string; recommendation: string }) => `- **${r.audience}**: ${r.recommendation}`).join('\n')}
-`)
+  // Convert CLI flags to ArenaToml internally
+  const toml: ArenaToml = {
+    arena: {
+      task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
+      criteria,
+      runs_per_side: 1,
+      // NOTE(review): the sides below number players × decks, which can exceed
+      // this min(); confirm how max_participants is enforced downstream.
+      max_participants: Math.min(playerPaths.length, deckPaths.length),
+    },
+    side: playerPaths.flatMap((playerPath, pi) => {
+      // Without decks there are no sides, so the player file is left untouched.
+      if (deckPaths.length === 0) return []
+      // Read and validate each player file once instead of once per deck.
+      const platform = Player.parse(JSON.parse(readFileSync(resolve(playerPath), 'utf-8'))).platform
+      return deckPaths.map((deckPath, di) => ({
+        name: `run-${String(pi * deckPaths.length + di + 1).padStart(2, '0')}`,
+        player: platform,
+        deck: deckPath,
+      }))
+    }),
+  }
-  // Update manifest status
-  const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
-  writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(finalManifest, null, 2) + '\n')
+  const result = await runArenaFromToml({ toml, taskPath, outDir })
+  // dryRun is never set on this path, so the result is the ArenaResult variant.
+  const { manifest, report, artifactsDir } = result as ArenaResult
+  return { manifest, report, artifactsDir }
+}
-  return { manifest: finalManifest, report, artifactsDir }
+// ── Report renderer ────────────────────────────────────────────────────────
+/**
+ * Write `report.md` into `dir`.
+ *
+ * Layout, in order: title, task/criteria/date header, then the sections
+ * Score Matrix, Per-Side Statistics, Pareto Frontier, Key Findings and
+ * Recommendations. Missing `key_findings` / `recommendations` produce empty
+ * sections.
+ */
+function writeReport(dir: string, manifest: ArenaManifestType, report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]; pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[]; key_findings?: string[]; recommendations?: { audience: string; recommendation: string }[] }, stats: SideStats[]): void {
+  const out: string[] = []
+  // Every section is preceded by a blank line and has one after its heading.
+  const section = (title: string, body: string[]): void => {
+    out.push('', `## ${title}`, '', ...body)
+  }
+
+  out.push(
+    `# Arena Report: ${manifest.id}`,
+    '',
+    `**Task**: ${manifest.task}`,
+    `**Criteria**: ${manifest.criteria.join(', ')}`,
+    `**Date**: ${new Date().toISOString()}`,
+  )
+  section('Score Matrix', [renderScoreMatrix(report)])
+  section('Per-Side Statistics', [renderStatsTable(stats)])
+  section('Pareto Frontier', [renderPareto(report)])
+  section('Key Findings', (report.key_findings ?? []).map((f: string) => `- ${f}`))
+  section(
+    'Recommendations',
+    (report.recommendations ?? []).map((r: { audience: string; recommendation: string }) => `- **${r.audience}**: ${r.recommendation}`),
+  )
+
+  writeFileSync(join(dir, 'report.md'), out.join('\n') + '\n')
 }
-// ── Markdown Renderers ────────────────────────────────────────────────────
+/**
+ * Render per-side statistics as a Markdown table with one row per side.
+ *
+ * Pass rate and per-criterion means are multiplied by 100 and shown as whole
+ * percentages; mean confidence is printed as-is with a `%` suffix, or `-`
+ * when it is null. An empty input yields a placeholder sentence.
+ */
+function renderStatsTable(stats: SideStats[]): string {
+  if (stats.length === 0) return 'No statistics available.\n'
+  const asPercent = (ratio: number): string => `${(ratio * 100).toFixed(0)}%`
+  const rows = stats.map(side => {
+    const confidence = side.meanConfidence != null ? `${side.meanConfidence.toFixed(0)}%` : '-'
+    const perCriterion = side.criteria.map(c => `${c.name}: ${asPercent(c.mean)}`).join(', ')
+    return `| ${side.sideName} | ${side.runs} | ${asPercent(side.passRate)} | ${confidence} | ${perCriterion} |\n`
+  })
+  return [
+    `| Side | Runs | Pass Rate | Mean Confidence | Criteria |\n`,
+    `|------|------|-----------|-----------------|----------|\n`,
+    ...rows,
+  ].join('')
+}
 function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[] }): string {
   if (!report.score_matrix?.length) return 'No scores available.\n'
-  // Build participant × criterion matrix
   const participants = [...new Set(report.score_matrix.map(s => s.participant_id))]
   const criteria = [...new Set(report.score_matrix.map(s => s.criterion))]
-  let table = `| Criterion | Weight | ${participants.map(p => `${p}`).join(' | ')} |\n`
+  let table = `| Criterion | Weight | ${participants.join(' | ')} |\n`
   table += `|${'---|'.repeat(2 + participants.length)}\n`
   for (const c of criteria) {
@@ -165,7 +246,6 @@ function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id:
     }).join(' | ')} |\n`
   }
-  // Weighted totals
   table += `| **Weighted Total** | 100% | ${participants.map(p => {
     const pScores = report.score_matrix!.filter(s => s.participant_id === p)
     const avg = pScores.length ? pScores.reduce((sum, s) => sum + s.score, 0) / pScores.length : 0
@@ -177,11 +257,9 @@ function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id:
 function renderPareto(report: unknown & { pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[] }): string { // renders one Markdown bullet per pareto entry
   if (!report.pareto?.length) return 'No Pareto analysis.\n' // placeholder when pareto is missing or empty
-  return report.pareto.map((p: { participant_id: string; dominated: boolean; dominated_by: string[] }) => {
-    if (p.dominated) {
-      return `- **${p.participant_id}**: dominated by ${p.dominated_by.join(', ')}`
-    }
-    return `- **${p.participant_id}**: Pareto-optimal (non-dominated)`
-  }).join('\n')
+  return report.pareto.map(p =>
+    p.dominated
+      ? `- **${p.participant_id}**: dominated by ${p.dominated_by.join(', ')}`
+      : `- **${p.participant_id}**: Pareto-optimal (non-dominated)`
+  ).join('\n')
 }

package/src/stats.test.ts ADDED Viewed

@@ -0,0 +1,111 @@
+import { describe, test, expect } from 'bun:test'
+import { aggregateSideStats, aggregateAllStats } from './stats'
+import type { JudgeVerdict } from '@lythos/test-utils/schema'
+/** Build a PASS verdict with a single passed `correctness` criterion; fields in `overrides` replace the defaults. */
+function makeVerdict(overrides?: Partial<JudgeVerdict>): JudgeVerdict {
+  const defaults: JudgeVerdict = {
+    verdict: 'PASS',
+    reason: 'OK',
+    criteria: [{ name: 'correctness', passed: true }],
+  }
+  return { ...defaults, ...overrides }
+}
+// ── aggregateSideStats ─────────────────────────────────────────────────────
+describe('aggregateSideStats', () => {
+  test('single run: passRate=1, no variance', () => {
+    const { sideName, runs, passRate, failRate, errorRate } = aggregateSideStats('test', [makeVerdict()])
+    expect(sideName).toBe('test')
+    expect(runs).toBe(1)
+    expect(passRate).toBe(1)
+    expect(failRate).toBe(0)
+    expect(errorRate).toBe(0)
+  })
+  test('3 runs: 2 PASS, 1 FAIL', () => {
+    const failing = makeVerdict({ verdict: 'FAIL', reason: 'bad' })
+    const { passRate, failRate } = aggregateSideStats('test', [makeVerdict(), makeVerdict(), failing])
+    expect(passRate).toBeCloseTo(2 / 3)
+    expect(failRate).toBeCloseTo(1 / 3)
+  })
+  test('confidence: mean across runs', () => {
+    const runs = [90, 80, 70].map(confidence => makeVerdict({ confidence }))
+    const { meanConfidence, confidenceVariance } = aggregateSideStats('test', runs)
+    expect(meanConfidence).toBeCloseTo(80)
+    // sample variance: ((90-80)² + (80-80)² + (70-80)²) / (3 - 1) = 100
+    expect(confidenceVariance).toBeCloseTo(100)
+  })
+  test('confidence: null when no verdict has it', () => {
+    const { meanConfidence, confidenceVariance } = aggregateSideStats('test', [makeVerdict(), makeVerdict()])
+    expect(meanConfidence).toBeNull()
+    expect(confidenceVariance).toBeNull()
+  })
+  test('per-criterion pass rate', () => {
+    const runs = [true, false, true].map(passed => makeVerdict({ criteria: [{ name: 'accuracy', passed }] }))
+    const { criteria } = aggregateSideStats('test', runs)
+    expect(criteria).toHaveLength(1)
+    const [accuracy] = criteria
+    expect(accuracy.name).toBe('accuracy')
+    expect(accuracy.mean).toBeCloseTo(2 / 3)
+  })
+  test('per-criterion scores: mean and variance', () => {
+    const runs = [5, 3, 4].map(coverage => makeVerdict({ scores: { coverage, relevance: 4 } }))
+    const { scoreByCriterion } = aggregateSideStats('test', runs)
+    expect(scoreByCriterion.coverage.mean).toBeCloseTo(4)
+    expect(scoreByCriterion.relevance.mean).toBeCloseTo(4)
+    // every relevance score is 4, so there is no spread
+    expect(scoreByCriterion.relevance.variance).toBe(0)
+  })
+  test('zero runs: all zeros', () => {
+    const { runs, passRate, meanConfidence } = aggregateSideStats('empty', [])
+    expect(runs).toBe(0)
+    expect(passRate).toBe(0)
+    expect(meanConfidence).toBeNull()
+  })
+  test('handles ERROR verdicts correctly', () => {
+    const errored = makeVerdict({ verdict: 'ERROR', reason: 'parse failed' })
+    const { passRate, errorRate } = aggregateSideStats('test', [makeVerdict(), errored])
+    expect(passRate).toBe(0.5)
+    expect(errorRate).toBe(0.5)
+  })
+})
+// ── aggregateAllStats ──────────────────────────────────────────────────────
+describe('aggregateAllStats', () => {
+  test('aggregates multiple sides', () => {
+    const bySide = new Map<string, JudgeVerdict[]>([
+      ['side-a', [makeVerdict(), makeVerdict()]],
+      ['side-b', [makeVerdict({ verdict: 'FAIL', reason: 'nope' })]],
+    ])
+    const all = aggregateAllStats(bySide)
+    expect(all).toHaveLength(2)
+    // results follow the map's insertion order
+    const [sideA, sideB] = all
+    expect(sideA.sideName).toBe('side-a')
+    expect(sideA.passRate).toBe(1)
+    expect(sideB.sideName).toBe('side-b')
+    expect(sideB.passRate).toBe(0)
+  })
+})

package/src/stats.ts ADDED Viewed

@@ -0,0 +1,117 @@
+import type { JudgeVerdict } from '@lythos/test-utils/schema'
+// ── Statistical aggregation for runs_per_side ─────────────────────────────
+// All pure functions. Input: N verdicts from N runs. Output: aggregated stats.
+export interface CriterionStats {
+  name: string                      // criterion name as reported in verdict.criteria
+  mean: number                      // share of reporting runs in which the criterion passed (0..1)
+  variance: number                  // Bernoulli variance: mean × (1 − mean)
+  min: number                       // on the 0..1 pass/fail scale
+  max: number                       // on the 0..1 pass/fail scale
+  count: number                     // number of runs that reported this criterion
+}
+export interface SideStats {
+  sideName: string                  // label passed to aggregateSideStats
+  runs: number                      // number of verdicts aggregated
+  passRate: number                  // PASS / total (0 when there are no runs)
+  failRate: number                  // FAIL / total (0 when there are no runs)
+  errorRate: number                 // ERROR / total (0 when there are no runs)
+  meanConfidence: number | null     // null if no verdict had confidence
+  confidenceVariance: number | null // sample variance; null with fewer than two confidence values
+  criteria: CriterionStats[]        // one entry per criterion name found in verdict.criteria
+  scoreByCriterion: Record<string, { mean: number; variance: number }> // keyed by criterion name from verdict.scores
+}
+// ── Helpers ────────────────────────────────────────────────────────────────
+/** Arithmetic mean of `values`; an empty array yields 0. */
+function mean(values: number[]): number {
+  const n = values.length
+  if (n === 0) return 0
+  let total = 0
+  for (const value of values) total += value
+  return total / n
+}
+/**
+ * Sample variance of `values` (divides by n − 1); fewer than two values yield 0.
+ *
+ * @param m - Precomputed mean of `values`; computed here when omitted.
+ */
+function variance(values: number[], m?: number): number {
+  const n = values.length
+  if (n < 2) return 0
+  const center = m ?? mean(values)
+  let squaredDeviations = 0
+  for (const value of values) squaredDeviations += (value - center) ** 2
+  return squaredDeviations / (n - 1)
+}
+// ── Aggregator ────────────────────────────────────────────────────────────
+/**
+ * Aggregate the verdicts of one side's runs into summary statistics.
+ *
+ * @param sideName - Label copied verbatim into the result.
+ * @param verdicts - One verdict per run; may be empty.
+ * @returns Outcome rates (all 0 when there are no runs), mean and sample
+ *   variance of the reported confidences (null when unavailable),
+ *   per-criterion pass statistics from `verdict.criteria`, and per-criterion
+ *   score statistics from `verdict.scores`.
+ */
+export function aggregateSideStats(sideName: string, verdicts: JudgeVerdict[]): SideStats {
+  const runs = verdicts.length
+  const passCount = verdicts.filter(v => v.verdict === 'PASS').length
+  const failCount = verdicts.filter(v => v.verdict === 'FAIL').length
+  const errorCount = verdicts.filter(v => v.verdict === 'ERROR').length
+  // Confidence: only verdicts that actually carry one contribute.
+  const confidences = verdicts.map(v => v.confidence).filter((c): c is number => c != null)
+  const meanConf = confidences.length > 0 ? mean(confidences) : null
+  // A variance needs at least two observations; otherwise report null.
+  const confVar = meanConf !== null && confidences.length > 1 ? variance(confidences, meanConf) : null
+  // Per-criterion pass/fail outcomes from verdict.criteria
+  const outcomesByCriterion = new Map<string, boolean[]>()
+  for (const v of verdicts) {
+    for (const c of v.criteria ?? []) {
+      const outcomes = outcomesByCriterion.get(c.name)
+      if (outcomes) outcomes.push(c.passed)
+      else outcomesByCriterion.set(c.name, [c.passed])
+    }
+  }
+  const criteria: CriterionStats[] = []
+  for (const [name, outcomes] of outcomesByCriterion) {
+    const passed = outcomes.filter(Boolean).length
+    const passRate = passed / outcomes.length
+    criteria.push({
+      name,
+      mean: passRate,               // for criteria, "mean" = pass rate across runs
+      variance: passRate * (1 - passRate), // Bernoulli variance
+      // Observed extremes on the 0/1 pass scale rather than fixed bounds:
+      // min is 1 only if every run passed, max is 0 only if none did.
+      min: passed === outcomes.length ? 1 : 0,
+      max: passed > 0 ? 1 : 0,
+      count: outcomes.length,
+    })
+  }
+  // Per-criterion scores (1-5) from verdict.scores
+  const scoreMap = new Map<string, number[]>()
+  for (const v of verdicts) {
+    if (!v.scores) continue
+    for (const [criterion, score] of Object.entries(v.scores)) {
+      const scores = scoreMap.get(criterion)
+      if (scores) scores.push(score)
+      else scoreMap.set(criterion, [score])
+    }
+  }
+  const scoreByCriterion: Record<string, { mean: number; variance: number }> = {}
+  for (const [criterion, scores] of scoreMap) {
+    const m = mean(scores)
+    // variance() already returns 0 for fewer than two scores.
+    scoreByCriterion[criterion] = { mean: m, variance: variance(scores, m) }
+  }
+  return {
+    sideName,
+    runs,
+    passRate: runs > 0 ? passCount / runs : 0,
+    failRate: runs > 0 ? failCount / runs : 0,
+    errorRate: runs > 0 ? errorCount / runs : 0,
+    meanConfidence: meanConf,
+    confidenceVariance: confVar,
+    criteria,
+    scoreByCriterion,
+  }
+}
+/**
+ * Run `aggregateSideStats` over every entry of a sideName → verdicts map.
+ *
+ * @returns One `SideStats` per map entry, in the map's iteration order.
+ */
+export function aggregateAllStats(
+  verdictsBySide: Map<string, JudgeVerdict[]>
+): SideStats[] {
+  return Array.from(verdictsBySide, ([sideName, verdicts]) => aggregateSideStats(sideName, verdicts))
+}