@lythos/skill-arena 0.9.2 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # @lythos/skill-arena
2
2
 
3
- > Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis.
3
+ ![CI](https://img.shields.io/badge/CI-41%20unit%20tests-brightgreen) ![Intent/Plan](https://img.shields.io/badge/arch-intent%2Fplan%2Fexecute-8A2BE2)
4
+
5
+ > Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis. Now with declarative `arena.toml` (k8s-manifest style) and deterministic Pareto frontier.
4
6
 
5
7
  ## Why
6
8
 
@@ -40,25 +42,36 @@ bunx @lythos/skill-arena viz tmp/arena-<id>/
40
42
 
41
43
  ## Commands
42
44
 
45
+ ### Declarative mode (k8s-style, recommended)
46
+
47
+ ```bash
48
+ # Print execution plan without running
49
+ bunx @lythos/skill-arena run --config arena.toml --dry-run
50
+
51
+ # Execute every side runs_per_side times, then aggregate statistics per side
52
+ bunx @lythos/skill-arena run --config arena.toml
53
+ ```
54
+
55
+ ### CLI-flag mode (backward compat)
56
+
57
+ ```
58
+ bunx @lythos/skill-arena run \
59
+ --task ./TASK-arena.md \
60
+ --players ./players/claude.toml \
61
+ --decks ./decks/run-01.toml,./decks/run-02.toml \
62
+ --criteria coverage,relevance,actionability,depth
43
63
  ```
44
- Usage: bunx @lythos/skill-arena <options> | bunx @lythos/skill-arena viz <dir>
45
64
 
46
- Mode 1 Single-Skill Comparison:
47
- --task, -t <desc> Task description (required)
48
- --skills, -s <list> Comma-separated skills, 2–5 (Mode 1)
49
- --criteria, -c <list> Evaluation dimensions (default: syntax,context,logic,token)
50
- --control <skill> Control skill (default: lythoskill-project-scribe)
65
+ ### Scaffold mode (legacy, manual execution)
51
66
 
52
- Mode 2 — Full-Deck Comparison:
53
- --decks <paths> Comma-separated deck toml paths, 2–5 (Mode 2)
54
- --criteria, -c <list> Evaluation dimensions
67
+ ```
68
+ bunx @lythos/skill-arena scaffold --task "..." --skills a,b
69
+ ```
55
70
 
56
- Common:
57
- --dir, -d <path> Arena parent directory (default: tmp)
58
- --project, -p <path> Project root (default: .)
71
+ ### Viz
59
72
 
60
- Viz:
61
- viz <dir> Render ASCII charts from report.md
73
+ ```bash
74
+ bunx @lythos/skill-arena viz runs/arena-<id>/
62
75
  ```
63
76
 
64
77
  ## Skill Documentation
@@ -77,6 +90,31 @@ Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
77
90
  Output (skills/<name>/) → git commit → agent-visible skill
78
91
  ```
79
92
 
93
+ ### Runtime architecture (intent/plan/execute)
94
+
95
+ ```
96
+ arena.toml → ArenaToml (Zod) → ExecutionPlan (pure) → per-cell agent spawn (IO)
97
+
98
+ aggregateAllStats (pure) ← verdicts[]
99
+
100
+ runComparativeJudge (IO) → report.md + Pareto frontier
101
+ ```
102
+
103
+ - **Intent**: `arena.toml` declarative config (k8s-manifest style)
104
+ - **Plan**: `buildExecutionPlan()`, `aggregateSideStats()`, `computePareto()` — pure functions
105
+ - **Execute**: `runAgentScenario` per cell, `runComparativeJudge` — IO via `AgentAdapter`
106
+
107
+ Built on `@lythos/test-utils` shared infrastructure.
108
+
109
+ ## Test Coverage
110
+
111
+ | Layer | Count | CI | Notes |
112
+ |-------|-------|----|-------|
113
+ | Unit tests | 41 | ✅ | TOML parser, player resolution, Pareto, stats |
114
+ | Agent BDD | — | ❌ | Requires `claude` CLI; run locally |
115
+
116
+ Pareto frontier is a **deterministic algorithm** — never delegated to LLM. 8 unit tests cover dominance, cross-dominance, transitive chains, partial criteria, and empty scores.
117
+
80
118
  ## License
81
119
 
82
120
  MIT
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.9.2",
3
+ "version": "0.9.6",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -0,0 +1,191 @@
1
+ import { describe, test, expect } from 'bun:test'
2
+ import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
3
+ import { formatPlanOutput } from './runner'
4
+
5
/** Smallest accepted manifest: two sides, with every optional field left to its schema default. */
const minimalToml = `
[arena]
task = "Test task"
criteria = ["a", "b"]

[[side]]
name = "runner-a"
player = "claude-code"
deck = "./decks/a.toml"

[[side]]
name = "runner-b"
player = "claude-code"
deck = "./decks/b.toml"
`

/**
 * Manifest exercising runs_per_side, a control side, and a nested [side.env]
 * table (which attaches to the last [[side]] entry, i.e. "baseline").
 */
const fullToml = `
[arena]
task = "Generate auth flow diagram"
criteria = ["syntax", "context", "logic", "token"]
runs_per_side = 3

[[side]]
name = "minimal"
player = "standard-coder"
deck = "./decks/minimal.toml"

[[side]]
name = "rich"
player = "expert-architect"
deck = "./decks/rich.toml"

[[side]]
name = "baseline"
player = "standard-coder"
deck = "./decks/baseline.toml"
control = true

[side.env]
container = "node:20-alpine"
pre_run = ["npm ci", "npm run build"]
working_dir = "/workspace"
`
48
+
49
+ // ── Schema + Parser ────────────────────────────────────────────────────────
50
+
51
// Covers schema defaults, the nested [side.env] table, and each rejection path.
describe('parseArenaToml', () => {
  test('parses minimal two-side arena', () => {
    const { arena, side } = parseArenaToml(minimalToml)
    expect(arena.task).toBe('Test task')
    expect(arena.criteria).toEqual(['a', 'b'])
    expect(arena.runs_per_side).toBe(1) // default
    expect(side).toHaveLength(2)

    const first = side[0]
    expect(first.name).toBe('runner-a')
    expect(first.player).toBe('claude-code')
    expect(first.deck).toBe('./decks/a.toml')
    expect(first.control).toBe(false) // default
  })

  test('parses full arena with runs_per_side and control', () => {
    const parsed = parseArenaToml(fullToml)
    expect(parsed.arena.runs_per_side).toBe(3)
    expect(parsed.side).toHaveLength(3)

    const baseline = parsed.side[2]
    expect(baseline.name).toBe('baseline')
    expect(baseline.control).toBe(true)
  })

  test('parses side env block', () => {
    // [side.env] follows the third [[side]], so it belongs to side[2].
    const { env } = parseArenaToml(fullToml).side[2]
    expect(env.container).toBe('node:20-alpine')
    expect(env.pre_run).toEqual(['npm ci', 'npm run build'])
    expect(env.working_dir).toBe('/workspace')
    expect(env.env_vars).toEqual({})
  })

  test('rejects fewer than 2 sides', () => {
    const singleSide = `[arena]\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "only"\nplayer = "c"\ndeck = "x.toml"`
    expect(() => parseArenaToml(singleSide)).toThrow()
  })

  test('rejects empty criteria', () => {
    const noCriteria = `[arena]\ntask = "x"\ncriteria = []\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
    expect(() => parseArenaToml(noCriteria)).toThrow()
  })

  test('rejects non-object input', () => {
    // Goes straight to the schema: a bare string is not a manifest object.
    expect(() => ArenaToml.parse('not valid')).toThrow()
  })

  test('rejects missing arena section', () => {
    expect(() => parseArenaToml('[[side]]\nname = "a"')).toThrow()
  })

  test('rejects runs_per_side = 0', () => {
    const zeroRuns = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 0\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
    expect(() => parseArenaToml(zeroRuns)).toThrow()
  })

  test('parses integer and boolean values correctly', () => {
    const source = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 2\nmax_participants = 5\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
    const { arena } = parseArenaToml(source)
    expect(arena.runs_per_side).toBe(2)
    expect(arena.max_participants).toBe(5)
  })

  test('comments are stripped', () => {
    const source = `[arena]\n# this is a comment\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
    const { arena } = parseArenaToml(source)
    expect(arena.task).toBe('x')
  })
})
117
+
118
+ // ── Execution Plan ─────────────────────────────────────────────────────────
119
+
120
// Plan expansion (sides × runs) and the dry-run text rendered from it.
describe('buildExecutionPlan', () => {
  test('generates plan: 2 sides × 1 run = 2 cells', () => {
    const plan = buildExecutionPlan(parseArenaToml(minimalToml))
    expect(plan.task).toBe('Test task')
    expect(plan.criteria).toEqual(['a', 'b'])
    expect(plan.cells).toHaveLength(2)
    expect(plan.total_runs).toBe(2)

    const [cellA, cellB] = plan.cells
    expect(cellA).toEqual({ side: 'runner-a', player: 'claude-code', deck: './decks/a.toml', run: 1, control: false })
    expect(cellB).toEqual({ side: 'runner-b', player: 'claude-code', deck: './decks/b.toml', run: 1, control: false })
  })

  test('generates plan: 3 sides × 3 runs = 9 cells', () => {
    const plan = buildExecutionPlan(parseArenaToml(fullToml))
    expect(plan.cells).toHaveLength(9)
    expect(plan.total_runs).toBe(9)

    // Cells are ordered: side 0 run 1, side 0 run 2, side 0 run 3, side 1 run 1, ...
    const minimalSide = { side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', control: false }
    expect(plan.cells[0]).toEqual({ ...minimalSide, run: 1 })
    expect(plan.cells[1]).toEqual({ ...minimalSide, run: 2 })
    expect(plan.cells[2]).toEqual({ ...minimalSide, run: 3 })
    expect(plan.cells[3]).toEqual({ side: 'rich', player: 'expert-architect', deck: './decks/rich.toml', run: 1, control: false })
    expect(plan.cells[8]).toEqual({ side: 'baseline', player: 'standard-coder', deck: './decks/baseline.toml', run: 3, control: true })
  })

  test('control flag preserved in plan cells', () => {
    const plan = buildExecutionPlan(parseArenaToml(fullToml))
    const baselineCells = plan.cells.filter(cell => cell.side === 'baseline')
    expect(baselineCells).toHaveLength(3)
    expect(baselineCells.every(cell => cell.control)).toBe(true)
  })

  test('dry-run output format matches expected log', () => {
    const plan = buildExecutionPlan(parseArenaToml(minimalToml))

    // Collect what --dry-run would log
    const logs: string[] = [...formatPlanOutput(plan)]

    expect(logs.some(entry => entry.includes('2 cells'))).toBe(true)
    expect(logs.some(entry => entry.includes('runner-a'))).toBe(true)
    expect(logs.some(entry => entry.includes('runner-b'))).toBe(true)
    expect(logs.some(entry => entry.includes('claude-code'))).toBe(true)
    expect(logs.every(entry => !entry.includes('control'))).toBe(true) // no control flags in minimal
  })

  test('dry-run output shows control flag for control sides', () => {
    const plan = buildExecutionPlan(parseArenaToml(fullToml))
    const baselineLines = formatPlanOutput(plan).filter(entry => entry.includes('baseline'))
    // Every baseline cell line must carry the [control] tag
    expect(baselineLines.every(entry => entry.includes('[control]'))).toBe(true)
  })

  test('dry-run: plan is pure data, no side effects', () => {
    // Plan generation is a pure function; dry-run only prints its result.
    const plan = buildExecutionPlan(parseArenaToml(fullToml))
    // The plan must be self-describing enough to render a --dry-run listing
    expect(plan.total_runs).toBeGreaterThan(0)
    expect(plan.cells.every(cell => typeof cell.side === 'string')).toBe(true)
    expect(plan.cells.every(cell => typeof cell.player === 'string')).toBe(true)
    expect(plan.cells.every(cell => typeof cell.deck === 'string')).toBe(true)
    expect(plan.cells.every(cell => typeof cell.run === 'number')).toBe(true)
  })
})
@@ -0,0 +1,172 @@
1
+ import { z } from 'zod'
2
+ import type { ArenaManifest } from '@lythos/test-utils/schema'
3
+
4
+ // ── arena.toml Zod schema (declarative input, k8s-manifest style) ──────────
5
+ // Anchored on: ADR-20260502110308316
6
+
7
/**
 * Optional per-side environment table (`[side.env]`).
 * `pre_run` and `env_vars` default to empty; `container` and `working_dir` may be omitted.
 */
export const SideEnv = z.object({
  /** Container reference, e.g. "node:20-alpine". */
  container: z.string().optional(),
  /** Ordered list of strings; presumably commands run before the side executes — confirm against the runner. */
  pre_run: z.array(z.string()).default([]),
  working_dir: z.string().optional(),
  /** String-to-string map; defaults to `{}`. */
  env_vars: z.record(z.string()).default({}),
})
export type SideEnv = z.infer<typeof SideEnv>

/** One `[[side]]` entry: a player paired with a deck. */
export const Side = z.object({
  name: z.string(),
  /** Reference to a player config (resolved for useAgent). */
  player: z.string(),
  /** Path to a deck.toml. */
  deck: z.string(),
  /** Marks the side as the control; defaults to false. */
  control: z.boolean().default(false),
  env: SideEnv.default({}),
})
export type Side = z.infer<typeof Side>

/** Whole arena.toml document: one `[arena]` table plus 2–5 `[[side]]` entries. */
export const ArenaToml = z.object({
  arena: z.object({
    /** Task description, or a path to TASK-arena.md. */
    task: z.string(),
    /** At least one evaluation criterion is required. */
    criteria: z.array(z.string()).min(1),
    /** Positive integer; defaults to 1. */
    runs_per_side: z.number().int().positive().default(1),
    /** Integer in [2, 5]; defaults to 5. */
    max_participants: z.number().int().min(2).max(5).default(5),
  }),
  side: z.array(Side).min(2).max(5),
})
export type ArenaToml = z.infer<typeof ArenaToml>
34
+
35
+ // ── Parser ─────────────────────────────────────────────────────────────────
36
+
37
/**
 * Parse arena.toml text and validate it against the `ArenaToml` schema.
 *
 * @param content Raw text of an arena.toml file.
 * @returns The validated manifest with schema defaults applied.
 * @throws When the parsed document does not satisfy the `ArenaToml` schema.
 */
export function parseArenaToml(content: string): ArenaToml {
  // The in-file parser covers the arena.toml subset, so no external TOML dependency is needed.
  return ArenaToml.parse(parseToml(content))
}
42
+
43
+ // ── Plan generation (pure function, dry-run visible) ───────────────────────
44
+
45
/** A single (side, run) pair in the execution plan. */
export interface ExecutionCell {
  /** Side name. */
  side: string
  /** Player reference, copied from the side. */
  player: string
  /** Deck path, copied from the side. */
  deck: string
  /** Run number, starting at 1. */
  run: number
  control: boolean
}

/** Flat plan derived from an arena manifest. */
export interface ExecutionPlan {
  task: string
  criteria: string[]
  cells: ExecutionCell[]
  /** Always equal to `cells.length`. */
  total_runs: number
}

/**
 * Expand a manifest into one cell per side per run.
 *
 * Cells are grouped by side in declaration order, with runs ascending inside
 * each group. Pure function: performs no IO.
 *
 * @param toml Validated arena manifest.
 * @returns The plan, including task and criteria copied from `toml.arena`.
 */
export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
  const { arena } = toml
  const cells = toml.side.flatMap(({ name, player, deck, control }) => {
    const perSide: ExecutionCell[] = []
    for (let run = 1; run <= arena.runs_per_side; run++) {
      perSide.push({ side: name, player, deck, run, control })
    }
    return perSide
  })
  return { task: arena.task, criteria: arena.criteria, cells, total_runs: cells.length }
}
80
+
81
+ // ── Minimal TOML parser (handles the arena.toml subset without external dep) ──
82
+
83
/**
 * Remove a trailing `# comment` from one line, ignoring `#` characters that
 * sit inside a quoted string.
 */
function stripTomlComment(rawLine: string): string {
  let quote = ''
  for (let i = 0; i < rawLine.length; i++) {
    const ch = rawLine.charAt(i)
    if (quote) {
      if (ch === quote) quote = ''
    } else if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === '#') {
      return rawLine.slice(0, i)
    }
  }
  // An unterminated quote means the quote character was not a string
  // delimiter (e.g. a bare apostrophe), so cut at the first '#'.
  return quote ? (rawLine.split('#')[0] ?? '') : rawLine
}

/** True when `token` is wrapped in a matching pair of single or double quotes. */
function isQuotedToml(token: string): boolean {
  if (token.length < 2) return false
  const first = token.charAt(0)
  return (first === '"' || first === "'") && token.endsWith(first)
}

/** Split the inside of `[...]` on commas that are not inside a quoted string. */
function splitTomlArrayItems(inner: string): string[] {
  const items: string[] = []
  let quote = ''
  let start = 0
  for (let i = 0; i < inner.length; i++) {
    const ch = inner.charAt(i)
    if (quote) {
      if (ch === quote) quote = ''
    } else if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === ',') {
      items.push(inner.slice(start, i))
      start = i + 1
    }
  }
  // Unbalanced quotes: fall back to a plain comma split.
  if (quote) return inner.split(',')
  items.push(inner.slice(start))
  return items
}

/**
 * Parse the inside of an inline array. Quoted items are unquoted; unquoted
 * items are kept as trimmed strings (every array in arena.toml is a string
 * array). A single trailing comma is tolerated.
 */
function parseTomlArray(inner: string): string[] {
  const body = inner.trim()
  if (!body) return []
  const tokens = splitTomlArrayItems(body).map(token => token.trim())
  if (tokens.length > 1 && tokens[tokens.length - 1] === '') tokens.pop()
  return tokens.map(token => (isQuotedToml(token) ? token.slice(1, -1) : token))
}

/**
 * Convert the right-hand side of `key = value` into a JS value.
 * Quoted text always stays a string — it is never re-read as a boolean,
 * number or array. Escape sequences are not processed.
 */
function parseTomlValue(raw: string): unknown {
  if (isQuotedToml(raw)) return raw.slice(1, -1)
  if (raw.startsWith('[') && raw.endsWith(']')) return parseTomlArray(raw.slice(1, -1))
  if (raw === 'true') return true
  if (raw === 'false') return false
  if (/^-?\d+(\.\d+)?$/.test(raw)) return Number(raw)
  return raw
}

/** Return `parent[key]` when it is already a plain table, otherwise create one there. */
function ensureTomlTable(parent: Record<string, unknown>, key: string): Record<string, unknown> {
  const existing = parent[key]
  if (typeof existing === 'object' && existing !== null && !Array.isArray(existing)) {
    return existing as Record<string, unknown>
  }
  const created: Record<string, unknown> = {}
  parent[key] = created
  return created
}

/**
 * Open the table named by a `[section]` header and return it.
 * - `[name]` starts a fresh top-level table.
 * - `[parent.child]` attaches a fresh table to the most recent `[[parent]]`
 *   entry when one exists, otherwise to the plain table `parent` (created on
 *   demand), so following keys never leak into the previously open table.
 */
function openTomlTable(
  root: Record<string, unknown>,
  arrayTables: Map<string, Record<string, unknown>[]>,
  path: string,
): Record<string, unknown> {
  const segments = path.split('.').map(segment => segment.trim())
  const head = segments[0] ?? ''
  const leafKey = segments[segments.length - 1] ?? head
  const leaf: Record<string, unknown> = {}

  if (segments.length === 1) {
    root[head] = leaf
    return leaf
  }

  const parentEntries = arrayTables.get(head)
  const lastEntry = parentEntries && parentEntries.length > 0 ? parentEntries[parentEntries.length - 1] : undefined
  let node = lastEntry ?? ensureTomlTable(root, head)
  for (const segment of segments.slice(1, -1)) {
    node = ensureTomlTable(node, segment)
  }
  node[leafKey] = leaf
  return leaf
}

/**
 * Minimal line-based TOML reader for the arena.toml subset.
 *
 * Supported: `#` comments, `[table]`, `[[array-of-tables]]`, dotted
 * `[parent.child]` headers, and single-line `key = value` pairs whose value is
 * a quoted string, boolean, number, or inline array. Not supported:
 * multi-line values, inline tables, escape sequences, dotted keys.
 *
 * @param text Raw TOML text.
 * @returns A plain object tree; each `[[name]]` group becomes an array under `name`.
 */
function parseToml(text: string): Record<string, unknown> {
  const result: Record<string, unknown> = {}
  let currentTable: Record<string, unknown> = result
  const arrayTables = new Map<string, Record<string, unknown>[]>()

  for (const rawLine of text.split('\n')) {
    const line = stripTomlComment(rawLine).trim()
    if (!line) continue

    // [[array]] — each occurrence appends a new table to the named array
    const arrayMatch = line.match(/^\[\[(.+?)\]\]$/)
    if (arrayMatch) {
      const key = arrayMatch[1] ?? ''
      let entries = arrayTables.get(key)
      if (!entries) {
        entries = []
        arrayTables.set(key, entries)
      }
      currentTable = {}
      entries.push(currentTable)
      continue
    }

    // [section] or [parent.child]
    const sectionMatch = line.match(/^\[(.+?)\]$/)
    if (sectionMatch) {
      currentTable = openTomlTable(result, arrayTables, sectionMatch[1] ?? '')
      continue
    }

    // key = value (lines without '=' are ignored)
    const eqIdx = line.indexOf('=')
    if (eqIdx !== -1) {
      const key = line.slice(0, eqIdx).trim()
      currentTable[key] = parseTomlValue(line.slice(eqIdx + 1).trim())
    }
  }

  // Materialize array tables into result
  for (const [key, entries] of arrayTables) {
    result[key] = entries
  }

  return result
}
package/src/cli.ts CHANGED
@@ -35,7 +35,7 @@ Usage:
35
35
  lythoskill-arena viz <arena-dir>
36
36
 
37
37
  Commands:
38
- run Run arena programmatically (cartesian player × deck → judge → report)
38
+ run Run arena programmatically (declarative arena.toml or CLI flags)
39
39
  scaffold Create arena directory structure (legacy, manual subagent execution)
40
40
  viz Visualize arena report (ASCII charts)
41
41
 
@@ -44,14 +44,23 @@ Options:
44
44
  -s, --skills <list> Comma-separated skill names (scaffold only)
45
45
  --decks <list> Comma-separated deck paths
46
46
  -c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
47
- --players <list> Comma-separated player.toml paths (run only)
47
+ --players <list> Comma-separated player.toml paths (CLI run only)
48
+ --config <path> Path to arena.toml (declarative mode, k8s-style)
49
+ --dry-run Print execution plan without running (with --config)
48
50
  --control <skill> Control skill for comparison (scaffold only)
49
51
  --out <dir> Output directory (run: defaults to runs/arena-<id>)
50
52
  -d, --dir <dir> Output directory (scaffold: defaults to tmp)
51
53
  -p, --project <dir> Project directory (default: .)
52
54
 
53
55
  Examples:
54
- lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml,./players/kimi.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
56
+ # Declarative mode (k8s-style)
57
+ lythoskill-arena run --config ./arena.toml
58
+ lythoskill-arena run --config ./arena.toml --dry-run
59
+
60
+ # CLI-flag mode (backward compat)
61
+ lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
62
+
63
+ # Legacy scaffolding
55
64
  lythoskill-arena scaffold --task "Refactor auth module" --skills skill-a,skill-b
56
65
  lythoskill-arena viz runs/arena-20260504
57
66
  `)
@@ -71,6 +80,9 @@ function parseArgs(argv: string[]) {
71
80
  control: 'lythoskill-project-scribe',
72
81
  dir: 'tmp',
73
82
  project: '.',
83
+ config: undefined,
84
+ out: undefined,
85
+ players: undefined,
74
86
  }
75
87
  const positionals: string[] = []
76
88
 
@@ -90,6 +102,12 @@ function parseArgs(argv: string[]) {
90
102
  options.dir = argv[++i]
91
103
  } else if (arg === '--project' || arg === '-p') {
92
104
  options.project = argv[++i]
105
+ } else if (arg === '--config') {
106
+ options.config = argv[++i]
107
+ } else if (arg === '--out') {
108
+ options.out = argv[++i]
109
+ } else if (arg === '--players') {
110
+ options.players = argv[++i]
93
111
  } else if (!arg.startsWith('-')) {
94
112
  positionals.push(arg)
95
113
  }
@@ -563,9 +581,45 @@ function runViz(argv: string[]) {
563
581
 
564
582
  async function runProgrammaticArena(argv: string[]) {
565
583
  const { options } = parseArgs(argv)
584
+ const { readFileSync } = await import('node:fs')
585
+
586
+ const hasConfig = !!(options as Record<string, string | undefined>).config
587
+ const dryRun = argv.includes('--dry-run')
588
+
589
+ if (hasConfig) {
590
+ // arena.toml declarative mode
591
+ const { parseArenaToml } = await import('./arena-toml')
592
+ const { runArenaFromToml } = await import('./runner')
593
+ const configPath = (options as Record<string, string | undefined>).config!
594
+
595
+ const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
596
+ const result = await runArenaFromToml({
597
+ toml,
598
+ taskPath: toml.arena.task.startsWith('/') || toml.arena.task.startsWith('./')
599
+ ? toml.arena.task
600
+ : (options as Record<string, string | undefined>).task ?? toml.arena.task,
601
+ outDir: (options as Record<string, string | undefined>).out,
602
+ dryRun,
603
+ })
604
+
605
+ if ('plan' in result) {
606
+ // dry-run
607
+ console.log(`\n📋 Dry-run: ${result.plan.total_runs} cells across ${result.plan.cells.length / Math.max(1, toml.arena.runs_per_side)} sides × ${toml.arena.runs_per_side} runs`)
608
+ for (const cell of result.plan.cells) {
609
+ console.log(` ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
610
+ }
611
+ return
612
+ }
613
+
614
+ console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
615
+ console.log(`📁 Artifacts: ${result.artifactsDir}`)
616
+ console.log(`📊 Report: ${result.artifactsDir}/report.md`)
617
+ return
618
+ }
566
619
 
620
+ // CLI-flag mode (backward compat)
567
621
  if (!options.task || !options.decks) {
568
- console.error('❌ --task <path> and --decks <list> are required for "run"')
622
+ console.error('❌ --task <path> and --decks <list> are required for "run" (or use --config <arena.toml>)')
569
623
  process.exit(1)
570
624
  }
571
625
 
@@ -577,7 +631,6 @@ async function runProgrammaticArena(argv: string[]) {
577
631
  deckPaths: options.decks.split(',').map(s => s.trim()).filter(Boolean),
578
632
  criteria: (options.criteria ?? 'syntax,context,logic,token').split(',').map(s => s.trim()).filter(Boolean),
579
633
  outDir: options.out ?? `runs/arena-${timestamp()}`,
580
- projectDir: options.project,
581
634
  })
582
635
 
583
636
  console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
@@ -0,0 +1,95 @@
1
+ import { describe, test, expect } from 'bun:test'
2
+ import { resolvePlayer, resolveSides, groupBySide, totalRuns } from './player'
3
+ import { parseArenaToml } from './arena-toml'
4
+
5
// Shared fixture: two sides at three runs each. "claude-code" is a built-in
// player alias; "expert-architect" is not and should pass through unresolved.
const toml = parseArenaToml(`
[arena]
task = "Test task"
criteria = ["a", "b"]
runs_per_side = 3

[[side]]
name = "minimal"
player = "claude-code"
deck = "./decks/minimal.toml"

[[side]]
name = "rich"
player = "expert-architect"
deck = "./decks/rich.toml"
`)
21
+
22
// Alias lookup: built-in names map to a platform, anything else is returned normalized.
describe('resolvePlayer', () => {
  test('maps claude-code → claude', () => {
    const platform = resolvePlayer('claude-code')
    expect(platform).toBe('claude')
  })

  test('maps Claude → claude (case insensitive)', () => {
    const platform = resolvePlayer('Claude')
    expect(platform).toBe('claude')
  })

  test('maps kimi → kimi', () => {
    const platform = resolvePlayer('kimi')
    expect(platform).toBe('kimi')
  })

  test('passes through unknown player names', () => {
    const platform = resolvePlayer('expert-architect')
    expect(platform).toBe('expert-architect')
  })

  test('trims whitespace', () => {
    const platform = resolvePlayer(' claude-code ')
    expect(platform).toBe('claude')
  })
})
43
+
44
// Each side keeps its original config and gains a resolved platform.
describe('resolveSides', () => {
  test('resolves all sides in arena.toml', () => {
    const resolved = resolveSides(toml)
    expect(resolved).toHaveLength(2)

    const [first, second] = resolved
    expect(first.platform).toBe('claude')
    expect(second.platform).toBe('expert-architect')
    expect(first.playerName).toBe('claude-code')
  })

  test('preserves side config', () => {
    const [first] = resolveSides(toml)
    expect(first.side.name).toBe('minimal')
    expect(first.side.deck).toBe('./decks/minimal.toml')
  })
})
59
+
60
// One group per side, each carrying the arena-wide run count.
describe('groupBySide', () => {
  test('groups by side name with run count', () => {
    const groups = groupBySide(toml)
    expect(groups).toHaveLength(2)

    const [first, second] = groups
    expect(first.runs).toBe(3) // runs_per_side
    expect(second.runs).toBe(3)
    expect(first.platform).toBe('claude')
  })

  test('control flag preserved', () => {
    const manifestWithControl = parseArenaToml(`
[arena]
task = "x"
criteria = ["a"]

[[side]]
name = "test"
player = "claude-code"
deck = "a.toml"

[[side]]
name = "baseline"
player = "claude-code"
deck = "b.toml"
control = true
`)
    const [, baseline] = groupBySide(manifestWithControl)
    expect(baseline.control).toBe(true)
  })
})
90
+
91
describe('totalRuns', () => {
  test('calculates sides × runs_per_side', () => {
    // Fixture has 2 sides × 3 runs
    const count = totalRuns(toml)
    expect(count).toBe(6)
  })
})
package/src/player.ts ADDED
@@ -0,0 +1,71 @@
1
+ import type { Side, ArenaToml } from './arena-toml'
2
+
3
+ // ── Player reference resolution (pure function) ────────────────────────────
4
+ // Maps arena.toml player names → platform identifiers.
5
+ // AgentAdapter creation is the IO layer's job (T4), not ours.
6
+
7
/** A manifest side paired with the platform its player reference resolved to. */
export interface ResolvedSide {
  /** The side exactly as declared in arena.toml. */
  side: Side
  /** Resolved platform identifier, intended for useAgent(). */
  platform: string
  /** Player reference as written in the manifest (not normalized). */
  playerName: string
}
12
+
13
/**
 * Built-in player registry: lower-case player alias → useAgent platform.
 * A Map is used so that lookups can never hit inherited object keys such as
 * "constructor" or "__proto__".
 */
const BUILTIN_PLAYERS: ReadonlyMap<string, string> = new Map([
  ['claude', 'claude'],
  ['claude-code', 'claude'],
  ['kimi', 'kimi'],
  ['cursor', 'cursor'],
  ['gemini', 'gemini'],
])

/**
 * Resolve a player reference to its platform identifier.
 * - The name is lower-cased and trimmed before lookup.
 * - Built-in names (claude, claude-code, kimi, cursor, gemini) map to their platform.
 * - Unknown names are returned in that normalized form (assumed to be useAgent-compatible).
 * - Future: custom player.toml files will override built-in mappings.
 *
 * @param name Player reference as written in arena.toml.
 * @returns The platform identifier; always a string.
 */
export function resolvePlayer(name: string): string {
  const normalized = name.toLowerCase().trim()
  return BUILTIN_PLAYERS.get(normalized) ?? normalized
}
32
+
33
/**
 * Pair every side of the manifest with its resolved platform.
 * Pure function — performs no IO and creates no agents.
 *
 * @param toml Validated arena manifest.
 * @returns One entry per side, in declaration order.
 */
export function resolveSides(toml: ArenaToml): ResolvedSide[] {
  const resolved: ResolvedSide[] = []
  for (const side of toml.side) {
    const playerName = side.player
    resolved.push({ side, platform: resolvePlayer(playerName), playerName })
  }
  return resolved
}
44
+
45
+ // ── Side grouping (for per-side aggregation in T3) ─────────────────────────
46
+
47
/** Flattened per-side summary used for per-side aggregation. */
export interface SideGroup {
  sideName: string
  /** Player reference as written in the manifest. */
  player: string
  deck: string
  control: boolean
  /** Number of runs for this side (the arena-wide runs_per_side). */
  runs: number
  /** Platform the player reference resolved to. */
  platform: string
}

/**
 * Build one `SideGroup` per manifest side, in declaration order.
 * Every group receives the same `runs` value, taken from `arena.runs_per_side`.
 */
export function groupBySide(toml: ArenaToml): SideGroup[] {
  return resolveSides(toml).map(({ side, playerName, platform }) => {
    const group: SideGroup = {
      sideName: side.name,
      player: playerName,
      deck: side.deck,
      control: side.control,
      runs: toml.arena.runs_per_side,
      platform,
    }
    return group
  })
}
67
+
68
/**
 * Total number of runs the manifest describes.
 *
 * @returns Number of sides multiplied by `arena.runs_per_side`.
 */
export function totalRuns(toml: ArenaToml): number {
  const { side, arena } = toml
  return side.length * arena.runs_per_side
}
package/src/runner.ts CHANGED
@@ -1,9 +1,14 @@
1
- import { mkdirSync, writeFileSync, existsSync, readFileSync } from 'node:fs'
1
+ import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
2
2
  import { join, resolve } from 'node:path'
3
3
  import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
4
4
  import { useAgent } from '@lythos/test-utils/agents'
5
- import { ArenaManifest, Player, type ArenaManifest as ArenaManifestType } from '@lythos/test-utils/schema'
5
+ import { ArenaManifest, Player } from '@lythos/test-utils/schema'
6
+ import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
6
7
  import { runComparativeJudge } from './comparative-judge'
8
+ import { parseArenaToml, buildExecutionPlan, type ArenaToml, type ExecutionPlan } from './arena-toml'
9
+ import { resolvePlayer, resolveSides } from './player'
10
+ import { aggregateAllStats } from './stats'
11
+ import type { SideStats } from './stats'
7
12
 
8
13
  // ── Helpers ───────────────────────────────────────────────────────────────
9
14
 
@@ -12,150 +17,226 @@ function stamp(): string {
12
17
  return `${d.getFullYear()}${String(d.getMonth() + 1).padStart(2, '0')}${String(d.getDate()).padStart(2, '0')}-${String(d.getHours()).padStart(2, '0')}${String(d.getMinutes()).padStart(2, '0')}${String(d.getSeconds()).padStart(2, '0')}`
13
18
  }
14
19
 
15
- function cartesian<T>(arrays: T[][]): T[][] {
16
- if (arrays.length === 0) return [[]]
17
- const [first, ...rest] = arrays
18
- const restProd = cartesian(rest)
19
- return first.flatMap(a => restProd.map(r => [a, ...r]))
20
- }
20
+ // ── Declarative runner (arena.toml → execute) ─────────────────────────────
21
21
 
22
- function slugify(input: string): string {
23
- return input.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-+|-+$/g, '').slice(0, 40)
22
/** Everything a completed (non-dry-run) arena execution hands back to the caller. */
export interface ArenaResult {
  /** Arena manifest as re-parsed after the run, with status `completed`. */
  manifest: ArenaManifestType
  /** Output of the comparative judge; its shape is not constrained here. */
  report: unknown
  /** Aggregated statistics, one entry per side that produced verdicts. */
  stats: SideStats[]
  /** Directory the manifest and report were written to. */
  artifactsDir: string
}
25
28
 
26
- // ── Runner ────────────────────────────────────────────────────────────────
29
/**
 * Render an execution plan as the lines shown for a dry run (pure: no I/O).
 *
 * The first line is a summary (total cells, number of distinct sides, and
 * cells per side); it is followed by one line per cell in plan order.
 */
export function formatPlanOutput(plan: ExecutionPlan): string[] {
  const { cells } = plan
  const sideCount = new Set(cells.map(cell => cell.side)).size
  // Guard the divisor so an empty plan reports 0 runs instead of NaN.
  const runsPerSide = cells.length / Math.max(1, sideCount)

  const summary = `\n📋 Dry-run: ${plan.total_runs} cells across ${sideCount} sides × ${runsPerSide} runs`
  const rows = cells.map(cell => {
    const controlTag = cell.control ? ' [control]' : ''
    return ` ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${controlTag}`
  })

  return [summary, ...rows]
}
27
39
 
28
- export async function runArena(opts: {
40
+ export async function runArenaFromToml(opts: {
41
+ toml: ArenaToml
29
42
  taskPath: string
30
- playerPaths: string[]
31
- deckPaths: string[]
32
- criteria: string[]
33
- outDir: string
34
- projectDir?: string
35
- }): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
36
- const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts
37
-
38
- // Load players
39
- const players = playerPaths.map(p => {
40
- const content = readFileSync(resolve(p), 'utf-8')
41
- const parsed = Player.parse(JSON.parse(content))
42
- return { path: p, ...parsed }
43
- })
44
-
45
- // Load deck labels from deck paths
46
- const decks = deckPaths.map(p => ({ path: resolve(p) }))
47
-
48
- // Build (player × deck) variant matrix
49
- const variants = cartesian([players, decks]).map(([player, deck], i) => ({
50
- participant_id: `run-${String(i + 1).padStart(2, '0')}`,
51
- player,
52
- deck_path: deck.path,
53
- }))
43
+ outDir?: string
44
+ dryRun?: boolean
45
+ log?: (msg: string) => void
46
+ }): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
47
+ const { toml, taskPath, outDir, dryRun, log } = opts
48
+
49
+ const plan = buildExecutionPlan(toml)
50
+
51
+ // dry-run: return plan without executing
52
+ if (dryRun) {
53
+ for (const line of formatPlanOutput(plan)) {
54
+ log?.(line)
55
+ }
56
+ return { plan }
57
+ }
54
58
 
55
- // Build arena manifest
56
59
  const arenaId = `arena-${stamp()}`
57
60
  const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
61
+ const resolved = resolveSides(toml)
58
62
 
63
+ // Build manifest
59
64
  const manifest = ArenaManifest.parse({
60
65
  id: arenaId,
61
66
  created_at: new Date().toISOString(),
62
67
  task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
63
68
  mode: 'decks',
64
- participants: variants.map(v => ({
65
- id: v.participant_id,
66
- name: v.player.path.split('/').pop()?.replace('.toml', '') ?? v.player.platform,
67
- player: v.player.platform,
68
- deck: v.deck_path,
69
- description: `${v.player.platform} × ${v.deck_path.split('/').pop()?.replace('.toml', '')}`,
69
+ participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
70
+ id: r.side.name,
71
+ name: r.side.name,
72
+ player: r.platform,
73
+ deck: r.side.deck,
74
+ description: `${r.playerName} × ${r.side.deck}`,
70
75
  })),
71
- criteria,
76
+ criteria: toml.arena.criteria,
72
77
  status: 'running',
73
78
  })
74
79
 
75
80
  mkdirSync(artifactsDir, { recursive: true })
76
81
  writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(manifest, null, 2) + '\n')
77
82
 
78
- // Run each variant
79
- const verdicts: { participantId: string; verdict: unknown }[] = []
83
+ // Execute plan: per-cell agent run
84
+ const verdictsBySide = new Map<string, JudgeVerdict[]>()
80
85
 
81
- for (const variant of variants) {
82
- const cellDir = join(artifactsDir, 'runs', variant.participant_id)
86
+ for (const cell of plan.cells) {
87
+ const cellDir = join(artifactsDir, 'runs', cell.side, `run-${cell.run}`)
83
88
  mkdirSync(cellDir, { recursive: true })
84
89
 
85
90
  try {
91
+ const agent = useAgent(resolvePlayer(cell.player))
86
92
  const result = await runAgentScenario({
87
93
  scenarioPath: resolve(taskPath),
88
- agent: useAgent(variant.player.platform),
94
+ agent,
89
95
  setupWorkdir(_scenario: AgentScenario, workdir: string) {
90
96
  mkdirSync(workdir, { recursive: true })
91
- // Write deck.toml as skill-deck.toml
92
- const deckContent = readFileSync(variant.deck_path, 'utf-8')
97
+ const deckContent = readFileSync(resolve(cell.deck), 'utf-8')
93
98
  writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
94
99
  },
95
- baseDir: artifactsDir,
100
+ baseDir: join(artifactsDir, 'runs', cell.side),
96
101
  })
97
102
 
98
- verdicts.push({
99
- participantId: variant.participant_id,
100
- verdict: result.verdict,
101
- })
103
+ const v = (result.verdict ?? {
104
+ verdict: 'ERROR' as const,
105
+ reason: 'No verdict returned',
106
+ criteria: [],
107
+ }) as JudgeVerdict
108
+
109
+ if (!verdictsBySide.has(cell.side)) verdictsBySide.set(cell.side, [])
110
+ verdictsBySide.get(cell.side)!.push(v)
102
111
  } catch (e) {
103
- verdicts.push({
104
- participantId: variant.participant_id,
105
- verdict: {
106
- verdict: 'ERROR' as const,
107
- reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
108
- },
112
+ if (!verdictsBySide.has(cell.side)) verdictsBySide.set(cell.side, [])
113
+ verdictsBySide.get(cell.side)!.push({
114
+ verdict: 'ERROR' as const,
115
+ reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
116
+ criteria: [],
109
117
  })
110
118
  }
111
119
  }
112
120
 
113
- // Run comparative judge
114
- const judge = useAgent(players[0]?.platform ?? 'claude')
121
+ // Aggregate stats
122
+ const stats = aggregateAllStats(verdictsBySide)
123
+
124
+ // Comparative judge
125
+ const flatVerdicts: { participantId: string; verdict: unknown }[] = []
126
+ for (const [side, verdicts] of verdictsBySide) {
127
+ // Use the first run's verdict for comparative judge (or aggregate into one)
128
+ if (verdicts.length > 0) {
129
+ flatVerdicts.push({ participantId: side, verdict: verdicts[0] })
130
+ }
131
+ }
132
+
133
+ const judge = useAgent(resolved[0]?.platform ?? 'claude')
115
134
  const report = await runComparativeJudge({
116
135
  manifest,
117
- verdicts,
136
+ verdicts: flatVerdicts,
118
137
  judge,
119
138
  workdir: artifactsDir,
120
139
  })
121
140
 
122
141
  // Write report
123
- writeFileSync(join(artifactsDir, 'report.md'), `# Arena Report: ${manifest.id}
142
+ writeReport(artifactsDir, manifest, report, stats)
124
143
 
125
- **Task**: ${manifest.task}
126
- **Criteria**: ${manifest.criteria.join(', ')}
127
- **Date**: ${new Date().toISOString()}
144
+ // Update manifest
145
+ const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
146
+ writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(finalManifest, null, 2) + '\n')
128
147
 
129
- ## Score Matrix
130
- ${renderScoreMatrix(report)}
148
+ return { manifest: finalManifest, report, stats, artifactsDir }
149
+ }
131
150
 
132
- ## Pareto Frontier
133
- ${renderPareto(report)}
151
+ // ── Backward compat: CLI-flag style runner ─────────────────────────────────
134
152
 
135
- ## Key Findings
136
- ${(report.key_findings ?? []).map((f: string) => `- ${f}`).join('\n')}
153
+ export async function runArena(opts: {
154
+ taskPath: string
155
+ playerPaths: string[]
156
+ deckPaths: string[]
157
+ criteria: string[]
158
+ outDir: string
159
+ }): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
160
+ const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts
137
161
 
138
- ## Recommendations
139
- ${(report.recommendations ?? []).map((r: { audience: string; recommendation: string }) => `- **${r.audience}**: ${r.recommendation}`).join('\n')}
140
- `)
162
+ // Convert CLI flags to ArenaToml internally
163
+ const toml: ArenaToml = {
164
+ arena: {
165
+ task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
166
+ criteria,
167
+ runs_per_side: 1,
168
+ max_participants: Math.min(playerPaths.length, deckPaths.length),
169
+ },
170
+ side: playerPaths.flatMap((playerPath, pi) =>
171
+ deckPaths.map((deckPath, di) => ({
172
+ name: `run-${String(pi * deckPaths.length + di + 1).padStart(2, '0')}`,
173
+ player: Player.parse(JSON.parse(readFileSync(resolve(playerPath), 'utf-8'))).platform,
174
+ deck: deckPath,
175
+ }))
176
+ ),
177
+ }
141
178
 
142
- // Update manifest status
143
- const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
144
- writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(finalManifest, null, 2) + '\n')
179
+ const result = await runArenaFromToml({ toml, taskPath, outDir })
180
+ const { manifest, report, artifactsDir } = result as ArenaResult
181
+ return { manifest, report, artifactsDir }
182
+ }
145
183
 
146
- return { manifest: finalManifest, report, artifactsDir }
184
+ // ── Report renderer ────────────────────────────────────────────────────────
185
+
186
/**
 * Write `report.md` into `dir`.
 *
 * The document consists of a header (arena id, task, criteria, current
 * timestamp) followed by five sections in fixed order: score matrix, per-side
 * statistics, Pareto frontier, key findings and recommendations. Missing
 * `key_findings` / `recommendations` produce an empty section body.
 */
function writeReport(dir: string, manifest: ArenaManifestType, report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]; pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[]; key_findings?: string[]; recommendations?: { audience: string; recommendation: string }[] }, stats: SideStats[]): void {
  const header = [
    `# Arena Report: ${manifest.id}`,
    '',
    `**Task**: ${manifest.task}`,
    `**Criteria**: ${manifest.criteria.join(', ')}`,
    `**Date**: ${new Date().toISOString()}`,
  ].join('\n')

  // Each section is a title, a blank line, then zero or more body lines.
  const sections: [string, string[]][] = [
    ['## Score Matrix', [renderScoreMatrix(report)]],
    ['## Per-Side Statistics', [renderStatsTable(stats)]],
    ['## Pareto Frontier', [renderPareto(report)]],
    ['## Key Findings', (report.key_findings ?? []).map(finding => `- ${finding}`)],
    [
      '## Recommendations',
      (report.recommendations ?? []).map(({ audience, recommendation }) => `- **${audience}**: ${recommendation}`),
    ],
  ]

  // Sections are separated from the header and from each other by one blank line.
  const body = sections
    .map(([title, content]) => [title, '', ...content].join('\n'))
    .join('\n\n')

  writeFileSync(join(dir, 'report.md'), `${header}\n\n${body}\n`)
}
148
217
 
149
- // ── Markdown Renderers ────────────────────────────────────────────────────
218
/**
 * Render per-side statistics as a Markdown table.
 *
 * Pass rate and per-criterion means are shown as whole percentages of a 0–1
 * ratio; mean confidence is printed as-is with a `%` suffix, or `-` when it
 * is null/undefined. Returns a placeholder sentence when `stats` is empty.
 */
function renderStatsTable(stats: SideStats[]): string {
  if (stats.length === 0) return 'No statistics available.\n'

  const asPercent = (ratio: number): string => `${(ratio * 100).toFixed(0)}%`

  const rows = stats.map(side => {
    const confidence = side.meanConfidence != null ? `${side.meanConfidence.toFixed(0)}%` : '-'
    const criteria = side.criteria.map(c => `${c.name}: ${asPercent(c.mean)}`).join(', ')
    return `| ${side.sideName} | ${side.runs} | ${asPercent(side.passRate)} | ${confidence} | ${criteria} |\n`
  })

  return [
    `| Side | Runs | Pass Rate | Mean Confidence | Criteria |\n`,
    `|------|------|-----------|-----------------|----------|\n`,
    ...rows,
  ].join('')
}
150
232
 
151
233
  function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[] }): string {
152
234
  if (!report.score_matrix?.length) return 'No scores available.\n'
153
235
 
154
- // Build participant × criterion matrix
155
236
  const participants = [...new Set(report.score_matrix.map(s => s.participant_id))]
156
237
  const criteria = [...new Set(report.score_matrix.map(s => s.criterion))]
157
238
 
158
- let table = `| Criterion | Weight | ${participants.map(p => `${p}`).join(' | ')} |\n`
239
+ let table = `| Criterion | Weight | ${participants.join(' | ')} |\n`
159
240
  table += `|${'---|'.repeat(2 + participants.length)}\n`
160
241
 
161
242
  for (const c of criteria) {
@@ -165,7 +246,6 @@ function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id:
165
246
  }).join(' | ')} |\n`
166
247
  }
167
248
 
168
- // Weighted totals
169
249
  table += `| **Weighted Total** | 100% | ${participants.map(p => {
170
250
  const pScores = report.score_matrix!.filter(s => s.participant_id === p)
171
251
  const avg = pScores.length ? pScores.reduce((sum, s) => sum + s.score, 0) / pScores.length : 0
@@ -177,11 +257,9 @@ function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id:
177
257
 
178
258
  function renderPareto(report: unknown & { pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[] }): string {
179
259
  if (!report.pareto?.length) return 'No Pareto analysis.\n'
180
-
181
- return report.pareto.map((p: { participant_id: string; dominated: boolean; dominated_by: string[] }) => {
182
- if (p.dominated) {
183
- return `- **${p.participant_id}**: dominated by ${p.dominated_by.join(', ')}`
184
- }
185
- return `- **${p.participant_id}**: Pareto-optimal (non-dominated)`
186
- }).join('\n')
260
+ return report.pareto.map(p =>
261
+ p.dominated
262
+ ? `- **${p.participant_id}**: dominated by ${p.dominated_by.join(', ')}`
263
+ : `- **${p.participant_id}**: Pareto-optimal (non-dominated)`
264
+ ).join('\n')
187
265
  }
@@ -0,0 +1,111 @@
1
+ import { describe, test, expect } from 'bun:test'
2
+ import { aggregateSideStats, aggregateAllStats } from './stats'
3
+ import type { JudgeVerdict } from '@lythos/test-utils/schema'
4
+
5
/**
 * Build a verdict fixture: a passing verdict with a single passed
 * `correctness` criterion, with any `overrides` fields replacing the defaults.
 */
function makeVerdict(overrides?: Partial<JudgeVerdict>): JudgeVerdict {
  const passing: JudgeVerdict = {
    verdict: 'PASS',
    reason: 'OK',
    criteria: [{ name: 'correctness', passed: true }],
  }
  return { ...passing, ...overrides }
}
13
+
14
+ // ── aggregateSideStats ─────────────────────────────────────────────────────
15
+
16
describe('aggregateSideStats', () => {
  test('single run: passRate=1, no variance', () => {
    const stats = aggregateSideStats('test', [makeVerdict()])
    expect(stats.sideName).toBe('test')
    expect(stats.runs).toBe(1)
    expect(stats.passRate).toBe(1)
    expect(stats.failRate).toBe(0)
    expect(stats.errorRate).toBe(0)
    // "No variance": the fixture carries no confidence, and its only
    // criterion passed in the single run.
    expect(stats.confidenceVariance).toBeNull()
    expect(stats.criteria[0].variance).toBe(0)
  })

  test('3 runs: 2 PASS, 1 FAIL', () => {
    const verdicts = [
      makeVerdict(),
      makeVerdict(),
      makeVerdict({ verdict: 'FAIL', reason: 'bad' }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.runs).toBe(3)
    expect(stats.passRate).toBeCloseTo(2 / 3)
    expect(stats.failRate).toBeCloseTo(1 / 3)
    expect(stats.errorRate).toBe(0)
  })

  test('confidence: mean across runs', () => {
    const verdicts = [
      makeVerdict({ confidence: 90 }),
      makeVerdict({ confidence: 80 }),
      makeVerdict({ confidence: 70 }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.meanConfidence).toBeCloseTo(80)
    // Sample variance (n - 1 divisor): (100 + 0 + 100) / 2 = 100
    expect(stats.confidenceVariance).toBeCloseTo(100)
  })

  test('confidence: null when no verdict has it', () => {
    const stats = aggregateSideStats('test', [makeVerdict(), makeVerdict()])
    expect(stats.meanConfidence).toBeNull()
    expect(stats.confidenceVariance).toBeNull()
  })

  test('per-criterion pass rate', () => {
    const verdicts = [
      makeVerdict({ criteria: [{ name: 'accuracy', passed: true }] }),
      makeVerdict({ criteria: [{ name: 'accuracy', passed: false }] }),
      makeVerdict({ criteria: [{ name: 'accuracy', passed: true }] }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.criteria).toHaveLength(1)
    expect(stats.criteria[0].name).toBe('accuracy')
    expect(stats.criteria[0].mean).toBeCloseTo(2 / 3)
    expect(stats.criteria[0].count).toBe(3)
  })

  test('per-criterion scores: mean and variance', () => {
    const verdicts = [
      makeVerdict({ scores: { coverage: 5, relevance: 4 } }),
      makeVerdict({ scores: { coverage: 3, relevance: 4 } }),
      makeVerdict({ scores: { coverage: 4, relevance: 4 } }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.scoreByCriterion.coverage.mean).toBeCloseTo(4)
    // Sample variance of 5, 3, 4 around 4: (1 + 1 + 0) / 2 = 1
    expect(stats.scoreByCriterion.coverage.variance).toBeCloseTo(1)
    expect(stats.scoreByCriterion.relevance.mean).toBeCloseTo(4)
    expect(stats.scoreByCriterion.relevance.variance).toBe(0) // all 4s
  })

  test('zero runs: all zeros', () => {
    const stats = aggregateSideStats('empty', [])
    expect(stats.runs).toBe(0)
    expect(stats.passRate).toBe(0)
    // Every rate must fall back to 0 rather than dividing by zero.
    expect(stats.failRate).toBe(0)
    expect(stats.errorRate).toBe(0)
    expect(stats.meanConfidence).toBeNull()
    expect(stats.confidenceVariance).toBeNull()
    expect(stats.criteria).toHaveLength(0)
    expect(stats.scoreByCriterion).toEqual({})
  })

  test('handles ERROR verdicts correctly', () => {
    const verdicts = [
      makeVerdict(),
      makeVerdict({ verdict: 'ERROR', reason: 'parse failed' }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.passRate).toBe(0.5)
    expect(stats.failRate).toBe(0)
    expect(stats.errorRate).toBe(0.5)
  })
})
95
+
96
+ // ── aggregateAllStats ──────────────────────────────────────────────────────
97
+
98
describe('aggregateAllStats', () => {
  test('aggregates multiple sides', () => {
    // Two passing runs for side-a, a single failing run for side-b.
    const verdictsBySide = new Map<string, JudgeVerdict[]>([
      ['side-a', [makeVerdict(), makeVerdict()]],
      ['side-b', [makeVerdict({ verdict: 'FAIL', reason: 'nope' })]],
    ])

    const all = aggregateAllStats(verdictsBySide)

    expect(all).toHaveLength(2)
    const [sideA, sideB] = all
    expect(sideA.sideName).toBe('side-a')
    expect(sideA.passRate).toBe(1)
    expect(sideB.sideName).toBe('side-b')
    expect(sideB.passRate).toBe(0)
  })
})
package/src/stats.ts ADDED
@@ -0,0 +1,117 @@
1
+ import type { JudgeVerdict } from '@lythos/test-utils/schema'
2
+
3
+ // ── Statistical aggregation for runs_per_side ─────────────────────────────
4
+ // All pure functions. Input: N verdicts from N runs. Output: aggregated stats.
5
+
6
/** Aggregate of one criterion across the runs of a single side. */
export interface CriterionStats {
  /** Criterion name as reported in the verdicts. */
  name: string
  /** Average value; for pass/fail criteria this is the pass rate (0–1). */
  mean: number
  /** Spread of the values around `mean`. */
  variance: number
  /** Lower bound reported for the values (pass/fail criteria use a 0–1 scale). */
  min: number
  /** Upper bound reported for the values (pass/fail criteria use a 0–1 scale). */
  max: number
  /** Number of runs that reported this criterion. */
  count: number
}
14
+
15
/** Summary statistics for all runs of one arena side. */
export interface SideStats {
  /** Name of the side these statistics belong to. */
  sideName: string
  /** Number of verdicts (runs) that were aggregated. */
  runs: number
  /** Share of runs whose verdict was PASS (PASS / total). */
  passRate: number
  /** Share of runs whose verdict was FAIL. */
  failRate: number
  /** Share of runs whose verdict was ERROR. */
  errorRate: number
  /** Mean of the reported confidences; null if no verdict had a confidence. */
  meanConfidence: number | null
  /** Variance of the reported confidences; null when it was not computed. */
  confidenceVariance: number | null
  /** Per-criterion aggregates derived from each verdict's `criteria` list. */
  criteria: CriterionStats[]
  /** Mean and variance of the numeric scores, keyed by criterion name. */
  scoreByCriterion: Record<string, { mean: number; variance: number }>
}
26
+
27
+ // ── Helpers ────────────────────────────────────────────────────────────────
28
+
29
/** Arithmetic mean of `values`; an empty list yields 0 instead of NaN. */
function mean(values: number[]): number {
  const count = values.length
  const total = values.reduce((sum, value) => sum + value, 0)
  return count === 0 ? 0 : total / count
}
33
+
34
/**
 * Sample variance of `values` (divides by n - 1).
 *
 * @param values - observations; fewer than two yield 0.
 * @param m - precomputed mean of `values`; computed here when omitted.
 */
function variance(values: number[], m?: number): number {
  const count = values.length
  if (count < 2) return 0

  // With at least two values the mean is simply sum / count.
  const center = m ?? values.reduce((a, b) => a + b, 0) / count
  const squaredDeviations = values.reduce((sum, v) => sum + (v - center) ** 2, 0)
  return squaredDeviations / (count - 1)
}
39
+
40
+ // ── Aggregator ────────────────────────────────────────────────────────────
41
+
42
/**
 * Aggregate the verdicts of all runs of one side into summary statistics.
 *
 * - Rates are counts of PASS / FAIL / ERROR verdicts divided by the number of
 *   runs, or 0 when there are no runs.
 * - Confidence is averaged over the verdicts that report one; its variance is
 *   only computed when at least two verdicts do.
 * - `criteria` summarizes the pass/fail entries of `verdict.criteria`: `mean`
 *   is the pass rate, `variance` is p·(1 − p), and `min`/`max` reflect the
 *   observed outcomes (1 = passed, 0 = failed).
 * - `scoreByCriterion` summarizes the numeric entries of `verdict.scores`.
 *
 * @param sideName - name copied into the result.
 * @param verdicts - one verdict per run; may be empty.
 */
export function aggregateSideStats(sideName: string, verdicts: JudgeVerdict[]): SideStats {
  const runs = verdicts.length
  const rateOf = (label: string): number =>
    runs > 0 ? verdicts.filter(v => v.verdict === label).length / runs : 0

  // Confidence: only verdicts that actually carry one take part.
  const confidences = verdicts.map(v => v.confidence).filter((c): c is number => c != null)
  const meanConfidence = confidences.length > 0 ? mean(confidences) : null
  const confidenceVariance =
    meanConfidence !== null && confidences.length > 1 ? variance(confidences, meanConfidence) : null

  // Per-criterion pass/fail outcomes from verdict.criteria, in first-seen order.
  const outcomesByCriterion = new Map<string, boolean[]>()
  for (const v of verdicts) {
    for (const c of v.criteria ?? []) {
      const outcomes = outcomesByCriterion.get(c.name)
      if (outcomes) outcomes.push(c.passed)
      else outcomesByCriterion.set(c.name, [c.passed])
    }
  }

  const criteria: CriterionStats[] = []
  for (const [name, outcomes] of outcomesByCriterion) {
    const passed = outcomes.filter(Boolean).length
    const passRate = passed / outcomes.length
    criteria.push({
      name,
      mean: passRate, // for criteria, "mean" = pass rate across runs
      variance: passRate * (1 - passRate), // Bernoulli variance
      // Observed extremes of the 0/1 outcome rather than fixed scale bounds:
      // min is 1 only when every run passed, max is 0 only when none did.
      min: passed === outcomes.length ? 1 : 0,
      max: passed > 0 ? 1 : 0,
      count: outcomes.length,
    })
  }

  // Per-criterion numeric scores from verdict.scores.
  const scoresByCriterion = new Map<string, number[]>()
  for (const v of verdicts) {
    if (!v.scores) continue
    for (const [criterion, score] of Object.entries(v.scores)) {
      const scores = scoresByCriterion.get(criterion)
      if (scores) scores.push(score)
      else scoresByCriterion.set(criterion, [score])
    }
  }

  const scoreByCriterion: Record<string, { mean: number; variance: number }> = {}
  for (const [criterion, scores] of scoresByCriterion) {
    const m = mean(scores)
    // variance() already yields 0 for fewer than two scores.
    scoreByCriterion[criterion] = { mean: m, variance: variance(scores, m) }
  }

  return {
    sideName,
    runs,
    passRate: rateOf('PASS'),
    failRate: rateOf('FAIL'),
    errorRate: rateOf('ERROR'),
    meanConfidence,
    confidenceVariance,
    criteria,
    scoreByCriterion,
  }
}
107
+
108
/**
 * Compute `SideStats` for every entry of a sideName → verdicts map.
 *
 * @returns one entry per map key, in the map's iteration (insertion) order.
 */
export function aggregateAllStats(
  verdictsBySide: Map<string, JudgeVerdict[]>
): SideStats[] {
  return Array.from(verdictsBySide, ([sideName, verdicts]) =>
    aggregateSideStats(sideName, verdicts)
  )
}