@lythos/skill-arena 0.9.2 → 0.9.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -15
- package/package.json +1 -1
- package/src/arena-toml.test.ts +191 -0
- package/src/arena-toml.ts +172 -0
- package/src/cli.ts +58 -5
- package/src/player.test.ts +95 -0
- package/src/player.ts +71 -0
- package/src/runner.ts +171 -93
- package/src/stats.test.ts +111 -0
- package/src/stats.ts +117 -0
package/README.md
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# @lythos/skill-arena
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
 
|
|
4
|
+
|
|
5
|
+
> Controlled-variable benchmark for AI agent skills. Compare skills, decks, or configurations on the same task — single-skill A/B or full-deck Pareto frontier analysis. Now with declarative `arena.toml` (k8s-manifest style) and deterministic Pareto frontier.
|
|
4
6
|
|
|
5
7
|
## Why
|
|
6
8
|
|
|
@@ -40,25 +42,36 @@ bunx @lythos/skill-arena viz tmp/arena-<id>/
|
|
|
40
42
|
|
|
41
43
|
## Commands
|
|
42
44
|
|
|
45
|
+
### Declarative mode (k8s-style, recommended)
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# Print execution plan without running
|
|
49
|
+
bunx @lythos/skill-arena run --config arena.toml --dry-run
|
|
50
|
+
|
|
51
|
+
# Execute with per-side runs_per_side and statistical aggregation
|
|
52
|
+
bunx @lythos/skill-arena run --config arena.toml
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### CLI-flag mode (backward compat)
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
bunx @lythos/skill-arena run \
|
|
59
|
+
--task ./TASK-arena.md \
|
|
60
|
+
--players ./players/claude.toml \
|
|
61
|
+
--decks ./decks/run-01.toml,./decks/run-02.toml \
|
|
62
|
+
--criteria coverage,relevance,actionability,depth
|
|
43
63
|
```
|
|
44
|
-
Usage: bunx @lythos/skill-arena <options> | bunx @lythos/skill-arena viz <dir>
|
|
45
64
|
|
|
46
|
-
|
|
47
|
-
--task, -t <desc> Task description (required)
|
|
48
|
-
--skills, -s <list> Comma-separated skills, 2–5 (Mode 1)
|
|
49
|
-
--criteria, -c <list> Evaluation dimensions (default: syntax,context,logic,token)
|
|
50
|
-
--control <skill> Control skill (default: lythoskill-project-scribe)
|
|
65
|
+
### Scaffold mode (legacy, manual execution)
|
|
51
66
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
67
|
+
```
|
|
68
|
+
bunx @lythos/skill-arena scaffold --task "..." --skills a,b
|
|
69
|
+
```
|
|
55
70
|
|
|
56
|
-
|
|
57
|
-
--dir, -d <path> Arena parent directory (default: tmp)
|
|
58
|
-
--project, -p <path> Project root (default: .)
|
|
71
|
+
### Viz
|
|
59
72
|
|
|
60
|
-
|
|
61
|
-
|
|
73
|
+
```bash
|
|
74
|
+
bunx @lythos/skill-arena viz runs/arena-<id>/
|
|
62
75
|
```
|
|
63
76
|
|
|
64
77
|
## Skill Documentation
|
|
@@ -77,6 +90,31 @@ Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
|
77
90
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
78
91
|
```
|
|
79
92
|
|
|
93
|
+
### Runtime architecture (intent/plan/execute)
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
arena.toml → ArenaToml (Zod) → ExecutionPlan (pure) → per-cell agent spawn (IO)
|
|
97
|
+
↓
|
|
98
|
+
aggregateAllStats (pure) ← verdicts[]
|
|
99
|
+
↓
|
|
100
|
+
runComparativeJudge (IO) → report.md + Pareto frontier
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
- **Intent**: `arena.toml` declarative config (k8s-manifest style)
|
|
104
|
+
- **Plan**: `buildExecutionPlan()`, `aggregateSideStats()`, `computePareto()` — pure functions
|
|
105
|
+
- **Execute**: `runAgentScenario` per cell, `runComparativeJudge` — IO via `AgentAdapter`
|
|
106
|
+
|
|
107
|
+
Built on `@lythos/test-utils` shared infrastructure.
|
|
108
|
+
|
|
109
|
+
## Test Coverage
|
|
110
|
+
|
|
111
|
+
| Layer | Count | CI | Notes |
|
|
112
|
+
|-------|-------|----|-------|
|
|
113
|
+
| Unit tests | 41 | ✅ | TOML parser, player resolution, Pareto, stats |
|
|
114
|
+
| Agent BDD | — | ❌ | Requires `claude` CLI; run locally |
|
|
115
|
+
|
|
116
|
+
The Pareto frontier is computed by a **deterministic algorithm** — it is never delegated to an LLM. Eight unit tests cover dominance, cross-dominance, transitive chains, partial criteria, and empty scores.
|
|
117
|
+
|
|
80
118
|
## License
|
|
81
119
|
|
|
82
120
|
MIT
|
package/package.json
CHANGED
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
|
|
3
|
+
import { formatPlanOutput } from './runner'
|
|
4
|
+
|
|
5
|
+
/**
 * Smallest accepted arena: two sides and nothing optional, so `runs_per_side`
 * and `control` come from the schema defaults.
 */
const minimalToml = `
[arena]
task = "Test task"
criteria = ["a", "b"]

[[side]]
name = "runner-a"
player = "claude-code"
deck = "./decks/a.toml"

[[side]]
name = "runner-b"
player = "claude-code"
deck = "./decks/b.toml"
`

/**
 * Three sides with an explicit `runs_per_side`; the last side ("baseline") is
 * flagged as control and carries a nested `[side.env]` table.
 */
const fullToml = `
[arena]
task = "Generate auth flow diagram"
criteria = ["syntax", "context", "logic", "token"]
runs_per_side = 3

[[side]]
name = "minimal"
player = "standard-coder"
deck = "./decks/minimal.toml"

[[side]]
name = "rich"
player = "expert-architect"
deck = "./decks/rich.toml"

[[side]]
name = "baseline"
player = "standard-coder"
deck = "./decks/baseline.toml"
control = true

[side.env]
container = "node:20-alpine"
pre_run = ["npm ci", "npm run build"]
working_dir = "/workspace"
`
|
|
48
|
+
|
|
49
|
+
// ── Schema + Parser ────────────────────────────────────────────────────────
|
|
50
|
+
|
|
51
|
+
// Covers TOML text → validated ArenaToml: defaults, nested env table, and schema rejections.
describe('parseArenaToml', () => {
  test('parses minimal two-side arena', () => {
    const { arena, side: sides } = parseArenaToml(minimalToml)

    expect(arena.task).toBe('Test task')
    expect(arena.criteria).toEqual(['a', 'b'])
    expect(arena.runs_per_side).toBe(1) // schema default
    expect(sides).toHaveLength(2)

    const firstSide = sides[0]
    expect(firstSide.name).toBe('runner-a')
    expect(firstSide.player).toBe('claude-code')
    expect(firstSide.deck).toBe('./decks/a.toml')
    expect(firstSide.control).toBe(false) // schema default
  })

  test('parses full arena with runs_per_side and control', () => {
    const { arena, side: sides } = parseArenaToml(fullToml)

    expect(arena.runs_per_side).toBe(3)
    expect(sides).toHaveLength(3)

    const lastSide = sides[2]
    expect(lastSide.name).toBe('baseline')
    expect(lastSide.control).toBe(true)
  })

  test('parses side env block', () => {
    // [side.env] attaches to the most recently declared [[side]] — the third one here.
    const { container, pre_run, working_dir, env_vars } = parseArenaToml(fullToml).side[2].env

    expect(container).toBe('node:20-alpine')
    expect(pre_run).toEqual(['npm ci', 'npm run build'])
    expect(working_dir).toBe('/workspace')
    expect(env_vars).toEqual({})
  })

  test('rejects fewer than 2 sides', () => {
    const singleSide = `[arena]\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "only"\nplayer = "c"\ndeck = "x.toml"`
    const attempt = () => parseArenaToml(singleSide)
    expect(attempt).toThrow()
  })

  test('rejects empty criteria', () => {
    const noCriteria = `[arena]\ntask = "x"\ncriteria = []\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
    const attempt = () => parseArenaToml(noCriteria)
    expect(attempt).toThrow()
  })

  test('rejects non-object input', () => {
    // Goes straight to the schema: a bare string is not an arena document.
    const attempt = () => ArenaToml.parse('not valid')
    expect(attempt).toThrow()
  })

  test('rejects missing arena section', () => {
    const attempt = () => parseArenaToml('[[side]]\nname = "a"')
    expect(attempt).toThrow()
  })

  test('rejects runs_per_side = 0', () => {
    const zeroRuns = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 0\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
    const attempt = () => parseArenaToml(zeroRuns)
    expect(attempt).toThrow()
  })

  test('parses integer and boolean values correctly', () => {
    const source = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 2\nmax_participants = 5\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
    const { arena } = parseArenaToml(source)
    expect(arena.runs_per_side).toBe(2)
    expect(arena.max_participants).toBe(5)
  })

  test('comments are stripped', () => {
    const source = `[arena]\n# this is a comment\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
    const { arena } = parseArenaToml(source)
    expect(arena.task).toBe('x')
  })
})
|
|
117
|
+
|
|
118
|
+
// ── Execution Plan ─────────────────────────────────────────────────────────
|
|
119
|
+
|
|
120
|
+
// Covers plan expansion (sides × runs), cell ordering, control propagation and dry-run formatting.
describe('buildExecutionPlan', () => {
  test('generates plan: 2 sides × 1 run = 2 cells', () => {
    const { task, criteria, cells, total_runs } = buildExecutionPlan(parseArenaToml(minimalToml))

    expect(task).toBe('Test task')
    expect(criteria).toEqual(['a', 'b'])
    expect(cells).toHaveLength(2)
    expect(total_runs).toBe(2)
    expect(cells[0]).toEqual({ side: 'runner-a', player: 'claude-code', deck: './decks/a.toml', run: 1, control: false })
    expect(cells[1]).toEqual({ side: 'runner-b', player: 'claude-code', deck: './decks/b.toml', run: 1, control: false })
  })

  test('generates plan: 3 sides × 3 runs = 9 cells', () => {
    const { cells, total_runs } = buildExecutionPlan(parseArenaToml(fullToml))

    expect(cells).toHaveLength(9)
    expect(total_runs).toBe(9)

    // Side-major order: all runs of side 0, then all runs of side 1, and so on.
    expect(cells[0]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 1, control: false })
    expect(cells[1]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 2, control: false })
    expect(cells[2]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 3, control: false })
    expect(cells[3]).toEqual({ side: 'rich', player: 'expert-architect', deck: './decks/rich.toml', run: 1, control: false })
    expect(cells[8]).toEqual({ side: 'baseline', player: 'standard-coder', deck: './decks/baseline.toml', run: 3, control: true })
  })

  test('control flag preserved in plan cells', () => {
    const { cells } = buildExecutionPlan(parseArenaToml(fullToml))
    const controlCells = cells.filter(cell => cell.side === 'baseline')

    expect(controlCells).toHaveLength(3)
    expect(controlCells.every(cell => cell.control)).toBe(true)
  })

  test('dry-run output format matches expected log', () => {
    const plan = buildExecutionPlan(parseArenaToml(minimalToml))

    // Collect exactly what --dry-run would print, one entry per output line.
    const printed: string[] = [...formatPlanOutput(plan)]
    const mentions = (needle: string) => printed.some(entry => entry.includes(needle))

    expect(mentions('2 cells')).toBe(true)
    expect(mentions('runner-a')).toBe(true)
    expect(mentions('runner-b')).toBe(true)
    expect(mentions('claude-code')).toBe(true)
    expect(!mentions('control')).toBe(true) // the minimal fixture declares no control side
  })

  test('dry-run output shows control flag for control sides', () => {
    const plan = buildExecutionPlan(parseArenaToml(fullToml))
    const controlLines = formatPlanOutput(plan).filter(entry => entry.includes('baseline'))

    // Every line that mentions the control side must carry the [control] marker.
    expect(controlLines.every(entry => entry.includes('[control]'))).toBe(true)
  })

  test('dry-run: plan is pure data, no side effects', () => {
    // Dry-run only prints the plan, so the plan has to be self-describing plain data.
    const { cells, total_runs } = buildExecutionPlan(parseArenaToml(fullToml))

    expect(total_runs).toBeGreaterThan(0)
    expect(cells.every(cell => typeof cell.side === 'string')).toBe(true)
    expect(cells.every(cell => typeof cell.player === 'string')).toBe(true)
    expect(cells.every(cell => typeof cell.deck === 'string')).toBe(true)
    expect(cells.every(cell => typeof cell.run === 'number')).toBe(true)
  })
})
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import { z } from 'zod'
|
|
2
|
+
import type { ArenaManifest } from '@lythos/test-utils/schema'
|
|
3
|
+
|
|
4
|
+
// ── arena.toml Zod schema (declarative input, k8s-manifest style) ──────────
|
|
5
|
+
// Anchored on: ADR-20260502110308316
|
|
6
|
+
|
|
7
|
+
// Shared building block: a list of strings. Each use below derives its own schema from it.
const stringList = z.array(z.string())

/**
 * Optional `[side.env]` table attached to a side.
 * `container` and `working_dir` may be omitted; `pre_run` defaults to an empty
 * list and `env_vars` to an empty string→string map.
 */
export const SideEnv = z.object({
  container: z.string().optional(),
  pre_run: stringList.default([]),
  working_dir: z.string().optional(),
  env_vars: z.record(z.string()).default({}),
})
export type SideEnv = z.infer<typeof SideEnv>

/**
 * One `[[side]]` entry: a named participant pairing a player reference with a deck path.
 * `control` defaults to false and `env` to an empty (fully defaulted) SideEnv.
 */
export const Side = z.object({
  name: z.string(),
  // Reference to a player config; resolution happens elsewhere (useAgent).
  player: z.string(),
  // Path to a deck.toml file.
  deck: z.string(),
  control: z.boolean().default(false),
  env: SideEnv.default({}),
})
export type Side = z.infer<typeof Side>

// The `[arena]` table: what is being run and how often.
const arenaSettings = z.object({
  // Task description, or a path to a TASK-arena.md file.
  task: z.string(),
  // At least one evaluation criterion is required.
  criteria: stringList.min(1),
  // Positive integer; one run per side unless stated otherwise.
  runs_per_side: z.number().int().positive().default(1),
  // Integer in the range 2–5; defaults to 5.
  max_participants: z.number().int().min(2).max(5).default(5),
})

/** Whole `arena.toml` document: one `[arena]` table plus between 2 and 5 `[[side]]` entries. */
export const ArenaToml = z.object({
  arena: arenaSettings,
  side: z.array(Side).min(2).max(5),
})
export type ArenaToml = z.infer<typeof ArenaToml>
|
|
34
|
+
|
|
35
|
+
// ── Parser ─────────────────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
/**
 * Turn the text of an `arena.toml` file into a validated {@link ArenaToml}.
 *
 * The text is read by the in-file minimal TOML reader (no external TOML
 * dependency) and the resulting object is then checked against the Zod schema,
 * which also fills in defaults.
 *
 * @param content Raw file contents.
 * @returns The validated document.
 * @throws When the parsed data does not satisfy the `ArenaToml` schema.
 */
export function parseArenaToml(content: string): ArenaToml {
  return ArenaToml.parse(parseToml(content))
}
|
|
42
|
+
|
|
43
|
+
// ── Plan generation (pure function, dry-run visible) ───────────────────────
|
|
44
|
+
|
|
45
|
+
/** A single scheduled run: one side executed once. */
export interface ExecutionCell {
  /** Name of the side this run belongs to. */
  side: string
  /** Player reference copied from the side. */
  player: string
  /** Deck path copied from the side. */
  deck: string
  /** Run number within the side, starting at 1. */
  run: number
  /** Whether the side is flagged as control. */
  control: boolean
}

/** The complete, ordered list of runs derived from an arena.toml document. */
export interface ExecutionPlan {
  task: string
  criteria: string[]
  /** Cells in side-major order (all runs of the first side, then the next side, ...). */
  cells: ExecutionCell[]
  /** Always equal to `cells.length`. */
  total_runs: number
}

/**
 * Expand an arena document into its execution plan.
 *
 * Each side contributes `runs_per_side` cells, numbered from 1. The function
 * does no IO, so the same plan can be printed for a dry run or executed.
 *
 * @param toml Validated arena document.
 * @returns Task, criteria and the expanded cell list with its length.
 */
export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
  const { task, criteria, runs_per_side: runsPerSide } = toml.arena

  const cells = toml.side.flatMap(({ name, player, deck, control }): ExecutionCell[] =>
    Array.from({ length: runsPerSide }, (_unused, offset) => ({
      side: name,
      player,
      deck,
      run: offset + 1,
      control,
    })),
  )

  return { task, criteria, cells, total_runs: cells.length }
}
|
|
80
|
+
|
|
81
|
+
// ── Minimal TOML parser (handles the arena.toml subset without external dep) ──
|
|
82
|
+
|
|
83
|
+
/** Type guard: true for non-null, non-array objects, i.e. values that can act as a TOML table. */
function isTomlTable(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/**
 * Find the first occurrence of the single character `target` that lies outside
 * a quoted string.
 *
 * Both `"…"` and `'…'` strings are recognised; inside a double-quoted string a
 * backslash escapes the following character, so `\"` does not end the string.
 *
 * @param text   Text to scan.
 * @param target Single character to look for.
 * @param from   Start index; must itself be outside any quoted string.
 * @returns The index of the match, or -1 when there is none.
 */
function indexOfUnquoted(text: string, target: string, from = 0): number {
  let quote = ''
  for (let i = from; i < text.length; i++) {
    const ch = text.charAt(i)
    if (quote !== '') {
      if (ch === '\\' && quote === '"') {
        i++ // skip the escaped character
      } else if (ch === quote) {
        quote = ''
      }
    } else if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === target) {
      return i
    }
  }
  return -1
}

/** Split the inside of `[ … ]` on commas that are not inside quotes; blank items (e.g. after a trailing comma) are dropped. */
function splitTomlArrayItems(inner: string): string[] {
  const items: string[] = []
  let start = 0
  while (true) {
    const comma = indexOfUnquoted(inner, ',', start)
    const item = (comma === -1 ? inner.slice(start) : inner.slice(start, comma)).trim()
    if (item !== '') items.push(item)
    if (comma === -1) return items
    start = comma + 1
  }
}

/**
 * Convert one scalar token to a JS value.
 * A quoted token is ALWAYS a string (so `"true"` and `"42"` stay strings);
 * bare `true`/`false` become booleans, bare decimal numbers become numbers,
 * and any other bare token is kept as a string.
 */
function parseTomlScalar(raw: string): unknown {
  const first = raw.charAt(0)
  if (raw.length >= 2 && (first === '"' || first === "'") && raw.endsWith(first)) {
    return raw.slice(1, -1)
  }
  if (raw === 'true') return true
  if (raw === 'false') return false
  if (/^-?\d+(\.\d+)?$/.test(raw)) return Number(raw)
  return raw
}

/** Convert the right-hand side of `key = value`: a flat `[a, b, …]` array or a single scalar. */
function parseTomlValue(raw: string): unknown {
  if (raw.startsWith('[') && raw.endsWith(']')) {
    return splitTomlArrayItems(raw.slice(1, -1)).map(parseTomlScalar)
  }
  return parseTomlScalar(raw)
}

/**
 * Resolve a (possibly dotted) `[table]` header to the table it denotes,
 * creating missing tables on the way. When a path segment names an array of
 * tables, the walk continues in that array's most recent element, which is
 * what makes `[side.env]` attach to the latest `[[side]]`.
 */
function openTomlTable(root: Record<string, unknown>, path: string): Record<string, unknown> {
  let table = root
  for (const segment of path.split('.').map(part => part.trim())) {
    // Own properties only: never walk into inherited members such as Object.prototype.
    const existing: unknown = Object.prototype.hasOwnProperty.call(table, segment) ? table[segment] : undefined
    const target: unknown = Array.isArray(existing) ? existing[existing.length - 1] : existing
    if (isTomlTable(target)) {
      table = target
    } else {
      const created: Record<string, unknown> = {}
      table[segment] = created
      table = created
    }
  }
  return table
}

/**
 * Minimal TOML reader for the subset used by arena.toml (no external dependency).
 *
 * Supported, one construct per line:
 * - `# comments`, full-line or trailing (a `#` inside a quoted string is kept);
 * - `[table]` and dotted `[parent.child]` headers;
 * - top-level `[[array-of-tables]]` headers;
 * - `key = value` where value is a quoted string, boolean, decimal number,
 *   or a flat single-line array of those.
 *
 * Not supported: escape-sequence decoding, multi-line strings/arrays, inline
 * tables, nested arrays, dotted `[[a.b]]` headers. Lines that match none of
 * the forms above are ignored.
 *
 * @param text TOML source.
 * @returns Plain nested objects/arrays mirroring the document (unvalidated).
 */
function parseToml(text: string): Record<string, unknown> {
  const result: Record<string, unknown> = {}
  let currentTable: Record<string, unknown> = result

  for (const rawLine of text.split('\n')) {
    const hashIdx = indexOfUnquoted(rawLine, '#')
    const line = (hashIdx === -1 ? rawLine : rawLine.slice(0, hashIdx)).trim()
    if (!line) continue

    // [[array]] — append a fresh table and make it current.
    const arrayMatch = line.match(/^\[\[(.+?)\]\]$/)
    if (arrayMatch) {
      const key = arrayMatch[1].trim() // e.g. "side"
      const existing: unknown = Object.prototype.hasOwnProperty.call(result, key) ? result[key] : undefined
      const tables: Record<string, unknown>[] = Array.isArray(existing) ? existing : []
      result[key] = tables
      currentTable = {}
      tables.push(currentTable)
      continue
    }

    // [section] or [parent.child]
    const sectionMatch = line.match(/^\[(.+?)\]$/)
    if (sectionMatch) {
      currentTable = openTomlTable(result, sectionMatch[1])
      continue
    }

    // key = value (split on the first '=' so values may contain '=')
    const eqIdx = line.indexOf('=')
    if (eqIdx !== -1) {
      const key = line.slice(0, eqIdx).trim()
      currentTable[key] = parseTomlValue(line.slice(eqIdx + 1).trim())
    }
  }

  return result
}
|
package/src/cli.ts
CHANGED
|
@@ -35,7 +35,7 @@ Usage:
|
|
|
35
35
|
lythoskill-arena viz <arena-dir>
|
|
36
36
|
|
|
37
37
|
Commands:
|
|
38
|
-
run Run arena programmatically (
|
|
38
|
+
run Run arena programmatically (declarative arena.toml or CLI flags)
|
|
39
39
|
scaffold Create arena directory structure (legacy, manual subagent execution)
|
|
40
40
|
viz Visualize arena report (ASCII charts)
|
|
41
41
|
|
|
@@ -44,14 +44,23 @@ Options:
|
|
|
44
44
|
-s, --skills <list> Comma-separated skill names (scaffold only)
|
|
45
45
|
--decks <list> Comma-separated deck paths
|
|
46
46
|
-c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
|
|
47
|
-
--players <list> Comma-separated player.toml paths (run only)
|
|
47
|
+
--players <list> Comma-separated player.toml paths (CLI run only)
|
|
48
|
+
--config <path> Path to arena.toml (declarative mode, k8s-style)
|
|
49
|
+
--dry-run Print execution plan without running (with --config)
|
|
48
50
|
--control <skill> Control skill for comparison (scaffold only)
|
|
49
51
|
--out <dir> Output directory (run: defaults to runs/arena-<id>)
|
|
50
52
|
-d, --dir <dir> Output directory (scaffold: defaults to tmp)
|
|
51
53
|
-p, --project <dir> Project directory (default: .)
|
|
52
54
|
|
|
53
55
|
Examples:
|
|
54
|
-
|
|
56
|
+
# Declarative mode (k8s-style)
|
|
57
|
+
lythoskill-arena run --config ./arena.toml
|
|
58
|
+
lythoskill-arena run --config ./arena.toml --dry-run
|
|
59
|
+
|
|
60
|
+
# CLI-flag mode (backward compat)
|
|
61
|
+
lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
|
|
62
|
+
|
|
63
|
+
# Legacy scaffolding
|
|
55
64
|
lythoskill-arena scaffold --task "Refactor auth module" --skills skill-a,skill-b
|
|
56
65
|
lythoskill-arena viz runs/arena-20260504
|
|
57
66
|
`)
|
|
@@ -71,6 +80,9 @@ function parseArgs(argv: string[]) {
|
|
|
71
80
|
control: 'lythoskill-project-scribe',
|
|
72
81
|
dir: 'tmp',
|
|
73
82
|
project: '.',
|
|
83
|
+
config: undefined,
|
|
84
|
+
out: undefined,
|
|
85
|
+
players: undefined,
|
|
74
86
|
}
|
|
75
87
|
const positionals: string[] = []
|
|
76
88
|
|
|
@@ -90,6 +102,12 @@ function parseArgs(argv: string[]) {
|
|
|
90
102
|
options.dir = argv[++i]
|
|
91
103
|
} else if (arg === '--project' || arg === '-p') {
|
|
92
104
|
options.project = argv[++i]
|
|
105
|
+
} else if (arg === '--config') {
|
|
106
|
+
options.config = argv[++i]
|
|
107
|
+
} else if (arg === '--out') {
|
|
108
|
+
options.out = argv[++i]
|
|
109
|
+
} else if (arg === '--players') {
|
|
110
|
+
options.players = argv[++i]
|
|
93
111
|
} else if (!arg.startsWith('-')) {
|
|
94
112
|
positionals.push(arg)
|
|
95
113
|
}
|
|
@@ -563,9 +581,45 @@ function runViz(argv: string[]) {
|
|
|
563
581
|
|
|
564
582
|
async function runProgrammaticArena(argv: string[]) {
|
|
565
583
|
const { options } = parseArgs(argv)
|
|
584
|
+
const { readFileSync } = await import('node:fs')
|
|
585
|
+
|
|
586
|
+
const hasConfig = !!(options as Record<string, string | undefined>).config
|
|
587
|
+
const dryRun = argv.includes('--dry-run')
|
|
588
|
+
|
|
589
|
+
if (hasConfig) {
|
|
590
|
+
// arena.toml declarative mode
|
|
591
|
+
const { parseArenaToml } = await import('./arena-toml')
|
|
592
|
+
const { runArenaFromToml } = await import('./runner')
|
|
593
|
+
const configPath = (options as Record<string, string | undefined>).config!
|
|
594
|
+
|
|
595
|
+
const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
|
|
596
|
+
const result = await runArenaFromToml({
|
|
597
|
+
toml,
|
|
598
|
+
taskPath: toml.arena.task.startsWith('/') || toml.arena.task.startsWith('./')
|
|
599
|
+
? toml.arena.task
|
|
600
|
+
: (options as Record<string, string | undefined>).task ?? toml.arena.task,
|
|
601
|
+
outDir: (options as Record<string, string | undefined>).out,
|
|
602
|
+
dryRun,
|
|
603
|
+
})
|
|
604
|
+
|
|
605
|
+
if ('plan' in result) {
|
|
606
|
+
// dry-run
|
|
607
|
+
console.log(`\n📋 Dry-run: ${result.plan.total_runs} cells across ${result.plan.cells.length / Math.max(1, toml.arena.runs_per_side)} sides × ${toml.arena.runs_per_side} runs`)
|
|
608
|
+
for (const cell of result.plan.cells) {
|
|
609
|
+
console.log(` ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
|
|
610
|
+
}
|
|
611
|
+
return
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
|
|
615
|
+
console.log(`📁 Artifacts: ${result.artifactsDir}`)
|
|
616
|
+
console.log(`📊 Report: ${result.artifactsDir}/report.md`)
|
|
617
|
+
return
|
|
618
|
+
}
|
|
566
619
|
|
|
620
|
+
// CLI-flag mode (backward compat)
|
|
567
621
|
if (!options.task || !options.decks) {
|
|
568
|
-
console.error('❌ --task <path> and --decks <list> are required for "run"')
|
|
622
|
+
console.error('❌ --task <path> and --decks <list> are required for "run" (or use --config <arena.toml>)')
|
|
569
623
|
process.exit(1)
|
|
570
624
|
}
|
|
571
625
|
|
|
@@ -577,7 +631,6 @@ async function runProgrammaticArena(argv: string[]) {
|
|
|
577
631
|
deckPaths: options.decks.split(',').map(s => s.trim()).filter(Boolean),
|
|
578
632
|
criteria: (options.criteria ?? 'syntax,context,logic,token').split(',').map(s => s.trim()).filter(Boolean),
|
|
579
633
|
outDir: options.out ?? `runs/arena-${timestamp()}`,
|
|
580
|
-
projectDir: options.project,
|
|
581
634
|
})
|
|
582
635
|
|
|
583
636
|
console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { resolvePlayer, resolveSides, groupBySide, totalRuns } from './player'
|
|
3
|
+
import { parseArenaToml } from './arena-toml'
|
|
4
|
+
|
|
5
|
+
// Shared fixture: two sides, three runs each. The first side uses a built-in
// player name, the second a name with no built-in mapping.
const arenaSource = `
[arena]
task = "Test task"
criteria = ["a", "b"]
runs_per_side = 3

[[side]]
name = "minimal"
player = "claude-code"
deck = "./decks/minimal.toml"

[[side]]
name = "rich"
player = "expert-architect"
deck = "./decks/rich.toml"
`

const toml = parseArenaToml(arenaSource)
|
|
21
|
+
|
|
22
|
+
// Table-driven: each row is [test title, player reference, expected platform].
describe('resolvePlayer', () => {
  const cases: Array<[string, string, string]> = [
    ['maps claude-code → claude', 'claude-code', 'claude'],
    ['maps Claude → claude (case insensitive)', 'Claude', 'claude'],
    ['maps kimi → kimi', 'kimi', 'kimi'],
    ['passes through unknown player names', 'expert-architect', 'expert-architect'],
    ['trims whitespace', ' claude-code ', 'claude'],
  ]

  for (const [title, reference, platform] of cases) {
    test(title, () => {
      expect(resolvePlayer(reference)).toBe(platform)
    })
  }
})
|
|
43
|
+
|
|
44
|
+
// resolveSides keeps the original side object and adds the resolved platform next to it.
describe('resolveSides', () => {
  test('resolves all sides in arena.toml', () => {
    const resolved = resolveSides(toml)
    expect(resolved).toHaveLength(2)

    const [first, second] = resolved
    expect(first.platform).toBe('claude')
    expect(second.platform).toBe('expert-architect')
    expect(first.playerName).toBe('claude-code')
  })

  test('preserves side config', () => {
    const { side } = resolveSides(toml)[0]
    expect(side.name).toBe('minimal')
    expect(side.deck).toBe('./decks/minimal.toml')
  })
})
|
|
59
|
+
|
|
60
|
+
// groupBySide yields one entry per side, carrying the arena-wide run count and the control flag.
describe('groupBySide', () => {
  test('groups by side name with run count', () => {
    const grouped = groupBySide(toml)
    expect(grouped).toHaveLength(2)

    const [first, second] = grouped
    expect(first.runs).toBe(3) // runs_per_side from the fixture
    expect(second.runs).toBe(3)
    expect(first.platform).toBe('claude')
  })

  test('control flag preserved', () => {
    const withControl = parseArenaToml(`
[arena]
task = "x"
criteria = ["a"]

[[side]]
name = "test"
player = "claude-code"
deck = "a.toml"

[[side]]
name = "baseline"
player = "claude-code"
deck = "b.toml"
control = true
`)
    const baseline = groupBySide(withControl)[1]
    expect(baseline.control).toBe(true)
  })
})
|
|
90
|
+
|
|
91
|
+
describe('totalRuns', () => {
  test('calculates sides × runs_per_side', () => {
    // The fixture declares 2 sides with 3 runs each.
    const expected = 2 * 3
    expect(totalRuns(toml)).toBe(expected)
  })
})
|
package/src/player.ts
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import type { Side, ArenaToml } from './arena-toml'
|
|
2
|
+
|
|
3
|
+
// ── Player reference resolution (pure function) ────────────────────────────
|
|
4
|
+
// Maps arena.toml player names → platform identifiers.
|
|
5
|
+
// AgentAdapter creation is the IO layer's job (T4), not ours.
|
|
6
|
+
|
|
7
|
+
/** A side from arena.toml together with the platform its player reference resolves to. */
export interface ResolvedSide {
  /** The side entry exactly as it appears in the parsed document. */
  side: Side
  /** Resolved platform identifier, intended for useAgent(). */
  platform: string
  /** The player reference as originally written on the side. */
  playerName: string
}
|
|
12
|
+
|
|
13
|
+
/**
 * Built-in player registry: lower-case player name → useAgent platform.
 * `claude-code` is an alias for the `claude` platform.
 */
const BUILTIN_PLAYERS: Record<string, string> = {
  'claude': 'claude',
  'claude-code': 'claude',
  'kimi': 'kimi',
  'cursor': 'cursor',
  'gemini': 'gemini',
}

/**
 * Resolve a player reference to its platform identifier.
 * - The name is lower-cased and trimmed before lookup.
 * - Built-in names (claude, claude-code, kimi, cursor, gemini) map to their platform.
 * - Unknown names are passed through in normalized form (assumed to be useAgent-compatible).
 * - Future: custom player.toml files will override built-in mappings.
 *
 * @param name Player reference as written in arena.toml.
 * @returns The platform identifier; always a string.
 */
export function resolvePlayer(name: string): string {
  const normalized = name.toLowerCase().trim()
  // Own-property check: a bare `BUILTIN_PLAYERS[normalized]` would resolve names
  // such as "constructor" or "__proto__" to inherited Object.prototype members
  // and return a non-string value.
  const builtin = Object.prototype.hasOwnProperty.call(BUILTIN_PLAYERS, normalized)
    ? BUILTIN_PLAYERS[normalized]
    : undefined
  return builtin ?? normalized
}
|
|
32
|
+
|
|
33
|
+
/**
 * Pair every side of an arena document with its resolved platform.
 * Output order follows `toml.side`. No IO is performed and no agent is created.
 *
 * @param toml Validated arena document.
 * @returns One ResolvedSide per side entry.
 */
export function resolveSides(toml: ArenaToml): ResolvedSide[] {
  const resolved: ResolvedSide[] = []
  for (const side of toml.side) {
    const playerName = side.player
    resolved.push({ side, platform: resolvePlayer(playerName), playerName })
  }
  return resolved
}
|
|
44
|
+
|
|
45
|
+
// ── Side grouping (for per-side aggregation in T3) ─────────────────────────
|
|
46
|
+
|
|
47
|
+
/** Flat per-side summary used as the unit of per-side aggregation. */
export interface SideGroup {
  sideName: string
  /** Player reference as written in arena.toml. */
  player: string
  deck: string
  control: boolean
  /** Number of runs for this side (the arena-wide `runs_per_side`). */
  runs: number
  /** Platform the player reference resolves to. */
  platform: string
}

/**
 * Build one SideGroup per side entry, in document order.
 * Entries are mapped one-to-one: sides that share a name are not merged.
 *
 * @param toml Validated arena document.
 * @returns A SideGroup for each side, each carrying the arena's `runs_per_side`.
 */
export function groupBySide(toml: ArenaToml): SideGroup[] {
  const runs = toml.arena.runs_per_side
  return resolveSides(toml).map(({ side, platform, playerName }) => {
    const group: SideGroup = {
      sideName: side.name,
      player: playerName,
      deck: side.deck,
      control: side.control,
      runs,
      platform,
    }
    return group
  })
}
|
|
67
|
+
|
|
68
|
+
/**
 * Total number of runs an arena document implies: number of sides multiplied
 * by `runs_per_side`.
 */
export function totalRuns(toml: ArenaToml): number {
  const { side, arena } = toml
  return arena.runs_per_side * side.length
}
|
package/src/runner.ts
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
|
-
import { mkdirSync, writeFileSync,
|
|
1
|
+
import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
|
|
2
2
|
import { join, resolve } from 'node:path'
|
|
3
3
|
import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
|
|
4
4
|
import { useAgent } from '@lythos/test-utils/agents'
|
|
5
|
-
import { ArenaManifest, Player
|
|
5
|
+
import { ArenaManifest, Player } from '@lythos/test-utils/schema'
|
|
6
|
+
import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
|
|
6
7
|
import { runComparativeJudge } from './comparative-judge'
|
|
8
|
+
import { parseArenaToml, buildExecutionPlan, type ArenaToml, type ExecutionPlan } from './arena-toml'
|
|
9
|
+
import { resolvePlayer, resolveSides } from './player'
|
|
10
|
+
import { aggregateAllStats } from './stats'
|
|
11
|
+
import type { SideStats } from './stats'
|
|
7
12
|
|
|
8
13
|
// ── Helpers ───────────────────────────────────────────────────────────────
|
|
9
14
|
|
|
@@ -12,150 +17,226 @@ function stamp(): string {
|
|
|
12
17
|
return `${d.getFullYear()}${String(d.getMonth() + 1).padStart(2, '0')}${String(d.getDate()).padStart(2, '0')}-${String(d.getHours()).padStart(2, '0')}${String(d.getMinutes()).padStart(2, '0')}${String(d.getSeconds()).padStart(2, '0')}`
|
|
13
18
|
}
|
|
14
19
|
|
|
15
|
-
|
|
16
|
-
if (arrays.length === 0) return [[]]
|
|
17
|
-
const [first, ...rest] = arrays
|
|
18
|
-
const restProd = cartesian(rest)
|
|
19
|
-
return first.flatMap(a => restProd.map(r => [a, ...r]))
|
|
20
|
-
}
|
|
20
|
+
// ── Declarative runner (arena.toml → execute) ─────────────────────────────
|
|
21
21
|
|
|
22
|
-
|
|
23
|
-
|
|
22
|
+
/** Outcome of a completed (non dry-run) `runArenaFromToml` execution. */
export interface ArenaResult {
  /** Arena manifest re-parsed with `status: 'completed'`. */
  manifest: ArenaManifestType
  /** Value returned by `runComparativeJudge`; its shape is not narrowed here. */
  report: unknown
  /** Per-side aggregated statistics from `aggregateAllStats`. */
  stats: SideStats[]
  /** Directory that received `arena.json`, the run folders and the report. */
  artifactsDir: string
}
|
|
25
28
|
|
|
26
|
-
|
|
29
|
+
/**
 * Render an execution plan as human-readable CLI lines (pure, no IO).
 *
 * The first line is a summary (total runs, number of distinct sides and the
 * cells-per-side ratio); one line per plan cell follows, with control cells
 * tagged `[control]`.
 */
export function formatPlanOutput(plan: ExecutionPlan): string[] {
  const { cells } = plan
  const sideCount = new Set(cells.map(({ side }) => side)).size
  // Guard the divisor so an empty plan reports 0 runs instead of NaN.
  const runsPerSide = cells.length / Math.max(1, sideCount)

  const summary = `\n📋 Dry-run: ${plan.total_runs} cells across ${sideCount} sides × ${runsPerSide} runs`
  const cellLines = cells.map(cell => {
    const controlTag = cell.control ? ' [control]' : ''
    return ` ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${controlTag}`
  })

  return [summary, ...cellLines]
}
|
|
27
39
|
|
|
28
|
-
export async function
|
|
40
|
+
/**
 * Execute (or merely plan) an arena described by a parsed arena.toml.
 *
 * With `dryRun` set, the execution plan is formatted through `log` and
 * returned as `{ plan }` without touching the filesystem. Otherwise the
 * function writes `arena.json`, runs every plan cell, aggregates per-side
 * statistics, runs the comparative judge, writes the report and finally
 * rewrites `arena.json` with status `completed`.
 *
 * @param opts.toml     Parsed arena configuration.
 * @param opts.taskPath Path of the task/scenario file.
 * @param opts.outDir   Artifacts directory; defaults to `<cwd>/runs/<arena id>`.
 * @param opts.dryRun   When truthy, only build and print the plan.
 * @param opts.log      Sink for dry-run output lines.
 */
export async function runArenaFromToml(opts: {
  toml: ArenaToml
  taskPath: string
  outDir?: string
  dryRun?: boolean
  log?: (msg: string) => void
}): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
  const { toml, taskPath, outDir, dryRun, log } = opts
  const plan = buildExecutionPlan(toml)

  // Dry-run: print the plan and stop before any filesystem or agent work.
  if (dryRun) {
    formatPlanOutput(plan).forEach(line => log?.(line))
    return { plan }
  }

  const arenaId = `arena-${stamp()}`
  const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
  const resolved = resolveSides(toml)

  // One participant per distinct side name; a later duplicate replaces the
  // earlier entry while keeping its original position.
  const uniqueSides = [...new Map(resolved.map(r => [r.side.name, r])).values()]
  const participants = uniqueSides.map(r => ({
    id: r.side.name,
    name: r.side.name,
    player: r.platform,
    deck: r.side.deck,
    description: `${r.playerName} × ${r.side.deck}`,
  }))

  const manifest = ArenaManifest.parse({
    id: arenaId,
    created_at: new Date().toISOString(),
    // Only the first 200 characters of the task file are stored.
    task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
    mode: 'decks',
    participants,
    criteria: toml.arena.criteria,
    status: 'running',
  })

  mkdirSync(artifactsDir, { recursive: true })
  const manifestPath = join(artifactsDir, 'arena.json')
  writeFileSync(manifestPath, JSON.stringify(manifest, null, 2) + '\n')

  // Verdicts collected per side, in the order sides first produce one.
  const verdictsBySide = new Map<string, JudgeVerdict[]>()
  const recordVerdict = (side: string, verdict: JudgeVerdict): void => {
    const bucket = verdictsBySide.get(side)
    if (bucket) {
      bucket.push(verdict)
    } else {
      verdictsBySide.set(side, [verdict])
    }
  }

  // Cells run sequentially; a failing cell is recorded as an ERROR verdict
  // instead of aborting the whole arena.
  for (const cell of plan.cells) {
    const sideDir = join(artifactsDir, 'runs', cell.side)
    mkdirSync(join(sideDir, `run-${cell.run}`), { recursive: true })

    try {
      const agent = useAgent(resolvePlayer(cell.player))
      const result = await runAgentScenario({
        scenarioPath: resolve(taskPath),
        agent,
        // Seed the agent's workdir with this cell's deck as skill-deck.toml.
        setupWorkdir(_scenario: AgentScenario, workdir: string) {
          mkdirSync(workdir, { recursive: true })
          const deckContent = readFileSync(resolve(cell.deck), 'utf-8')
          writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
        },
        baseDir: sideDir,
      })

      const verdict = (result.verdict ?? {
        verdict: 'ERROR' as const,
        reason: 'No verdict returned',
        criteria: [],
      }) as JudgeVerdict
      recordVerdict(cell.side, verdict)
    } catch (e) {
      recordVerdict(cell.side, {
        verdict: 'ERROR' as const,
        reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
        criteria: [],
      })
    }
  }

  const stats = aggregateAllStats(verdictsBySide)

  // The comparative judge only sees the first recorded verdict of each side.
  const flatVerdicts: { participantId: string; verdict: unknown }[] = [...verdictsBySide]
    .filter(([, verdicts]) => verdicts.length > 0)
    .map(([side, verdicts]) => ({ participantId: side, verdict: verdicts[0] }))

  // Judge with the first side's platform, falling back to 'claude'.
  const judge = useAgent(resolved[0]?.platform ?? 'claude')
  const report = await runComparativeJudge({
    manifest,
    verdicts: flatVerdicts,
    judge,
    workdir: artifactsDir,
  })

  writeReport(artifactsDir, manifest, report, stats)

  // Mark the arena as finished on disk.
  const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
  writeFileSync(manifestPath, JSON.stringify(finalManifest, null, 2) + '\n')

  return { manifest: finalManifest, report, stats, artifactsDir }
}
|
|
131
150
|
|
|
132
|
-
|
|
133
|
-
${renderPareto(report)}
|
|
151
|
+
// ── Backward compat: CLI-flag style runner ─────────────────────────────────
|
|
134
152
|
|
|
135
|
-
|
|
136
|
-
|
|
153
|
+
/**
 * Backward-compatible CLI-flag entry point.
 *
 * Expands `playerPaths × deckPaths` into one side per combination (named
 * `run-01`, `run-02`, … in player-major order), wraps them in an in-memory
 * `ArenaToml` with a single run per side, and delegates to `runArenaFromToml`.
 *
 * @returns The final manifest, the comparative report and the artifacts directory.
 * @throws If a task or player file cannot be read or parsed, or if the
 *         delegate unexpectedly returns a dry-run plan.
 */
export async function runArena(opts: {
  taskPath: string
  playerPaths: string[]
  deckPaths: string[]
  criteria: string[]
  outDir: string
}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
  const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts

  // Read the task first so a missing task file is reported before player errors.
  const task = readFileSync(resolve(taskPath), 'utf-8').slice(0, 200)

  const side = playerPaths.flatMap((playerPath, pi) => {
    // Parse each player file once, not once per deck.
    // NOTE(review): player files are parsed as JSON here, while the README's
    // CLI example passes a `.toml` player path — confirm the expected format.
    const platform = Player.parse(JSON.parse(readFileSync(resolve(playerPath), 'utf-8'))).platform
    return deckPaths.map((deckPath, di) => ({
      name: `run-${String(pi * deckPaths.length + di + 1).padStart(2, '0')}`,
      player: platform,
      deck: deckPath,
    }))
  })

  // Convert CLI flags to ArenaToml internally
  const toml: ArenaToml = {
    arena: {
      task,
      criteria,
      runs_per_side: 1,
      // Every player × deck pair is a participant, so the cap must cover the
      // full product; min(players, decks) would sit below the number of sides.
      max_participants: side.length,
    },
    side,
  }

  const result = await runArenaFromToml({ toml, taskPath, outDir })
  // dryRun is never requested here; narrow explicitly instead of casting.
  if ('plan' in result) {
    throw new Error('runArena: runArenaFromToml returned a dry-run plan unexpectedly')
  }

  const { manifest, report, artifactsDir } = result
  return { manifest, report, artifactsDir }
}
|
|
145
183
|
|
|
146
|
-
|
|
184
|
+
// ── Report renderer ────────────────────────────────────────────────────────
|
|
185
|
+
|
|
186
|
+
/**
 * Write `report.md` into `dir`.
 *
 * The Markdown document contains, in order: a header (arena id, task,
 * criteria, generation date), the score matrix, the per-side statistics
 * table, the Pareto frontier, key findings and recommendations. Missing
 * `key_findings` / `recommendations` produce empty sections.
 */
function writeReport(dir: string, manifest: ArenaManifestType, report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]; pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[]; key_findings?: string[]; recommendations?: { audience: string; recommendation: string }[] }, stats: SideStats[]): void {
  // A section is its heading, a blank line, its body lines and a trailing blank line.
  const section = (heading: string, ...body: string[]): string[] => [heading, '', ...body, '']

  const header: string[] = [
    `# Arena Report: ${manifest.id}`,
    '',
    `**Task**: ${manifest.task}`,
    `**Criteria**: ${manifest.criteria.join(', ')}`,
    `**Date**: ${new Date().toISOString()}`,
    '',
  ]

  const scoreSection = section('## Score Matrix', renderScoreMatrix(report))
  const statsSection = section('## Per-Side Statistics', renderStatsTable(stats))
  const paretoSection = section('## Pareto Frontier', renderPareto(report))

  const findings = (report.key_findings ?? []).map((finding: string) => `- ${finding}`)
  const recommendations = (report.recommendations ?? []).map(
    (rec: { audience: string; recommendation: string }) => `- **${rec.audience}**: ${rec.recommendation}`,
  )

  const lines: string[] = [
    ...header,
    ...scoreSection,
    ...statsSection,
    ...paretoSection,
    ...section('## Key Findings', ...findings),
    // The last section has no trailing blank line.
    '## Recommendations',
    '',
    ...recommendations,
  ]

  writeFileSync(join(dir, 'report.md'), lines.join('\n') + '\n')
}
|
|
148
217
|
|
|
149
|
-
|
|
218
|
+
/**
 * Render per-side statistics as a Markdown table, one row per side.
 *
 * Pass rate and per-criterion means are fractions and are shown multiplied
 * by 100; mean confidence is printed as-is with a `%` suffix, or `-` when
 * it is null. Returns a placeholder sentence when `stats` is empty.
 */
function renderStatsTable(stats: SideStats[]): string {
  if (stats.length === 0) return 'No statistics available.\n'

  const asPercent = (fraction: number): string => `${(fraction * 100).toFixed(0)}%`

  const rows = stats.map(side => {
    const confidence = side.meanConfidence != null ? `${side.meanConfidence.toFixed(0)}%` : '-'
    const criteria = side.criteria.map(c => `${c.name}: ${asPercent(c.mean)}`).join(', ')
    return `| ${side.sideName} | ${side.runs} | ${asPercent(side.passRate)} | ${confidence} | ${criteria} |`
  })

  const tableLines = [
    `| Side | Runs | Pass Rate | Mean Confidence | Criteria |`,
    `|------|------|-----------|-----------------|----------|`,
    ...rows,
  ]

  // Every line, including the last, is newline-terminated.
  return tableLines.map(line => `${line}\n`).join('')
}
|
|
150
232
|
|
|
151
233
|
function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[] }): string {
|
|
152
234
|
if (!report.score_matrix?.length) return 'No scores available.\n'
|
|
153
235
|
|
|
154
|
-
// Build participant × criterion matrix
|
|
155
236
|
const participants = [...new Set(report.score_matrix.map(s => s.participant_id))]
|
|
156
237
|
const criteria = [...new Set(report.score_matrix.map(s => s.criterion))]
|
|
157
238
|
|
|
158
|
-
let table = `| Criterion | Weight | ${participants.
|
|
239
|
+
let table = `| Criterion | Weight | ${participants.join(' | ')} |\n`
|
|
159
240
|
table += `|${'---|'.repeat(2 + participants.length)}\n`
|
|
160
241
|
|
|
161
242
|
for (const c of criteria) {
|
|
@@ -165,7 +246,6 @@ function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id:
|
|
|
165
246
|
}).join(' | ')} |\n`
|
|
166
247
|
}
|
|
167
248
|
|
|
168
|
-
// Weighted totals
|
|
169
249
|
table += `| **Weighted Total** | 100% | ${participants.map(p => {
|
|
170
250
|
const pScores = report.score_matrix!.filter(s => s.participant_id === p)
|
|
171
251
|
const avg = pScores.length ? pScores.reduce((sum, s) => sum + s.score, 0) / pScores.length : 0
|
|
@@ -177,11 +257,9 @@ function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id:
|
|
|
177
257
|
|
|
178
258
|
/**
 * Render the Pareto analysis as a Markdown bullet list, one bullet per
 * participant: dominated entries list who dominates them, the others are
 * marked Pareto-optimal. Returns a placeholder when there is no analysis.
 */
function renderPareto(report: unknown & { pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[] }): string {
  const entries = report.pareto
  if (!entries?.length) return 'No Pareto analysis.\n'

  const bullets: string[] = []
  for (const entry of entries) {
    if (entry.dominated) {
      bullets.push(`- **${entry.participant_id}**: dominated by ${entry.dominated_by.join(', ')}`)
    } else {
      bullets.push(`- **${entry.participant_id}**: Pareto-optimal (non-dominated)`)
    }
  }
  return bullets.join('\n')
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { aggregateSideStats, aggregateAllStats } from './stats'
|
|
3
|
+
import type { JudgeVerdict } from '@lythos/test-utils/schema'
|
|
4
|
+
|
|
5
|
+
/**
 * Test factory: a passing verdict with a single passed `correctness`
 * criterion, with any fields in `overrides` replacing the defaults.
 */
function makeVerdict(overrides?: Partial<JudgeVerdict>): JudgeVerdict {
  const passing: JudgeVerdict = {
    verdict: 'PASS',
    reason: 'OK',
    criteria: [{ name: 'correctness', passed: true }],
  }
  return { ...passing, ...overrides }
}
|
|
13
|
+
|
|
14
|
+
// ── aggregateSideStats ─────────────────────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
// Covers outcome rates, confidence mean/variance, per-criterion pass rates,
// per-criterion scores and the empty-input edge case.
describe('aggregateSideStats', () => {
  test('single run: passRate=1, no variance', () => {
    const stats = aggregateSideStats('test', [makeVerdict()])
    expect(stats.sideName).toBe('test')
    expect(stats.runs).toBe(1)
    expect(stats.passRate).toBe(1)
    expect(stats.failRate).toBe(0)
    expect(stats.errorRate).toBe(0)
    // "no variance": a lone run cannot show any spread
    expect(stats.confidenceVariance).toBeNull()
    expect(stats.criteria[0].variance).toBe(0)
  })

  test('3 runs: 2 PASS, 1 FAIL', () => {
    const verdicts = [
      makeVerdict(),
      makeVerdict(),
      makeVerdict({ verdict: 'FAIL', reason: 'bad' }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.passRate).toBeCloseTo(2 / 3)
    expect(stats.failRate).toBeCloseTo(1 / 3)
    expect(stats.errorRate).toBe(0)
  })

  test('confidence: mean across runs', () => {
    const verdicts = [
      makeVerdict({ confidence: 90 }),
      makeVerdict({ confidence: 80 }),
      makeVerdict({ confidence: 70 }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.meanConfidence).toBeCloseTo(80)
    expect(stats.confidenceVariance).toBeCloseTo(100) // sample variance: (100+0+100)/2 = 100
  })

  test('confidence: null when no verdict has it', () => {
    const stats = aggregateSideStats('test', [makeVerdict(), makeVerdict()])
    expect(stats.meanConfidence).toBeNull()
    expect(stats.confidenceVariance).toBeNull()
  })

  test('confidence: variance is null with a single sample', () => {
    // Only one of the two runs reports a confidence value
    const stats = aggregateSideStats('test', [makeVerdict({ confidence: 90 }), makeVerdict()])
    expect(stats.meanConfidence).toBe(90)
    expect(stats.confidenceVariance).toBeNull()
  })

  test('per-criterion pass rate', () => {
    const verdicts = [
      makeVerdict({ criteria: [{ name: 'accuracy', passed: true }] }),
      makeVerdict({ criteria: [{ name: 'accuracy', passed: false }] }),
      makeVerdict({ criteria: [{ name: 'accuracy', passed: true }] }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.criteria).toHaveLength(1)
    expect(stats.criteria[0].name).toBe('accuracy')
    expect(stats.criteria[0].mean).toBeCloseTo(2 / 3)
    expect(stats.criteria[0].count).toBe(3)
  })

  test('per-criterion count only includes runs that reported it', () => {
    const verdicts = [
      makeVerdict({ criteria: [{ name: 'a', passed: true }, { name: 'b', passed: false }] }),
      makeVerdict({ criteria: [{ name: 'a', passed: true }] }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.criteria).toHaveLength(2)
    expect(stats.criteria[0].name).toBe('a')
    expect(stats.criteria[0].count).toBe(2)
    expect(stats.criteria[0].mean).toBe(1)
    expect(stats.criteria[1].name).toBe('b')
    expect(stats.criteria[1].count).toBe(1)
    expect(stats.criteria[1].mean).toBe(0)
  })

  test('per-criterion scores: mean and variance', () => {
    const verdicts = [
      makeVerdict({ scores: { coverage: 5, relevance: 4 } }),
      makeVerdict({ scores: { coverage: 3, relevance: 4 } }),
      makeVerdict({ scores: { coverage: 4, relevance: 4 } }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.scoreByCriterion.coverage.mean).toBeCloseTo(4)
    expect(stats.scoreByCriterion.coverage.variance).toBeCloseTo(1) // (1+1+0)/2
    expect(stats.scoreByCriterion.relevance.mean).toBeCloseTo(4)
    expect(stats.scoreByCriterion.relevance.variance).toBe(0) // all 4s
  })

  test('zero runs: all zeros', () => {
    const stats = aggregateSideStats('empty', [])
    expect(stats.runs).toBe(0)
    expect(stats.passRate).toBe(0)
    expect(stats.failRate).toBe(0)
    expect(stats.errorRate).toBe(0)
    expect(stats.meanConfidence).toBeNull()
    expect(stats.confidenceVariance).toBeNull()
    expect(stats.criteria).toHaveLength(0)
    expect(stats.scoreByCriterion).toEqual({})
  })

  test('handles ERROR verdicts correctly', () => {
    const verdicts = [
      makeVerdict(),
      makeVerdict({ verdict: 'ERROR', reason: 'parse failed' }),
    ]
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.passRate).toBe(0.5)
    expect(stats.failRate).toBe(0)
    expect(stats.errorRate).toBe(0.5)
  })
})
|
|
95
|
+
|
|
96
|
+
// ── aggregateAllStats ──────────────────────────────────────────────────────
|
|
97
|
+
|
|
98
|
+
// Results must come back one per side, in the map's insertion order.
describe('aggregateAllStats', () => {
  test('aggregates multiple sides', () => {
    const verdictsBySide = new Map<string, JudgeVerdict[]>([
      ['side-a', [makeVerdict(), makeVerdict()]],
      ['side-b', [makeVerdict({ verdict: 'FAIL', reason: 'nope' })]],
    ])

    const allStats = aggregateAllStats(verdictsBySide)
    expect(allStats).toHaveLength(2)

    const [sideA, sideB] = allStats
    expect(sideA.sideName).toBe('side-a')
    expect(sideA.passRate).toBe(1)
    expect(sideB.sideName).toBe('side-b')
    expect(sideB.passRate).toBe(0)
  })
})
|
package/src/stats.ts
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import type { JudgeVerdict } from '@lythos/test-utils/schema'
|
|
2
|
+
|
|
3
|
+
// ── Statistical aggregation for runs_per_side ─────────────────────────────
|
|
4
|
+
// All pure functions. Input: N verdicts from N runs. Output: aggregated stats.
|
|
5
|
+
|
|
6
|
+
/**
 * Aggregate of one named criterion across the runs of a side.
 *
 * As filled in by `aggregateSideStats`, the values describe a pass/fail
 * criterion: `mean` is the pass rate and `variance` is `p * (1 - p)`.
 */
export interface CriterionStats {
  /** Criterion name as reported in the verdicts. */
  name: string
  /** Mean value; for pass/fail criteria, the fraction of runs that passed. */
  mean: number
  /** Spread of the values; for pass/fail criteria, `mean * (1 - mean)`. */
  variance: number
  /** Lower end of the value range (0 for pass/fail criteria). */
  min: number
  /** Upper end of the value range (1 for pass/fail criteria). */
  max: number
  count: number // number of runs that reported this criterion
}
|
|
14
|
+
|
|
15
|
+
/** Statistics aggregated over all runs of a single side. */
export interface SideStats {
  /** Name of the side the verdicts belong to. */
  sideName: string
  /** Number of verdicts (runs) aggregated. */
  runs: number
  passRate: number // PASS / total
  /** FAIL / total. */
  failRate: number
  /** ERROR / total. */
  errorRate: number
  meanConfidence: number | null // null if no verdict had confidence
  /** Sample variance of the reported confidences; null with fewer than two. */
  confidenceVariance: number | null
  /** Pass/fail aggregates, one entry per criterion name found in `verdict.criteria`. */
  criteria: CriterionStats[]
  /** Mean and variance of the numeric values in `verdict.scores`, keyed by criterion. */
  scoreByCriterion: Record<string, { mean: number; variance: number }>
}
|
|
26
|
+
|
|
27
|
+
// ── Helpers ────────────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
/** Arithmetic mean of `values`; an empty array yields 0 rather than NaN. */
function mean(values: number[]): number {
  const n = values.length
  if (n === 0) return 0

  let total = 0
  for (const value of values) {
    total += value
  }
  return total / n
}
|
|
33
|
+
|
|
34
|
+
/**
 * Sample variance (divides by n - 1) of `values`.
 *
 * @param values Samples; fewer than two yields 0.
 * @param m      Precomputed mean of `values`; computed via `mean` when omitted.
 */
function variance(values: number[], m?: number): number {
  const n = values.length
  if (n < 2) return 0

  const center = m ?? mean(values)
  let squaredDeviations = 0
  for (const value of values) {
    squaredDeviations += (value - center) ** 2
  }
  return squaredDeviations / (n - 1)
}
|
|
39
|
+
|
|
40
|
+
// ── Aggregator ────────────────────────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
/**
 * Aggregate the verdicts of one side's runs into a `SideStats` record.
 *
 * @param sideName Name copied into the result.
 * @param verdicts One verdict per run; may be empty (all rates are then 0).
 * @returns Outcome rates, confidence mean/variance (null when not enough
 *          verdicts carry a confidence), per-criterion pass statistics and
 *          per-criterion score statistics.
 */
export function aggregateSideStats(sideName: string, verdicts: JudgeVerdict[]): SideStats {
  const runs = verdicts.length
  const countOutcome = (outcome: string): number => verdicts.filter(v => v.verdict === outcome).length
  // Avoid 0/0 when there are no runs.
  const rateOf = (count: number): number => (runs > 0 ? count / runs : 0)

  const passCount = countOutcome('PASS')
  const failCount = countOutcome('FAIL')
  const errorCount = countOutcome('ERROR')

  // Confidence: only verdicts that actually carry a value take part.
  const confidences: number[] = []
  for (const v of verdicts) {
    if (v.confidence != null) confidences.push(v.confidence)
  }
  let meanConfidence: number | null = null
  let confidenceVariance: number | null = null
  if (confidences.length > 0) {
    meanConfidence = mean(confidences)
    if (confidences.length > 1) {
      confidenceVariance = variance(confidences, meanConfidence)
    }
  }

  // Pass/fail outcomes per criterion name, in first-seen order.
  const outcomesByCriterion = new Map<string, boolean[]>()
  for (const v of verdicts) {
    for (const c of v.criteria ?? []) {
      const outcomes = outcomesByCriterion.get(c.name)
      if (outcomes) {
        outcomes.push(c.passed)
      } else {
        outcomesByCriterion.set(c.name, [c.passed])
      }
    }
  }

  const criteria: CriterionStats[] = Array.from(outcomesByCriterion, ([name, outcomes]) => {
    const passRate = outcomes.filter(passed => passed).length / outcomes.length
    return {
      name,
      mean: passRate, // for criteria, "mean" = pass rate across runs
      // NOTE(review): population-style Bernoulli variance, whereas the
      // `variance` helper is a sample variance — confirm this is intended.
      variance: passRate * (1 - passRate),
      // NOTE(review): fixed bounds of a pass/fail indicator, not observed extremes.
      min: 0,
      max: 1,
      count: outcomes.length,
    }
  })

  // Numeric scores per criterion, collected from verdict.scores.
  const scoresByCriterion = new Map<string, number[]>()
  for (const v of verdicts) {
    if (!v.scores) continue
    for (const [criterion, score] of Object.entries(v.scores)) {
      const scores = scoresByCriterion.get(criterion)
      if (scores) {
        scores.push(score)
      } else {
        scoresByCriterion.set(criterion, [score])
      }
    }
  }

  const scoreByCriterion: Record<string, { mean: number; variance: number }> = {}
  for (const [criterion, scores] of scoresByCriterion) {
    const scoreMean = mean(scores)
    const scoreVariance = scores.length > 1 ? variance(scores, scoreMean) : 0
    scoreByCriterion[criterion] = { mean: scoreMean, variance: scoreVariance }
  }

  return {
    sideName,
    runs,
    passRate: rateOf(passCount),
    failRate: rateOf(failCount),
    errorRate: rateOf(errorCount),
    meanConfidence,
    confidenceVariance,
    criteria,
    scoreByCriterion,
  }
}
|
|
107
|
+
|
|
108
|
+
/**
 * Aggregate every side of a `sideName → verdicts[]` map.
 *
 * @returns One `SideStats` per map entry, in the map's iteration order.
 */
export function aggregateAllStats(
  verdictsBySide: Map<string, JudgeVerdict[]>
): SideStats[] {
  return Array.from(verdictsBySide, ([sideName, verdicts]) => aggregateSideStats(sideName, verdicts))
}
|