@lythos/skill-arena 0.9.2 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/arena-toml.test.ts +164 -0
- package/src/arena-toml.ts +172 -0
- package/src/cli.ts +49 -5
- package/src/player.test.ts +95 -0
- package/src/player.ts +71 -0
- package/src/runner.ts +155 -92
- package/src/stats.test.ts +111 -0
- package/src/stats.ts +117 -0
package/package.json
CHANGED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
|
|
3
|
+
|
|
4
|
+
const minimalToml = `
|
|
5
|
+
[arena]
|
|
6
|
+
task = "Test task"
|
|
7
|
+
criteria = ["a", "b"]
|
|
8
|
+
|
|
9
|
+
[[side]]
|
|
10
|
+
name = "runner-a"
|
|
11
|
+
player = "claude-code"
|
|
12
|
+
deck = "./decks/a.toml"
|
|
13
|
+
|
|
14
|
+
[[side]]
|
|
15
|
+
name = "runner-b"
|
|
16
|
+
player = "claude-code"
|
|
17
|
+
deck = "./decks/b.toml"
|
|
18
|
+
`
|
|
19
|
+
|
|
20
|
+
const fullToml = `
|
|
21
|
+
[arena]
|
|
22
|
+
task = "Generate auth flow diagram"
|
|
23
|
+
criteria = ["syntax", "context", "logic", "token"]
|
|
24
|
+
runs_per_side = 3
|
|
25
|
+
|
|
26
|
+
[[side]]
|
|
27
|
+
name = "minimal"
|
|
28
|
+
player = "standard-coder"
|
|
29
|
+
deck = "./decks/minimal.toml"
|
|
30
|
+
|
|
31
|
+
[[side]]
|
|
32
|
+
name = "rich"
|
|
33
|
+
player = "expert-architect"
|
|
34
|
+
deck = "./decks/rich.toml"
|
|
35
|
+
|
|
36
|
+
[[side]]
|
|
37
|
+
name = "baseline"
|
|
38
|
+
player = "standard-coder"
|
|
39
|
+
deck = "./decks/baseline.toml"
|
|
40
|
+
control = true
|
|
41
|
+
|
|
42
|
+
[side.env]
|
|
43
|
+
container = "node:20-alpine"
|
|
44
|
+
pre_run = ["npm ci", "npm run build"]
|
|
45
|
+
working_dir = "/workspace"
|
|
46
|
+
`
|
|
47
|
+
|
|
48
|
+
// ── Schema + Parser ────────────────────────────────────────────────────────
|
|
49
|
+
|
|
50
|
+
describe('parseArenaToml', () => {
|
|
51
|
+
test('parses minimal two-side arena', () => {
|
|
52
|
+
const result = parseArenaToml(minimalToml)
|
|
53
|
+
expect(result.arena.task).toBe('Test task')
|
|
54
|
+
expect(result.arena.criteria).toEqual(['a', 'b'])
|
|
55
|
+
expect(result.arena.runs_per_side).toBe(1) // default
|
|
56
|
+
expect(result.side).toHaveLength(2)
|
|
57
|
+
expect(result.side[0].name).toBe('runner-a')
|
|
58
|
+
expect(result.side[0].player).toBe('claude-code')
|
|
59
|
+
expect(result.side[0].deck).toBe('./decks/a.toml')
|
|
60
|
+
expect(result.side[0].control).toBe(false) // default
|
|
61
|
+
})
|
|
62
|
+
|
|
63
|
+
test('parses full arena with runs_per_side and control', () => {
|
|
64
|
+
const result = parseArenaToml(fullToml)
|
|
65
|
+
expect(result.arena.runs_per_side).toBe(3)
|
|
66
|
+
expect(result.side).toHaveLength(3)
|
|
67
|
+
expect(result.side[2].name).toBe('baseline')
|
|
68
|
+
expect(result.side[2].control).toBe(true)
|
|
69
|
+
})
|
|
70
|
+
|
|
71
|
+
test('parses side env block', () => {
|
|
72
|
+
const result = parseArenaToml(fullToml)
|
|
73
|
+
const env = result.side[2].env
|
|
74
|
+
expect(env.container).toBe('node:20-alpine')
|
|
75
|
+
expect(env.pre_run).toEqual(['npm ci', 'npm run build'])
|
|
76
|
+
expect(env.working_dir).toBe('/workspace')
|
|
77
|
+
expect(env.env_vars).toEqual({})
|
|
78
|
+
})
|
|
79
|
+
|
|
80
|
+
test('rejects fewer than 2 sides', () => {
|
|
81
|
+
const bad = `[arena]\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "only"\nplayer = "c"\ndeck = "x.toml"`
|
|
82
|
+
expect(() => parseArenaToml(bad)).toThrow()
|
|
83
|
+
})
|
|
84
|
+
|
|
85
|
+
test('rejects empty criteria', () => {
|
|
86
|
+
const bad = `[arena]\ntask = "x"\ncriteria = []\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
87
|
+
expect(() => parseArenaToml(bad)).toThrow()
|
|
88
|
+
})
|
|
89
|
+
|
|
90
|
+
test('rejects non-object input', () => {
|
|
91
|
+
expect(() => ArenaToml.parse('not valid')).toThrow()
|
|
92
|
+
})
|
|
93
|
+
|
|
94
|
+
test('rejects missing arena section', () => {
|
|
95
|
+
expect(() => parseArenaToml('[[side]]\nname = "a"')).toThrow()
|
|
96
|
+
})
|
|
97
|
+
|
|
98
|
+
test('rejects runs_per_side = 0', () => {
|
|
99
|
+
const bad = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 0\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
100
|
+
expect(() => parseArenaToml(bad)).toThrow()
|
|
101
|
+
})
|
|
102
|
+
|
|
103
|
+
test('parses integer and boolean values correctly', () => {
|
|
104
|
+
const toml = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 2\nmax_participants = 5\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
105
|
+
const result = parseArenaToml(toml)
|
|
106
|
+
expect(result.arena.runs_per_side).toBe(2)
|
|
107
|
+
expect(result.arena.max_participants).toBe(5)
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
test('comments are stripped', () => {
|
|
111
|
+
const toml = `[arena]\n# this is a comment\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
112
|
+
const result = parseArenaToml(toml)
|
|
113
|
+
expect(result.arena.task).toBe('x')
|
|
114
|
+
})
|
|
115
|
+
})
|
|
116
|
+
|
|
117
|
+
// ── Execution Plan ─────────────────────────────────────────────────────────
|
|
118
|
+
|
|
119
|
+
describe('buildExecutionPlan', () => {
|
|
120
|
+
test('generates plan: 2 sides × 1 run = 2 cells', () => {
|
|
121
|
+
const toml = parseArenaToml(minimalToml)
|
|
122
|
+
const plan = buildExecutionPlan(toml)
|
|
123
|
+
expect(plan.task).toBe('Test task')
|
|
124
|
+
expect(plan.criteria).toEqual(['a', 'b'])
|
|
125
|
+
expect(plan.cells).toHaveLength(2)
|
|
126
|
+
expect(plan.total_runs).toBe(2)
|
|
127
|
+
expect(plan.cells[0]).toEqual({ side: 'runner-a', player: 'claude-code', deck: './decks/a.toml', run: 1, control: false })
|
|
128
|
+
expect(plan.cells[1]).toEqual({ side: 'runner-b', player: 'claude-code', deck: './decks/b.toml', run: 1, control: false })
|
|
129
|
+
})
|
|
130
|
+
|
|
131
|
+
test('generates plan: 3 sides × 3 runs = 9 cells', () => {
|
|
132
|
+
const toml = parseArenaToml(fullToml)
|
|
133
|
+
const plan = buildExecutionPlan(toml)
|
|
134
|
+
expect(plan.cells).toHaveLength(9)
|
|
135
|
+
expect(plan.total_runs).toBe(9)
|
|
136
|
+
|
|
137
|
+
// Cells are ordered: side 0 run 1, side 0 run 2, side 0 run 3, side 1 run 1, ...
|
|
138
|
+
expect(plan.cells[0]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 1, control: false })
|
|
139
|
+
expect(plan.cells[1]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 2, control: false })
|
|
140
|
+
expect(plan.cells[2]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 3, control: false })
|
|
141
|
+
expect(plan.cells[3]).toEqual({ side: 'rich', player: 'expert-architect', deck: './decks/rich.toml', run: 1, control: false })
|
|
142
|
+
expect(plan.cells[8]).toEqual({ side: 'baseline', player: 'standard-coder', deck: './decks/baseline.toml', run: 3, control: true })
|
|
143
|
+
})
|
|
144
|
+
|
|
145
|
+
test('control flag preserved in plan cells', () => {
|
|
146
|
+
const toml = parseArenaToml(fullToml)
|
|
147
|
+
const plan = buildExecutionPlan(toml)
|
|
148
|
+
const baselineCells = plan.cells.filter(c => c.side === 'baseline')
|
|
149
|
+
expect(baselineCells).toHaveLength(3)
|
|
150
|
+
expect(baselineCells.every(c => c.control)).toBe(true)
|
|
151
|
+
})
|
|
152
|
+
|
|
153
|
+
test('dry-run: plan is pure data, no side effects', () => {
|
|
154
|
+
// The entire plan generation is a pure function — dry-run is just printing it
|
|
155
|
+
const toml = parseArenaToml(fullToml)
|
|
156
|
+
const plan = buildExecutionPlan(toml)
|
|
157
|
+
// Verify plan is self-describing for a --dry-run output
|
|
158
|
+
expect(plan.total_runs).toBeGreaterThan(0)
|
|
159
|
+
expect(plan.cells.every(c => typeof c.side === 'string')).toBe(true)
|
|
160
|
+
expect(plan.cells.every(c => typeof c.player === 'string')).toBe(true)
|
|
161
|
+
expect(plan.cells.every(c => typeof c.deck === 'string')).toBe(true)
|
|
162
|
+
expect(plan.cells.every(c => typeof c.run === 'number')).toBe(true)
|
|
163
|
+
})
|
|
164
|
+
})
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import { z } from 'zod'
|
|
2
|
+
import type { ArenaManifest } from '@lythos/test-utils/schema'
|
|
3
|
+
|
|
4
|
+
// ── arena.toml Zod schema (declarative input, k8s-manifest style) ──────────
|
|
5
|
+
// Anchored on: ADR-20260502110308316
|
|
6
|
+
|
|
7
|
+
/**
 * Optional per-side environment block (`[side.env]` in arena.toml).
 *
 * `pre_run` and `env_vars` fall back to an empty list / empty record when
 * omitted; `container` and `working_dir` may be left out entirely.
 * NOTE(review): how these values are consumed is not visible in this module —
 * confirm against the runner before relying on any of them.
 */
export const SideEnv = z.object({
  /** Container reference, e.g. "node:20-alpine". */
  container: z.string().optional(),
  /** Commands listed for execution before a run; defaults to `[]`. */
  pre_run: z.array(z.string()).default([]),
  /** Working directory, e.g. "/workspace". */
  working_dir: z.string().optional(),
  /** String-to-string map of environment variables; defaults to `{}`. */
  env_vars: z.record(z.string()).default({}),
})

/** Parsed (defaults applied) shape of a side's environment block. */
export type SideEnv = z.infer<typeof SideEnv>
|
|
14
|
+
|
|
15
|
+
/**
 * One `[[side]]` entry of arena.toml: a named competitor made of a player
 * reference and a deck path.
 */
export const Side = z.object({
  /** Side name. */
  name: z.string(),
  /** Reference to a player config (useAgent resolves it). */
  player: z.string(),
  /** Path to the side's deck.toml. */
  deck: z.string(),
  /** Marks the side as the control; defaults to `false`. */
  control: z.boolean().default(false),
  /** Environment block; defaults to an empty `SideEnv`. */
  env: SideEnv.default({}),
})

/** Parsed (defaults applied) shape of a single side. */
export type Side = z.infer<typeof Side>
|
|
23
|
+
|
|
24
|
+
/** Settings found under the `[arena]` table. */
const ArenaSection = z.object({
  /** Task description or path to TASK-arena.md. */
  task: z.string(),
  /** Evaluation criteria; at least one is required. */
  criteria: z.array(z.string()).min(1),
  /** Number of runs per side; a positive integer, defaults to 1. */
  runs_per_side: z.number().int().positive().default(1),
  /**
   * Integer between 2 and 5, defaults to 5.
   * NOTE(review): this schema does not compare it against the number of sides.
   */
  max_participants: z.number().int().min(2).max(5).default(5),
})

/**
 * Root schema of an arena.toml document: one `[arena]` table plus between
 * two and five `[[side]]` entries.
 */
export const ArenaToml = z.object({
  arena: ArenaSection,
  side: z.array(Side).min(2).max(5),
})

/** Parsed (defaults applied) shape of a whole arena.toml document. */
export type ArenaToml = z.infer<typeof ArenaToml>
|
|
34
|
+
|
|
35
|
+
// ── Parser ─────────────────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
/**
 * Parse the text of an arena.toml file and validate it.
 *
 * The text goes through the in-file minimal TOML reader (no external TOML
 * dependency), and the resulting plain object is checked against the
 * `ArenaToml` schema.
 *
 * @param content Raw arena.toml text.
 * @returns The validated document with schema defaults applied.
 * @throws Whatever `ArenaToml.parse` throws when the document is invalid.
 */
export function parseArenaToml(content: string): ArenaToml {
  const rawDocument = parseToml(content)
  const validated = ArenaToml.parse(rawDocument)
  return validated
}
|
|
42
|
+
|
|
43
|
+
// ── Plan generation (pure function, dry-run visible) ───────────────────────
|
|
44
|
+
|
|
45
|
+
/** A single planned run: one side executed once. */
export interface ExecutionCell {
  /** Side name. */
  side: string
  /** Player reference. */
  player: string
  /** Deck path. */
  deck: string
  /** 1-indexed run number. */
  run: number
  /** Control flag carried over from the side. */
  control: boolean
}
|
|
52
|
+
|
|
53
|
+
/** Full list of planned runs for an arena, plus the task and criteria they share. */
export interface ExecutionPlan {
  /** Task string taken from the arena settings. */
  task: string
  /** Evaluation criteria taken from the arena settings. */
  criteria: string[]
  /** Planned runs, in execution order. */
  cells: ExecutionCell[]
  /** Number of planned runs. */
  total_runs: number
}
|
|
59
|
+
|
|
60
|
+
/**
 * Expand a validated arena document into its execution plan.
 *
 * Every side contributes `runs_per_side` consecutive cells numbered from 1,
 * and sides keep the order in which they appear in the document. The function
 * performs no I/O, so the result can be printed as-is for a dry run.
 *
 * @param toml Validated arena document.
 * @returns The plan; `total_runs` equals the number of cells.
 */
export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
  const { task, criteria, runs_per_side: runsPerSide } = toml.arena

  const cells = toml.side.flatMap(({ name, player, deck, control }) =>
    Array.from(
      { length: runsPerSide },
      (_unused, offset): ExecutionCell => ({
        side: name,
        player,
        deck,
        run: offset + 1, // run numbers are 1-indexed
        control,
      }),
    ),
  )

  return { task, criteria, cells, total_runs: cells.length }
}
|
|
80
|
+
|
|
81
|
+
// ── Minimal TOML parser (handles the arena.toml subset without external dep) ──
|
|
82
|
+
|
|
83
|
+
/** True when `value` is a plain table (object that is neither null nor an array). */
function isTomlTable(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value)
}

/** True when `token` is wrapped in a matching pair of double or single quotes. */
function isQuotedTomlString(token: string): boolean {
  if (token.length < 2) return false
  const opener = token.charAt(0)
  return (opener === '"' || opener === "'") && token.endsWith(opener)
}

/** Drops a trailing `# comment`, leaving `#` characters inside quoted strings alone. */
function stripTomlComment(line: string): string {
  let quote = ''
  for (let i = 0; i < line.length; i++) {
    const ch = line.charAt(i)
    if (quote) {
      // Inside a double-quoted string a backslash escapes the next character,
      // so an escaped quote must not close the string.
      if (ch === '\\' && quote === '"') i++
      else if (ch === quote) quote = ''
    } else if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === '#') {
      return line.slice(0, i)
    }
  }
  return line
}

/** Splits the inside of an inline array on commas that sit outside quoted strings. */
function splitTomlArrayItems(inner: string): string[] {
  const items: string[] = []
  let quote = ''
  let start = 0
  for (let i = 0; i < inner.length; i++) {
    const ch = inner.charAt(i)
    if (quote) {
      if (ch === '\\' && quote === '"') i++
      else if (ch === quote) quote = ''
    } else if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === ',') {
      items.push(inner.slice(start, i).trim())
      start = i + 1
    }
  }
  items.push(inner.slice(start).trim())
  // An empty array or a trailing comma leaves one empty item at the end.
  if (items[items.length - 1] === '') items.pop()
  return items
}

/** Converts the text to the right of `=` into a string, boolean, number or string array. */
function parseTomlValue(raw: string): unknown {
  // Quoted values are always strings: "123", "true" and "[x]" must not be
  // re-interpreted as a number, a boolean or an array after unquoting.
  if (isQuotedTomlString(raw)) return raw.slice(1, -1)

  if (raw.startsWith('[') && raw.endsWith(']')) {
    // Array items stay strings; only surrounding quotes are removed.
    return splitTomlArrayItems(raw.slice(1, -1).trim()).map(item =>
      isQuotedTomlString(item) ? item.slice(1, -1) : item,
    )
  }

  if (raw === 'true') return true
  if (raw === 'false') return false
  if (/^-?\d+(\.\d+)?$/.test(raw)) return Number(raw)
  return raw // bare, unrecognised token: kept verbatim
}

/**
 * Minimal TOML reader covering the subset arena.toml needs, with no external
 * dependency.
 *
 * Supported: `[table]` headers, `[[array-of-tables]]` headers, dotted headers
 * such as `[side.env]`, `# comments`, and single-line `key = value` pairs whose
 * value is a quoted string, a boolean, a number or an inline array.
 *
 * Not supported: multi-line values, inline tables, escape-sequence decoding
 * (string contents are returned exactly as written between the quotes).
 * Lines that are neither a header nor contain `=` are ignored.
 *
 * A dotted header whose first segment names an array of tables attaches to the
 * most recent entry of that array; otherwise it is nested under the document
 * root, creating intermediate tables as needed.
 *
 * @param text Raw TOML text.
 * @returns Plain object tree; arrays of tables appear as arrays on the root.
 */
function parseToml(text: string): Record<string, unknown> {
  const result: Record<string, unknown> = {}
  let currentTable: Record<string, unknown> = result
  const arrayTables = new Map<string, Record<string, unknown>[]>()

  for (const rawLine of text.split('\n')) {
    const line = stripTomlComment(rawLine).trim()
    if (!line) continue

    // [[array]] — start a new entry in the named array of tables
    const arrayMatch = line.match(/^\[\[(.+?)\]\]$/)
    if (arrayMatch) {
      const key = arrayMatch[1].trim()
      const entries = arrayTables.get(key) ?? []
      arrayTables.set(key, entries)
      currentTable = {}
      entries.push(currentTable)
      continue
    }

    // [section] or [parent.child]
    const sectionMatch = line.match(/^\[(.+?)\]$/)
    if (sectionMatch) {
      const path = sectionMatch[1].split('.').map(part => part.trim())
      const parentEntries = path.length > 1 ? arrayTables.get(path[0]) : undefined

      let container: Record<string, unknown> = result
      let depth = 0
      if (parentEntries && parentEntries.length > 0) {
        // e.g. [side.env] belongs to the latest [[side]] entry
        container = parentEntries[parentEntries.length - 1]
        depth = 1
      }
      // Walk (and create) intermediate tables so the keys that follow can
      // never leak into whatever table happened to be current before.
      for (; depth < path.length - 1; depth++) {
        const existing = container[path[depth]]
        if (isTomlTable(existing)) {
          container = existing
        } else {
          const created: Record<string, unknown> = {}
          container[path[depth]] = created
          container = created
        }
      }

      currentTable = {}
      container[path[path.length - 1]] = currentTable
      continue
    }

    // key = value
    const eqIdx = line.indexOf('=')
    if (eqIdx === -1) continue
    const key = line.slice(0, eqIdx).trim()
    currentTable[key] = parseTomlValue(line.slice(eqIdx + 1).trim())
  }

  // Materialize array tables into result
  for (const [key, entries] of arrayTables) {
    result[key] = entries
  }

  return result
}
|
package/src/cli.ts
CHANGED
|
@@ -35,7 +35,7 @@ Usage:
|
|
|
35
35
|
lythoskill-arena viz <arena-dir>
|
|
36
36
|
|
|
37
37
|
Commands:
|
|
38
|
-
run Run arena programmatically (
|
|
38
|
+
run Run arena programmatically (declarative arena.toml or CLI flags)
|
|
39
39
|
scaffold Create arena directory structure (legacy, manual subagent execution)
|
|
40
40
|
viz Visualize arena report (ASCII charts)
|
|
41
41
|
|
|
@@ -44,14 +44,23 @@ Options:
|
|
|
44
44
|
-s, --skills <list> Comma-separated skill names (scaffold only)
|
|
45
45
|
--decks <list> Comma-separated deck paths
|
|
46
46
|
-c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
|
|
47
|
-
--players <list> Comma-separated player.toml paths (run only)
|
|
47
|
+
--players <list> Comma-separated player.toml paths (CLI run only)
|
|
48
|
+
--config <path> Path to arena.toml (declarative mode, k8s-style)
|
|
49
|
+
--dry-run Print execution plan without running (with --config)
|
|
48
50
|
--control <skill> Control skill for comparison (scaffold only)
|
|
49
51
|
--out <dir> Output directory (run: defaults to runs/arena-<id>)
|
|
50
52
|
-d, --dir <dir> Output directory (scaffold: defaults to tmp)
|
|
51
53
|
-p, --project <dir> Project directory (default: .)
|
|
52
54
|
|
|
53
55
|
Examples:
|
|
54
|
-
|
|
56
|
+
# Declarative mode (k8s-style)
|
|
57
|
+
lythoskill-arena run --config ./arena.toml
|
|
58
|
+
lythoskill-arena run --config ./arena.toml --dry-run
|
|
59
|
+
|
|
60
|
+
# CLI-flag mode (backward compat)
|
|
61
|
+
lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
|
|
62
|
+
|
|
63
|
+
# Legacy scaffolding
|
|
55
64
|
lythoskill-arena scaffold --task "Refactor auth module" --skills skill-a,skill-b
|
|
56
65
|
lythoskill-arena viz runs/arena-20260504
|
|
57
66
|
`)
|
|
@@ -563,9 +572,45 @@ function runViz(argv: string[]) {
|
|
|
563
572
|
|
|
564
573
|
async function runProgrammaticArena(argv: string[]) {
|
|
565
574
|
const { options } = parseArgs(argv)
|
|
575
|
+
const { readFileSync } = await import('node:fs')
|
|
576
|
+
|
|
577
|
+
const hasConfig = !!(options as Record<string, string | undefined>).config
|
|
578
|
+
const dryRun = argv.includes('--dry-run')
|
|
579
|
+
|
|
580
|
+
if (hasConfig) {
|
|
581
|
+
// arena.toml declarative mode
|
|
582
|
+
const { parseArenaToml } = await import('./arena-toml')
|
|
583
|
+
const { runArenaFromToml } = await import('./runner')
|
|
584
|
+
const configPath = (options as Record<string, string | undefined>).config!
|
|
585
|
+
|
|
586
|
+
const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
|
|
587
|
+
const result = await runArenaFromToml({
|
|
588
|
+
toml,
|
|
589
|
+
taskPath: toml.arena.task.startsWith('/') || toml.arena.task.startsWith('./')
|
|
590
|
+
? toml.arena.task
|
|
591
|
+
: (options as Record<string, string | undefined>).task ?? toml.arena.task,
|
|
592
|
+
outDir: (options as Record<string, string | undefined>).out,
|
|
593
|
+
dryRun,
|
|
594
|
+
})
|
|
595
|
+
|
|
596
|
+
if ('plan' in result) {
|
|
597
|
+
// dry-run
|
|
598
|
+
console.log(`\n📋 Dry-run: ${result.plan.total_runs} cells across ${result.plan.cells.length / Math.max(1, toml.arena.runs_per_side)} sides × ${toml.arena.runs_per_side} runs`)
|
|
599
|
+
for (const cell of result.plan.cells) {
|
|
600
|
+
console.log(` ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
|
|
601
|
+
}
|
|
602
|
+
return
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
|
|
606
|
+
console.log(`📁 Artifacts: ${result.artifactsDir}`)
|
|
607
|
+
console.log(`📊 Report: ${result.artifactsDir}/report.md`)
|
|
608
|
+
return
|
|
609
|
+
}
|
|
566
610
|
|
|
611
|
+
// CLI-flag mode (backward compat)
|
|
567
612
|
if (!options.task || !options.decks) {
|
|
568
|
-
console.error('❌ --task <path> and --decks <list> are required for "run"')
|
|
613
|
+
console.error('❌ --task <path> and --decks <list> are required for "run" (or use --config <arena.toml>)')
|
|
569
614
|
process.exit(1)
|
|
570
615
|
}
|
|
571
616
|
|
|
@@ -577,7 +622,6 @@ async function runProgrammaticArena(argv: string[]) {
|
|
|
577
622
|
deckPaths: options.decks.split(',').map(s => s.trim()).filter(Boolean),
|
|
578
623
|
criteria: (options.criteria ?? 'syntax,context,logic,token').split(',').map(s => s.trim()).filter(Boolean),
|
|
579
624
|
outDir: options.out ?? `runs/arena-${timestamp()}`,
|
|
580
|
-
projectDir: options.project,
|
|
581
625
|
})
|
|
582
626
|
|
|
583
627
|
console.log(`\n🎮 Arena complete: ${result.manifest.id}`)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { resolvePlayer, resolveSides, groupBySide, totalRuns } from './player'
|
|
3
|
+
import { parseArenaToml } from './arena-toml'
|
|
4
|
+
|
|
5
|
+
const toml = parseArenaToml(`
|
|
6
|
+
[arena]
|
|
7
|
+
task = "Test task"
|
|
8
|
+
criteria = ["a", "b"]
|
|
9
|
+
runs_per_side = 3
|
|
10
|
+
|
|
11
|
+
[[side]]
|
|
12
|
+
name = "minimal"
|
|
13
|
+
player = "claude-code"
|
|
14
|
+
deck = "./decks/minimal.toml"
|
|
15
|
+
|
|
16
|
+
[[side]]
|
|
17
|
+
name = "rich"
|
|
18
|
+
player = "expert-architect"
|
|
19
|
+
deck = "./decks/rich.toml"
|
|
20
|
+
`)
|
|
21
|
+
|
|
22
|
+
describe('resolvePlayer', () => {
|
|
23
|
+
test('maps claude-code → claude', () => {
|
|
24
|
+
expect(resolvePlayer('claude-code')).toBe('claude')
|
|
25
|
+
})
|
|
26
|
+
|
|
27
|
+
test('maps Claude → claude (case insensitive)', () => {
|
|
28
|
+
expect(resolvePlayer('Claude')).toBe('claude')
|
|
29
|
+
})
|
|
30
|
+
|
|
31
|
+
test('maps kimi → kimi', () => {
|
|
32
|
+
expect(resolvePlayer('kimi')).toBe('kimi')
|
|
33
|
+
})
|
|
34
|
+
|
|
35
|
+
test('passes through unknown player names', () => {
|
|
36
|
+
expect(resolvePlayer('expert-architect')).toBe('expert-architect')
|
|
37
|
+
})
|
|
38
|
+
|
|
39
|
+
test('trims whitespace', () => {
|
|
40
|
+
expect(resolvePlayer(' claude-code ')).toBe('claude')
|
|
41
|
+
})
|
|
42
|
+
})
|
|
43
|
+
|
|
44
|
+
describe('resolveSides', () => {
|
|
45
|
+
test('resolves all sides in arena.toml', () => {
|
|
46
|
+
const sides = resolveSides(toml)
|
|
47
|
+
expect(sides).toHaveLength(2)
|
|
48
|
+
expect(sides[0].platform).toBe('claude')
|
|
49
|
+
expect(sides[1].platform).toBe('expert-architect')
|
|
50
|
+
expect(sides[0].playerName).toBe('claude-code')
|
|
51
|
+
})
|
|
52
|
+
|
|
53
|
+
test('preserves side config', () => {
|
|
54
|
+
const sides = resolveSides(toml)
|
|
55
|
+
expect(sides[0].side.name).toBe('minimal')
|
|
56
|
+
expect(sides[0].side.deck).toBe('./decks/minimal.toml')
|
|
57
|
+
})
|
|
58
|
+
})
|
|
59
|
+
|
|
60
|
+
describe('groupBySide', () => {
|
|
61
|
+
test('groups by side name with run count', () => {
|
|
62
|
+
const groups = groupBySide(toml)
|
|
63
|
+
expect(groups).toHaveLength(2)
|
|
64
|
+
expect(groups[0].runs).toBe(3) // runs_per_side
|
|
65
|
+
expect(groups[1].runs).toBe(3)
|
|
66
|
+
expect(groups[0].platform).toBe('claude')
|
|
67
|
+
})
|
|
68
|
+
|
|
69
|
+
test('control flag preserved', () => {
|
|
70
|
+
const controlToml = parseArenaToml(`
|
|
71
|
+
[arena]
|
|
72
|
+
task = "x"
|
|
73
|
+
criteria = ["a"]
|
|
74
|
+
|
|
75
|
+
[[side]]
|
|
76
|
+
name = "test"
|
|
77
|
+
player = "claude-code"
|
|
78
|
+
deck = "a.toml"
|
|
79
|
+
|
|
80
|
+
[[side]]
|
|
81
|
+
name = "baseline"
|
|
82
|
+
player = "claude-code"
|
|
83
|
+
deck = "b.toml"
|
|
84
|
+
control = true
|
|
85
|
+
`)
|
|
86
|
+
const groups = groupBySide(controlToml)
|
|
87
|
+
expect(groups[1].control).toBe(true)
|
|
88
|
+
})
|
|
89
|
+
})
|
|
90
|
+
|
|
91
|
+
describe('totalRuns', () => {
|
|
92
|
+
test('calculates sides × runs_per_side', () => {
|
|
93
|
+
expect(totalRuns(toml)).toBe(6) // 2 sides × 3 runs
|
|
94
|
+
})
|
|
95
|
+
})
|
package/src/player.ts
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import type { Side, ArenaToml } from './arena-toml'
|
|
2
|
+
|
|
3
|
+
// ── Player reference resolution (pure function) ────────────────────────────
|
|
4
|
+
// Maps arena.toml player names → platform identifiers.
|
|
5
|
+
// AgentAdapter creation is the IO layer's job (T4), not ours.
|
|
6
|
+
|
|
7
|
+
/** A side from arena.toml paired with the platform its player reference resolves to. */
export interface ResolvedSide {
  /** The side exactly as declared in arena.toml. */
  side: Side
  /** Resolved platform for useAgent(). */
  platform: string
  /** Original player reference. */
  playerName: string
}
|
|
12
|
+
|
|
13
|
+
/** Built-in player registry. Player names that map directly to useAgent platforms. */
const BUILTIN_PLAYERS: Record<string, string> = {
  'claude': 'claude',
  'claude-code': 'claude',
  'kimi': 'kimi',
  'cursor': 'cursor',
  'gemini': 'gemini',
}

/**
 * Resolve a player reference to its platform identifier.
 * - The name is lower-cased and trimmed before lookup
 * - Built-in names (claude, claude-code, kimi, cursor, gemini) map to their platform
 * - Unknown names are passed through in normalized form (assumed to be useAgent-compatible)
 * - Future: custom player.toml files will override built-in mappings
 *
 * @param name Player reference as written in arena.toml.
 * @returns The platform identifier; always a string.
 */
export function resolvePlayer(name: string): string {
  const normalized = name.toLowerCase().trim()
  // Look at own keys only: a bare index would also find inherited members such
  // as "constructor" or "__proto__" and return a non-string value.
  const builtin = Object.prototype.hasOwnProperty.call(BUILTIN_PLAYERS, normalized)
    ? BUILTIN_PLAYERS[normalized]
    : undefined
  return builtin ?? normalized
}
|
|
32
|
+
|
|
33
|
+
/**
 * Pair every side of an arena document with its resolved platform.
 * Performs no I/O and creates no agents; output order follows `toml.side`.
 *
 * @param toml Validated arena document.
 * @returns One entry per side, holding the side, its platform and the
 *          player reference as originally written.
 */
export function resolveSides(toml: ArenaToml): ResolvedSide[] {
  const resolved: ResolvedSide[] = []
  for (const side of toml.side) {
    const playerName = side.player
    resolved.push({ side, platform: resolvePlayer(playerName), playerName })
  }
  return resolved
}
|
|
44
|
+
|
|
45
|
+
// ── Side grouping (for per-side aggregation in T3) ─────────────────────────
|
|
46
|
+
|
|
47
|
+
/** Flat per-side summary record, as produced by `groupBySide`. */
export interface SideGroup {
  /** Name of the side. */
  sideName: string
  /** Player reference as written in arena.toml. */
  player: string
  /** Deck path of the side. */
  deck: string
  /** Control flag of the side. */
  control: boolean
  /** Number of runs for the side. */
  runs: number
  /** Platform the player reference resolves to. */
  platform: string
}
|
|
55
|
+
|
|
56
|
+
/**
 * Build one `SideGroup` per side for per-side statistical aggregation.
 * Each group's `runs` is the document-wide `runs_per_side`.
 */
export function groupBySide(toml: ArenaToml): SideGroup[] {
  const runs = toml.arena.runs_per_side
  return resolveSides(toml).map(({ side, platform, playerName }) => {
    const group: SideGroup = {
      sideName: side.name,
      player: playerName,
      deck: side.deck,
      control: side.control,
      runs,
      platform,
    }
    return group
  })
}
|
|
67
|
+
|
|
68
|
+
/** Total number of runs an arena document describes: side count × runs_per_side. */
export function totalRuns(toml: ArenaToml): number {
  const sideCount = toml.side.length
  const runsPerSide = toml.arena.runs_per_side
  return sideCount * runsPerSide
}
|
package/src/runner.ts
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
|
-
import { mkdirSync, writeFileSync,
|
|
1
|
+
import { mkdirSync, writeFileSync, readFileSync } from 'node:fs'
|
|
2
2
|
import { join, resolve } from 'node:path'
|
|
3
3
|
import { runAgentScenario, type AgentScenario } from '@lythos/test-utils/agent-bdd'
|
|
4
4
|
import { useAgent } from '@lythos/test-utils/agents'
|
|
5
|
-
import { ArenaManifest, Player
|
|
5
|
+
import { ArenaManifest, Player } from '@lythos/test-utils/schema'
|
|
6
|
+
import type { ArenaManifest as ArenaManifestType, JudgeVerdict } from '@lythos/test-utils/schema'
|
|
6
7
|
import { runComparativeJudge } from './comparative-judge'
|
|
8
|
+
import { parseArenaToml, buildExecutionPlan, type ArenaToml } from './arena-toml'
|
|
9
|
+
import { resolvePlayer, resolveSides } from './player'
|
|
10
|
+
import { aggregateAllStats } from './stats'
|
|
11
|
+
import type { SideStats } from './stats'
|
|
7
12
|
|
|
8
13
|
// ── Helpers ───────────────────────────────────────────────────────────────
|
|
9
14
|
|
|
@@ -12,150 +17,211 @@ function stamp(): string {
|
|
|
12
17
|
return `${d.getFullYear()}${String(d.getMonth() + 1).padStart(2, '0')}${String(d.getDate()).padStart(2, '0')}-${String(d.getHours()).padStart(2, '0')}${String(d.getMinutes()).padStart(2, '0')}${String(d.getSeconds()).padStart(2, '0')}`
|
|
13
18
|
}
|
|
14
19
|
|
|
15
|
-
|
|
16
|
-
if (arrays.length === 0) return [[]]
|
|
17
|
-
const [first, ...rest] = arrays
|
|
18
|
-
const restProd = cartesian(rest)
|
|
19
|
-
return first.flatMap(a => restProd.map(r => [a, ...r]))
|
|
20
|
-
}
|
|
20
|
+
// ── Declarative runner (arena.toml → execute) ─────────────────────────────
|
|
21
21
|
|
|
22
|
-
|
|
23
|
-
|
|
22
|
+
export interface ArenaResult {
|
|
23
|
+
manifest: ArenaManifestType
|
|
24
|
+
report: unknown
|
|
25
|
+
stats: SideStats[]
|
|
26
|
+
artifactsDir: string
|
|
24
27
|
}
|
|
25
28
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
export async function runArena(opts: {
|
|
29
|
+
export async function runArenaFromToml(opts: {
|
|
30
|
+
toml: ArenaToml
|
|
29
31
|
taskPath: string
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
outDir
|
|
34
|
-
projectDir?: string
|
|
35
|
-
}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
|
|
36
|
-
const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts
|
|
37
|
-
|
|
38
|
-
// Load players
|
|
39
|
-
const players = playerPaths.map(p => {
|
|
40
|
-
const content = readFileSync(resolve(p), 'utf-8')
|
|
41
|
-
const parsed = Player.parse(JSON.parse(content))
|
|
42
|
-
return { path: p, ...parsed }
|
|
43
|
-
})
|
|
32
|
+
outDir?: string
|
|
33
|
+
dryRun?: boolean
|
|
34
|
+
}): Promise<ArenaResult | { plan: ReturnType<typeof buildExecutionPlan> }> {
|
|
35
|
+
const { toml, taskPath, outDir, dryRun } = opts
|
|
44
36
|
|
|
45
|
-
|
|
46
|
-
const decks = deckPaths.map(p => ({ path: resolve(p) }))
|
|
37
|
+
const plan = buildExecutionPlan(toml)
|
|
47
38
|
|
|
48
|
-
//
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
deck_path: deck.path,
|
|
53
|
-
}))
|
|
39
|
+
// dry-run: return plan without executing
|
|
40
|
+
if (dryRun) {
|
|
41
|
+
return { plan }
|
|
42
|
+
}
|
|
54
43
|
|
|
55
|
-
// Build arena manifest
|
|
56
44
|
const arenaId = `arena-${stamp()}`
|
|
57
45
|
const artifactsDir = outDir || join(process.cwd(), 'runs', arenaId)
|
|
46
|
+
const resolved = resolveSides(toml)
|
|
58
47
|
|
|
48
|
+
// Build manifest
|
|
59
49
|
const manifest = ArenaManifest.parse({
|
|
60
50
|
id: arenaId,
|
|
61
51
|
created_at: new Date().toISOString(),
|
|
62
52
|
task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
|
|
63
53
|
mode: 'decks',
|
|
64
|
-
participants:
|
|
65
|
-
id:
|
|
66
|
-
name:
|
|
67
|
-
player:
|
|
68
|
-
deck:
|
|
69
|
-
description: `${
|
|
54
|
+
participants: [...new Map(resolved.map(r => [r.side.name, r])).values()].map(r => ({
|
|
55
|
+
id: r.side.name,
|
|
56
|
+
name: r.side.name,
|
|
57
|
+
player: r.platform,
|
|
58
|
+
deck: r.side.deck,
|
|
59
|
+
description: `${r.playerName} × ${r.side.deck}`,
|
|
70
60
|
})),
|
|
71
|
-
criteria,
|
|
61
|
+
criteria: toml.arena.criteria,
|
|
72
62
|
status: 'running',
|
|
73
63
|
})
|
|
74
64
|
|
|
75
65
|
mkdirSync(artifactsDir, { recursive: true })
|
|
76
66
|
writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(manifest, null, 2) + '\n')
|
|
77
67
|
|
|
78
|
-
//
|
|
79
|
-
const
|
|
68
|
+
// Execute plan: per-cell agent run
|
|
69
|
+
const verdictsBySide = new Map<string, JudgeVerdict[]>()
|
|
80
70
|
|
|
81
|
-
for (const
|
|
82
|
-
const cellDir = join(artifactsDir, 'runs',
|
|
71
|
+
for (const cell of plan.cells) {
|
|
72
|
+
const cellDir = join(artifactsDir, 'runs', cell.side, `run-${cell.run}`)
|
|
83
73
|
mkdirSync(cellDir, { recursive: true })
|
|
84
74
|
|
|
85
75
|
try {
|
|
76
|
+
const agent = useAgent(resolvePlayer(cell.player))
|
|
86
77
|
const result = await runAgentScenario({
|
|
87
78
|
scenarioPath: resolve(taskPath),
|
|
88
|
-
agent
|
|
79
|
+
agent,
|
|
89
80
|
setupWorkdir(_scenario: AgentScenario, workdir: string) {
|
|
90
81
|
mkdirSync(workdir, { recursive: true })
|
|
91
|
-
|
|
92
|
-
const deckContent = readFileSync(variant.deck_path, 'utf-8')
|
|
82
|
+
const deckContent = readFileSync(resolve(cell.deck), 'utf-8')
|
|
93
83
|
writeFileSync(join(workdir, 'skill-deck.toml'), deckContent)
|
|
94
84
|
},
|
|
95
|
-
baseDir: artifactsDir,
|
|
85
|
+
baseDir: join(artifactsDir, 'runs', cell.side),
|
|
96
86
|
})
|
|
97
87
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
88
|
+
const v = (result.verdict ?? {
|
|
89
|
+
verdict: 'ERROR' as const,
|
|
90
|
+
reason: 'No verdict returned',
|
|
91
|
+
criteria: [],
|
|
92
|
+
}) as JudgeVerdict
|
|
93
|
+
|
|
94
|
+
if (!verdictsBySide.has(cell.side)) verdictsBySide.set(cell.side, [])
|
|
95
|
+
verdictsBySide.get(cell.side)!.push(v)
|
|
102
96
|
} catch (e) {
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
verdict:
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
},
|
|
97
|
+
if (!verdictsBySide.has(cell.side)) verdictsBySide.set(cell.side, [])
|
|
98
|
+
verdictsBySide.get(cell.side)!.push({
|
|
99
|
+
verdict: 'ERROR' as const,
|
|
100
|
+
reason: `Runner exception: ${e instanceof Error ? e.message : String(e)}`,
|
|
101
|
+
criteria: [],
|
|
109
102
|
})
|
|
110
103
|
}
|
|
111
104
|
}
|
|
112
105
|
|
|
113
|
-
//
|
|
114
|
-
const
|
|
106
|
+
// Aggregate stats
|
|
107
|
+
const stats = aggregateAllStats(verdictsBySide)
|
|
108
|
+
|
|
109
|
+
// Comparative judge
|
|
110
|
+
const flatVerdicts: { participantId: string; verdict: unknown }[] = []
|
|
111
|
+
for (const [side, verdicts] of verdictsBySide) {
|
|
112
|
+
// Use the first run's verdict for comparative judge (or aggregate into one)
|
|
113
|
+
if (verdicts.length > 0) {
|
|
114
|
+
flatVerdicts.push({ participantId: side, verdict: verdicts[0] })
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
const judge = useAgent(resolved[0]?.platform ?? 'claude')
|
|
115
119
|
const report = await runComparativeJudge({
|
|
116
120
|
manifest,
|
|
117
|
-
verdicts,
|
|
121
|
+
verdicts: flatVerdicts,
|
|
118
122
|
judge,
|
|
119
123
|
workdir: artifactsDir,
|
|
120
124
|
})
|
|
121
125
|
|
|
122
126
|
// Write report
|
|
123
|
-
|
|
127
|
+
writeReport(artifactsDir, manifest, report, stats)
|
|
124
128
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
129
|
+
// Update manifest
|
|
130
|
+
const finalManifest = ArenaManifest.parse({ ...manifest, status: 'completed' })
|
|
131
|
+
writeFileSync(join(artifactsDir, 'arena.json'), JSON.stringify(finalManifest, null, 2) + '\n')
|
|
128
132
|
|
|
129
|
-
|
|
130
|
-
|
|
133
|
+
return { manifest: finalManifest, report, stats, artifactsDir }
|
|
134
|
+
}
|
|
131
135
|
|
|
132
|
-
|
|
133
|
-
${renderPareto(report)}
|
|
136
|
+
// ── Backward compat: CLI-flag style runner ─────────────────────────────────
|
|
134
137
|
|
|
135
|
-
|
|
136
|
-
|
|
138
|
+
/**
 * Backward-compatible entry point for the CLI-flag style invocation.
 *
 * Converts the flag values into an in-memory `ArenaToml` (one side per
 * player × deck combination, a single run per side) and delegates to
 * `runArenaFromToml`.
 *
 * @param opts.taskPath    Path to the task file; its first 200 characters become the arena task text.
 * @param opts.playerPaths Paths to player JSON files, each validated with `Player.parse`.
 * @param opts.deckPaths   Paths to deck files; every player is paired with every deck.
 * @param opts.criteria    Judging criteria, passed through unchanged.
 * @param opts.outDir      Output directory forwarded to `runArenaFromToml`.
 * @returns The manifest, comparative report and artifacts directory of the run.
 * @throws Whatever `readFileSync`, `JSON.parse` or `Player.parse` throw for an
 *         unreadable or invalid task/player file.
 */
export async function runArena(opts: {
  taskPath: string
  playerPaths: string[]
  deckPaths: string[]
  criteria: string[]
  outDir: string
}): Promise<{ manifest: ArenaManifestType; report: unknown; artifactsDir: string }> {
  const { taskPath, playerPaths, deckPaths, criteria, outDir } = opts

  // Convert CLI flags to ArenaToml internally
  const toml: ArenaToml = {
    arena: {
      task: readFileSync(resolve(taskPath), 'utf-8').slice(0, 200),
      criteria,
      runs_per_side: 1,
      // NOTE(review): sides below are the full player × deck cross product, so
      // this value can be smaller than side.length — confirm how
      // max_participants is enforced downstream.
      max_participants: Math.min(playerPaths.length, deckPaths.length),
    },
    side: playerPaths.flatMap((playerPath, pi) => {
      // Read and validate each player file once, not once per deck.
      // (A player file is therefore validated even when deckPaths is empty.)
      const platform = Player.parse(JSON.parse(readFileSync(resolve(playerPath), 'utf-8'))).platform
      return deckPaths.map((deckPath, di) => ({
        // Sequential, zero-padded side name: run-01, run-02, …
        name: `run-${String(pi * deckPaths.length + di + 1).padStart(2, '0')}`,
        player: platform,
        deck: deckPath,
      }))
    }),
  }

  const result = await runArenaFromToml({ toml, taskPath, outDir })
  // The legacy return shape omits `stats`; only the original three fields are exposed.
  const { manifest, report, artifactsDir } = result as ArenaResult
  return { manifest, report, artifactsDir }
}
|
|
145
168
|
|
|
146
|
-
|
|
169
|
+
// ── Report renderer ────────────────────────────────────────────────────────
|
|
170
|
+
|
|
171
|
+
/**
 * Render the arena report as Markdown and write it to `<dir>/report.md`.
 *
 * The document consists of a header (id, task, criteria, generation date)
 * followed by five sections: score matrix, per-side statistics, Pareto
 * frontier, key findings and recommendations. Missing `key_findings` or
 * `recommendations` render as empty sections.
 *
 * @param dir      Directory that receives `report.md`.
 * @param manifest Arena manifest supplying id, task and criteria.
 * @param report   Comparative-judge report; all fields are optional.
 * @param stats    Per-side aggregated statistics.
 */
function writeReport(dir: string, manifest: ArenaManifestType, report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[]; pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[]; key_findings?: string[]; recommendations?: { audience: string; recommendation: string }[] }, stats: SideStats[]): void {
  const header = [
    `# Arena Report: ${manifest.id}`,
    '',
    `**Task**: ${manifest.task}`,
    `**Criteria**: ${manifest.criteria.join(', ')}`,
    `**Date**: ${new Date().toISOString()}`,
  ].join('\n')

  // Each section is a heading, a blank line, then zero or more body lines.
  const sections: { heading: string; body: string[] }[] = [
    { heading: '## Score Matrix', body: [renderScoreMatrix(report)] },
    { heading: '## Per-Side Statistics', body: [renderStatsTable(stats)] },
    { heading: '## Pareto Frontier', body: [renderPareto(report)] },
    {
      heading: '## Key Findings',
      body: (report.key_findings ?? []).map((f: string) => `- ${f}`),
    },
    {
      heading: '## Recommendations',
      body: (report.recommendations ?? []).map(
        (r: { audience: string; recommendation: string }) => `- **${r.audience}**: ${r.recommendation}`,
      ),
    },
  ]

  // Header and sections are separated by one blank line each.
  const markdown = [
    header,
    ...sections.map(({ heading, body }) => [heading, '', ...body].join('\n')),
  ].join('\n\n')

  writeFileSync(join(dir, 'report.md'), markdown + '\n')
}
|
|
148
202
|
|
|
149
|
-
|
|
203
|
+
/**
 * Render per-side statistics as a Markdown table.
 *
 * Pass rate and per-criterion means are fractions shown as whole percentages;
 * mean confidence is printed as-is with a `%` suffix, or `-` when absent.
 *
 * @param stats One entry per side.
 * @returns The table (each row newline-terminated), or a placeholder line when `stats` is empty.
 */
function renderStatsTable(stats: SideStats[]): string {
  if (stats.length === 0) return 'No statistics available.\n'

  const percent = (fraction: number): string => `${(fraction * 100).toFixed(0)}%`

  const rows = stats.map(side => {
    const confidence = side.meanConfidence != null ? `${side.meanConfidence.toFixed(0)}%` : '-'
    const perCriterion = side.criteria.map(c => `${c.name}: ${percent(c.mean)}`).join(', ')
    return `| ${side.sideName} | ${side.runs} | ${percent(side.passRate)} | ${confidence} | ${perCriterion} |\n`
  })

  return [
    `| Side | Runs | Pass Rate | Mean Confidence | Criteria |\n`,
    `|------|------|-----------|-----------------|----------|\n`,
    ...rows,
  ].join('')
}
|
|
150
217
|
|
|
151
218
|
function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id: string; criterion: string; weight: number; score: number; rationale: string }[] }): string {
|
|
152
219
|
if (!report.score_matrix?.length) return 'No scores available.\n'
|
|
153
220
|
|
|
154
|
-
// Build participant × criterion matrix
|
|
155
221
|
const participants = [...new Set(report.score_matrix.map(s => s.participant_id))]
|
|
156
222
|
const criteria = [...new Set(report.score_matrix.map(s => s.criterion))]
|
|
157
223
|
|
|
158
|
-
let table = `| Criterion | Weight | ${participants.
|
|
224
|
+
let table = `| Criterion | Weight | ${participants.join(' | ')} |\n`
|
|
159
225
|
table += `|${'---|'.repeat(2 + participants.length)}\n`
|
|
160
226
|
|
|
161
227
|
for (const c of criteria) {
|
|
@@ -165,7 +231,6 @@ function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id:
|
|
|
165
231
|
}).join(' | ')} |\n`
|
|
166
232
|
}
|
|
167
233
|
|
|
168
|
-
// Weighted totals
|
|
169
234
|
table += `| **Weighted Total** | 100% | ${participants.map(p => {
|
|
170
235
|
const pScores = report.score_matrix!.filter(s => s.participant_id === p)
|
|
171
236
|
const avg = pScores.length ? pScores.reduce((sum, s) => sum + s.score, 0) / pScores.length : 0
|
|
@@ -177,11 +242,9 @@ function renderScoreMatrix(report: unknown & { score_matrix?: { participant_id:
|
|
|
177
242
|
|
|
178
243
|
/**
 * Render the Pareto analysis as a Markdown bullet list, one bullet per participant.
 *
 * @param report Report whose optional `pareto` array lists each participant's dominance status.
 * @returns Bullets joined by newlines (no trailing newline), or a placeholder
 *          line when `pareto` is missing or empty.
 */
function renderPareto(report: unknown & { pareto?: { participant_id: string; dominated: boolean; dominated_by: string[] }[] }): string {
  const entries = report.pareto
  if (!entries?.length) return 'No Pareto analysis.\n'

  const bullets: string[] = []
  for (const entry of entries) {
    if (entry.dominated) {
      bullets.push(`- **${entry.participant_id}**: dominated by ${entry.dominated_by.join(', ')}`)
    } else {
      bullets.push(`- **${entry.participant_id}**: Pareto-optimal (non-dominated)`)
    }
  }
  return bullets.join('\n')
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { aggregateSideStats, aggregateAllStats } from './stats'
|
|
3
|
+
import type { JudgeVerdict } from '@lythos/test-utils/schema'
|
|
4
|
+
|
|
5
|
+
/**
 * Test fixture: a passing verdict with a single passed `correctness`
 * criterion, with any supplied fields shallowly overriding the defaults.
 */
function makeVerdict(overrides?: Partial<JudgeVerdict>): JudgeVerdict {
  const defaults: JudgeVerdict = {
    verdict: 'PASS',
    reason: 'OK',
    criteria: [{ name: 'correctness', passed: true }],
  }
  return { ...defaults, ...overrides }
}
|
|
13
|
+
|
|
14
|
+
// ── aggregateSideStats ─────────────────────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
describe('aggregateSideStats', () => {
  // A lone passing run yields a 100% pass rate and no failures or errors.
  test('single run: passRate=1, no variance', () => {
    const { sideName, runs, passRate, failRate, errorRate } = aggregateSideStats('test', [makeVerdict()])
    expect(sideName).toBe('test')
    expect(runs).toBe(1)
    expect(passRate).toBe(1)
    expect(failRate).toBe(0)
    expect(errorRate).toBe(0)
  })

  test('3 runs: 2 PASS, 1 FAIL', () => {
    const stats = aggregateSideStats('test', [
      makeVerdict(),
      makeVerdict(),
      makeVerdict({ verdict: 'FAIL', reason: 'bad' }),
    ])
    expect(stats.passRate).toBeCloseTo(2 / 3)
    expect(stats.failRate).toBeCloseTo(1 / 3)
  })

  test('confidence: mean across runs', () => {
    const verdicts = [90, 80, 70].map(confidence => makeVerdict({ confidence }))
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.meanConfidence).toBeCloseTo(80)
    // Sample variance: squared deviations 100 + 0 + 100, divided by n - 1 = 2.
    expect(stats.confidenceVariance).toBeCloseTo(100)
  })

  test('confidence: null when no verdict has it', () => {
    const { meanConfidence, confidenceVariance } = aggregateSideStats('test', [makeVerdict(), makeVerdict()])
    expect(meanConfidence).toBeNull()
    expect(confidenceVariance).toBeNull()
  })

  test('per-criterion pass rate', () => {
    const verdicts = [true, false, true].map(passed =>
      makeVerdict({ criteria: [{ name: 'accuracy', passed }] }),
    )
    const stats = aggregateSideStats('test', verdicts)
    expect(stats.criteria).toHaveLength(1)
    expect(stats.criteria[0].name).toBe('accuracy')
    expect(stats.criteria[0].mean).toBeCloseTo(2 / 3)
  })

  test('per-criterion scores: mean and variance', () => {
    const verdicts = [5, 3, 4].map(coverage => makeVerdict({ scores: { coverage, relevance: 4 } }))
    const { scoreByCriterion } = aggregateSideStats('test', verdicts)
    expect(scoreByCriterion.coverage.mean).toBeCloseTo(4)
    expect(scoreByCriterion.relevance.mean).toBeCloseTo(4)
    // Every relevance score is 4, so there is no spread.
    expect(scoreByCriterion.relevance.variance).toBe(0)
  })

  test('zero runs: all zeros', () => {
    const stats = aggregateSideStats('empty', [])
    expect(stats.runs).toBe(0)
    expect(stats.passRate).toBe(0)
    expect(stats.meanConfidence).toBeNull()
  })

  test('handles ERROR verdicts correctly', () => {
    const stats = aggregateSideStats('test', [
      makeVerdict(),
      makeVerdict({ verdict: 'ERROR', reason: 'parse failed' }),
    ])
    expect(stats.passRate).toBe(0.5)
    expect(stats.errorRate).toBe(0.5)
  })
})
|
|
95
|
+
|
|
96
|
+
// ── aggregateAllStats ──────────────────────────────────────────────────────
|
|
97
|
+
|
|
98
|
+
describe('aggregateAllStats', () => {
  // Results come back in the map's insertion order, one entry per side.
  test('aggregates multiple sides', () => {
    const verdictsBySide = new Map<string, JudgeVerdict[]>([
      ['side-a', [makeVerdict(), makeVerdict()]],
      ['side-b', [makeVerdict({ verdict: 'FAIL', reason: 'nope' })]],
    ])

    const stats = aggregateAllStats(verdictsBySide)
    expect(stats).toHaveLength(2)

    const [first, second] = stats
    expect(first.sideName).toBe('side-a')
    expect(first.passRate).toBe(1)
    expect(second.sideName).toBe('side-b')
    expect(second.passRate).toBe(0)
  })
})
|
package/src/stats.ts
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import type { JudgeVerdict } from '@lythos/test-utils/schema'
|
|
2
|
+
|
|
3
|
+
// ── Statistical aggregation for runs_per_side ─────────────────────────────
|
|
4
|
+
// All pure functions. Input: N verdicts from N runs. Output: aggregated stats.
|
|
5
|
+
|
|
6
|
+
/** Aggregated pass/fail outcome of one named criterion across a side's runs. */
export interface CriterionStats {
  /** Criterion name as reported in the verdicts. */
  name: string
  /** Fraction of reporting runs in which the criterion passed. */
  mean: number
  /** Spread of the pass/fail outcome across runs. */
  variance: number
  /** Minimum on the 0 (fail) – 1 (pass) scale. */
  min: number
  /** Maximum on the 0 (fail) – 1 (pass) scale. */
  max: number
  /** Number of runs that reported this criterion. */
  count: number
}
|
|
14
|
+
|
|
15
|
+
/** Aggregated statistics for all runs of a single side. */
export interface SideStats {
  /** Name of the side these statistics belong to. */
  sideName: string
  /** Number of verdicts (runs) aggregated. */
  runs: number
  /** PASS verdicts divided by total runs; 0 when there are no runs. */
  passRate: number
  /** FAIL verdicts divided by total runs; 0 when there are no runs. */
  failRate: number
  /** ERROR verdicts divided by total runs; 0 when there are no runs. */
  errorRate: number
  /** Mean of the reported confidences; null if no verdict had confidence. */
  meanConfidence: number | null
  /** Variance of the reported confidences; null with fewer than two of them. */
  confidenceVariance: number | null
  /** Pass/fail statistics, one entry per criterion name. */
  criteria: CriterionStats[]
  /** Mean and variance of the numeric scores, keyed by criterion name. */
  scoreByCriterion: Record<string, { mean: number; variance: number }>
}
|
|
26
|
+
|
|
27
|
+
// ── Helpers ────────────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
/** Arithmetic mean of `values`; an empty array yields 0 rather than NaN. */
function mean(values: number[]): number {
  const n = values.length
  if (n === 0) return 0

  let total = 0
  for (const value of values) total += value
  return total / n
}
|
|
33
|
+
|
|
34
|
+
/**
 * Sample variance (divides by n - 1) of `values`.
 *
 * @param values Observations; fewer than two yields 0.
 * @param m      Precomputed mean of `values`; computed here when omitted.
 */
function variance(values: number[], m?: number): number {
  const n = values.length
  if (n < 2) return 0

  const center = m ?? mean(values)
  let squaredDeviations = 0
  for (const value of values) squaredDeviations += (value - center) ** 2
  return squaredDeviations / (n - 1)
}
|
|
39
|
+
|
|
40
|
+
// ── Aggregator ────────────────────────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
/**
 * Aggregate the verdicts from N runs of one side into summary statistics.
 *
 * - Pass/fail/error rates are counts divided by the number of runs (0 when
 *   there are no runs).
 * - Confidence statistics use only verdicts that reported a confidence; the
 *   mean is null with none, the variance is null with fewer than two.
 * - `criteria` summarises the pass/fail entries of `verdict.criteria` per
 *   criterion name, in first-seen order. `mean` is the pass rate, `variance`
 *   the Bernoulli variance p(1 - p), and `min`/`max` the observed extremes
 *   on the 0 (fail) – 1 (pass) scale.
 * - `scoreByCriterion` summarises the numeric entries of `verdict.scores`.
 *
 * @param sideName Name copied into the result.
 * @param verdicts One verdict per run; may be empty.
 * @returns Aggregated statistics for the side.
 */
export function aggregateSideStats(sideName: string, verdicts: JudgeVerdict[]): SideStats {
  const runs = verdicts.length
  const passCount = verdicts.filter(v => v.verdict === 'PASS').length
  const failCount = verdicts.filter(v => v.verdict === 'FAIL').length
  const errorCount = verdicts.filter(v => v.verdict === 'ERROR').length

  // Confidence: optional per verdict, so only reporting runs contribute.
  const confidences = verdicts.map(v => v.confidence).filter((c): c is number => c != null)
  const meanConf = confidences.length > 0 ? mean(confidences) : null
  const confVar = meanConf !== null && confidences.length > 1 ? variance(confidences, meanConf) : null

  // Per-criterion pass/fail outcomes from verdict.criteria, grouped by name.
  const outcomesByCriterion = new Map<string, boolean[]>()
  for (const v of verdicts) {
    for (const c of v.criteria ?? []) {
      const outcomes = outcomesByCriterion.get(c.name)
      if (outcomes) outcomes.push(c.passed)
      else outcomesByCriterion.set(c.name, [c.passed])
    }
  }

  const criteria: CriterionStats[] = []
  for (const [name, outcomes] of outcomesByCriterion) {
    // `outcomes` is never empty: an entry is only created together with its first outcome.
    const passed = outcomes.filter(ok => ok).length
    const passRate = passed / outcomes.length
    criteria.push({
      name,
      mean: passRate, // for criteria, "mean" = pass rate across runs
      variance: passRate * (1 - passRate), // Bernoulli variance
      min: passed === outcomes.length ? 1 : 0, // 0 as soon as any run failed
      max: passed > 0 ? 1 : 0, // 1 as soon as any run passed
      count: outcomes.length,
    })
  }

  // Per-criterion scores (1-5) from verdict.scores
  const scoresByCriterion = new Map<string, number[]>()
  for (const v of verdicts) {
    if (!v.scores) continue
    for (const [criterion, score] of Object.entries(v.scores)) {
      const scores = scoresByCriterion.get(criterion)
      if (scores) scores.push(score)
      else scoresByCriterion.set(criterion, [score])
    }
  }

  const scoreByCriterion: Record<string, { mean: number; variance: number }> = {}
  for (const [criterion, scores] of scoresByCriterion) {
    const m = mean(scores)
    scoreByCriterion[criterion] = {
      mean: m,
      variance: scores.length > 1 ? variance(scores, m) : 0,
    }
  }

  return {
    sideName,
    runs,
    passRate: runs > 0 ? passCount / runs : 0,
    failRate: runs > 0 ? failCount / runs : 0,
    errorRate: runs > 0 ? errorCount / runs : 0,
    meanConfidence: meanConf,
    confidenceVariance: confVar,
    criteria,
    scoreByCriterion,
  }
}
|
|
107
|
+
|
|
108
|
+
/**
 * Aggregate every side's verdicts into its statistics.
 *
 * @param verdictsBySide Map of side name → that side's verdicts.
 * @returns One `SideStats` per map entry, in the map's iteration order.
 */
export function aggregateAllStats(
  verdictsBySide: Map<string, JudgeVerdict[]>
): SideStats[] {
  return Array.from(verdictsBySide, ([sideName, verdicts]) => aggregateSideStats(sideName, verdicts))
}
|