@lythos/skill-arena 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -1
- package/src/arena-toml.test.ts +164 -0
- package/src/arena-toml.ts +172 -0
- package/src/cli.ts +95 -10
- package/src/comparative-judge.test.ts +92 -0
- package/src/comparative-judge.ts +166 -0
- package/src/player.test.ts +95 -0
- package/src/player.ts +71 -0
- package/src/runner.ts +250 -0
- package/src/stats.test.ts +111 -0
- package/src/stats.ts +117 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.9.1",
|
|
3
|
+
"version": "0.9.3",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-agent",
|
|
@@ -35,5 +35,9 @@
|
|
|
35
35
|
"homepage": "https://github.com/lythos-labs/lythoskill/tree/main/packages/lythoskill-arena#readme",
|
|
36
36
|
"engines": {
|
|
37
37
|
"bun": ">=1.0.0"
|
|
38
|
+
},
|
|
39
|
+
"dependencies": {
|
|
40
|
+
"@lythos/test-utils": "^0.9.1",
|
|
41
|
+
"zod-to-json-schema": "^3.25.2"
|
|
38
42
|
}
|
|
39
43
|
}
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { parseArenaToml, buildExecutionPlan, ArenaToml } from './arena-toml'
|
|
3
|
+
|
|
4
|
+
const minimalToml = `
|
|
5
|
+
[arena]
|
|
6
|
+
task = "Test task"
|
|
7
|
+
criteria = ["a", "b"]
|
|
8
|
+
|
|
9
|
+
[[side]]
|
|
10
|
+
name = "runner-a"
|
|
11
|
+
player = "claude-code"
|
|
12
|
+
deck = "./decks/a.toml"
|
|
13
|
+
|
|
14
|
+
[[side]]
|
|
15
|
+
name = "runner-b"
|
|
16
|
+
player = "claude-code"
|
|
17
|
+
deck = "./decks/b.toml"
|
|
18
|
+
`
|
|
19
|
+
|
|
20
|
+
const fullToml = `
|
|
21
|
+
[arena]
|
|
22
|
+
task = "Generate auth flow diagram"
|
|
23
|
+
criteria = ["syntax", "context", "logic", "token"]
|
|
24
|
+
runs_per_side = 3
|
|
25
|
+
|
|
26
|
+
[[side]]
|
|
27
|
+
name = "minimal"
|
|
28
|
+
player = "standard-coder"
|
|
29
|
+
deck = "./decks/minimal.toml"
|
|
30
|
+
|
|
31
|
+
[[side]]
|
|
32
|
+
name = "rich"
|
|
33
|
+
player = "expert-architect"
|
|
34
|
+
deck = "./decks/rich.toml"
|
|
35
|
+
|
|
36
|
+
[[side]]
|
|
37
|
+
name = "baseline"
|
|
38
|
+
player = "standard-coder"
|
|
39
|
+
deck = "./decks/baseline.toml"
|
|
40
|
+
control = true
|
|
41
|
+
|
|
42
|
+
[side.env]
|
|
43
|
+
container = "node:20-alpine"
|
|
44
|
+
pre_run = ["npm ci", "npm run build"]
|
|
45
|
+
working_dir = "/workspace"
|
|
46
|
+
`
|
|
47
|
+
|
|
48
|
+
// ── Schema + Parser ────────────────────────────────────────────────────────
|
|
49
|
+
|
|
50
|
+
describe('parseArenaToml', () => {
|
|
51
|
+
test('parses minimal two-side arena', () => {
|
|
52
|
+
const result = parseArenaToml(minimalToml)
|
|
53
|
+
expect(result.arena.task).toBe('Test task')
|
|
54
|
+
expect(result.arena.criteria).toEqual(['a', 'b'])
|
|
55
|
+
expect(result.arena.runs_per_side).toBe(1) // default
|
|
56
|
+
expect(result.side).toHaveLength(2)
|
|
57
|
+
expect(result.side[0].name).toBe('runner-a')
|
|
58
|
+
expect(result.side[0].player).toBe('claude-code')
|
|
59
|
+
expect(result.side[0].deck).toBe('./decks/a.toml')
|
|
60
|
+
expect(result.side[0].control).toBe(false) // default
|
|
61
|
+
})
|
|
62
|
+
|
|
63
|
+
test('parses full arena with runs_per_side and control', () => {
|
|
64
|
+
const result = parseArenaToml(fullToml)
|
|
65
|
+
expect(result.arena.runs_per_side).toBe(3)
|
|
66
|
+
expect(result.side).toHaveLength(3)
|
|
67
|
+
expect(result.side[2].name).toBe('baseline')
|
|
68
|
+
expect(result.side[2].control).toBe(true)
|
|
69
|
+
})
|
|
70
|
+
|
|
71
|
+
test('parses side env block', () => {
|
|
72
|
+
const result = parseArenaToml(fullToml)
|
|
73
|
+
const env = result.side[2].env
|
|
74
|
+
expect(env.container).toBe('node:20-alpine')
|
|
75
|
+
expect(env.pre_run).toEqual(['npm ci', 'npm run build'])
|
|
76
|
+
expect(env.working_dir).toBe('/workspace')
|
|
77
|
+
expect(env.env_vars).toEqual({})
|
|
78
|
+
})
|
|
79
|
+
|
|
80
|
+
test('rejects fewer than 2 sides', () => {
|
|
81
|
+
const bad = `[arena]\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "only"\nplayer = "c"\ndeck = "x.toml"`
|
|
82
|
+
expect(() => parseArenaToml(bad)).toThrow()
|
|
83
|
+
})
|
|
84
|
+
|
|
85
|
+
test('rejects empty criteria', () => {
|
|
86
|
+
const bad = `[arena]\ntask = "x"\ncriteria = []\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
87
|
+
expect(() => parseArenaToml(bad)).toThrow()
|
|
88
|
+
})
|
|
89
|
+
|
|
90
|
+
test('rejects non-object input', () => {
|
|
91
|
+
expect(() => ArenaToml.parse('not valid')).toThrow()
|
|
92
|
+
})
|
|
93
|
+
|
|
94
|
+
test('rejects missing arena section', () => {
|
|
95
|
+
expect(() => parseArenaToml('[[side]]\nname = "a"')).toThrow()
|
|
96
|
+
})
|
|
97
|
+
|
|
98
|
+
test('rejects runs_per_side = 0', () => {
|
|
99
|
+
const bad = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 0\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
100
|
+
expect(() => parseArenaToml(bad)).toThrow()
|
|
101
|
+
})
|
|
102
|
+
|
|
103
|
+
test('parses integer and boolean values correctly', () => {
|
|
104
|
+
const toml = `[arena]\ntask = "x"\ncriteria = ["a"]\nruns_per_side = 2\nmax_participants = 5\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
105
|
+
const result = parseArenaToml(toml)
|
|
106
|
+
expect(result.arena.runs_per_side).toBe(2)
|
|
107
|
+
expect(result.arena.max_participants).toBe(5)
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
test('comments are stripped', () => {
|
|
111
|
+
const toml = `[arena]\n# this is a comment\ntask = "x"\ncriteria = ["a"]\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
112
|
+
const result = parseArenaToml(toml)
|
|
113
|
+
expect(result.arena.task).toBe('x')
|
|
114
|
+
})
|
|
115
|
+
})
|
|
116
|
+
|
|
117
|
+
// ── Execution Plan ─────────────────────────────────────────────────────────
|
|
118
|
+
|
|
119
|
+
describe('buildExecutionPlan', () => {
|
|
120
|
+
test('generates plan: 2 sides × 1 run = 2 cells', () => {
|
|
121
|
+
const toml = parseArenaToml(minimalToml)
|
|
122
|
+
const plan = buildExecutionPlan(toml)
|
|
123
|
+
expect(plan.task).toBe('Test task')
|
|
124
|
+
expect(plan.criteria).toEqual(['a', 'b'])
|
|
125
|
+
expect(plan.cells).toHaveLength(2)
|
|
126
|
+
expect(plan.total_runs).toBe(2)
|
|
127
|
+
expect(plan.cells[0]).toEqual({ side: 'runner-a', player: 'claude-code', deck: './decks/a.toml', run: 1, control: false })
|
|
128
|
+
expect(plan.cells[1]).toEqual({ side: 'runner-b', player: 'claude-code', deck: './decks/b.toml', run: 1, control: false })
|
|
129
|
+
})
|
|
130
|
+
|
|
131
|
+
test('generates plan: 3 sides × 3 runs = 9 cells', () => {
|
|
132
|
+
const toml = parseArenaToml(fullToml)
|
|
133
|
+
const plan = buildExecutionPlan(toml)
|
|
134
|
+
expect(plan.cells).toHaveLength(9)
|
|
135
|
+
expect(plan.total_runs).toBe(9)
|
|
136
|
+
|
|
137
|
+
// Cells are ordered: side 0 run 1, side 0 run 2, side 0 run 3, side 1 run 1, ...
|
|
138
|
+
expect(plan.cells[0]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 1, control: false })
|
|
139
|
+
expect(plan.cells[1]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 2, control: false })
|
|
140
|
+
expect(plan.cells[2]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 3, control: false })
|
|
141
|
+
expect(plan.cells[3]).toEqual({ side: 'rich', player: 'expert-architect', deck: './decks/rich.toml', run: 1, control: false })
|
|
142
|
+
expect(plan.cells[8]).toEqual({ side: 'baseline', player: 'standard-coder', deck: './decks/baseline.toml', run: 3, control: true })
|
|
143
|
+
})
|
|
144
|
+
|
|
145
|
+
test('control flag preserved in plan cells', () => {
|
|
146
|
+
const toml = parseArenaToml(fullToml)
|
|
147
|
+
const plan = buildExecutionPlan(toml)
|
|
148
|
+
const baselineCells = plan.cells.filter(c => c.side === 'baseline')
|
|
149
|
+
expect(baselineCells).toHaveLength(3)
|
|
150
|
+
expect(baselineCells.every(c => c.control)).toBe(true)
|
|
151
|
+
})
|
|
152
|
+
|
|
153
|
+
test('dry-run: plan is pure data, no side effects', () => {
|
|
154
|
+
// The entire plan generation is a pure function — dry-run is just printing it
|
|
155
|
+
const toml = parseArenaToml(fullToml)
|
|
156
|
+
const plan = buildExecutionPlan(toml)
|
|
157
|
+
// Verify plan is self-describing for a --dry-run output
|
|
158
|
+
expect(plan.total_runs).toBeGreaterThan(0)
|
|
159
|
+
expect(plan.cells.every(c => typeof c.side === 'string')).toBe(true)
|
|
160
|
+
expect(plan.cells.every(c => typeof c.player === 'string')).toBe(true)
|
|
161
|
+
expect(plan.cells.every(c => typeof c.deck === 'string')).toBe(true)
|
|
162
|
+
expect(plan.cells.every(c => typeof c.run === 'number')).toBe(true)
|
|
163
|
+
})
|
|
164
|
+
})
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import { z } from 'zod'
|
|
2
|
+
import type { ArenaManifest } from '@lythos/test-utils/schema'
|
|
3
|
+
|
|
4
|
+
// ── arena.toml Zod schema (declarative input, k8s-manifest style) ──────────
|
|
5
|
+
// Anchored on: ADR-20260502110308316
|
|
6
|
+
|
|
7
|
+
// Environment settings attached to one side (the `env` field of `Side`).
// Every field may be omitted: the two string fields stay undefined, while the
// list and map fields fall back to empty values.
export const SideEnv = z.object({
  // Optional string. NOTE(review): presumably a container image reference
  // (the bundled test uses "node:20-alpine") — confirm against the runner.
  container: z.string().optional(),
  // List of strings, empty by default. NOTE(review): the name suggests commands
  // executed before the side runs — confirm against the runner.
  pre_run: z.array(z.string()).default([]),
  // Optional working-directory string.
  working_dir: z.string().optional(),
  // String-valued record, empty by default.
  env_vars: z.record(z.string()).default({}),
})
// Parsed (post-default) shape of `SideEnv`.
export type SideEnv = z.infer<typeof SideEnv>
|
|
14
|
+
|
|
15
|
+
// One competitor in the arena: a named pairing of a player and a deck.
// `control` defaults to false and `env` defaults to an empty object, which
// `SideEnv` then fills with its own defaults.
export const Side = z.object({
  name: z.string(),
  player: z.string(), // reference to player config (useAgent resolves)
  deck: z.string(), // path to deck.toml
  // Marks this side as the control; false unless stated.
  control: z.boolean().default(false),
  env: SideEnv.default({}),
})
// Parsed (post-default) shape of `Side`.
export type Side = z.infer<typeof Side>
|
|
23
|
+
|
|
24
|
+
// Top-level shape of an arena.toml document: one `arena` table plus between
// two and five `side` entries.
export const ArenaToml = z.object({
  arena: z.object({
    task: z.string(), // task description or path to TASK-arena.md
    // At least one criterion is required.
    criteria: z.array(z.string()).min(1),
    // Positive integer; defaults to a single run per side.
    runs_per_side: z.number().int().positive().default(1),
    // Integer in 2..5, default 5.
    // NOTE(review): this schema does not compare the value with the number of
    // `side` entries — confirm whether that check happens elsewhere.
    max_participants: z.number().int().min(2).max(5).default(5),
  }),
  side: z.array(Side).min(2).max(5),
})
// Parsed (post-default) shape of `ArenaToml`.
export type ArenaToml = z.infer<typeof ArenaToml>
|
|
34
|
+
|
|
35
|
+
// ── Parser ─────────────────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
/**
 * Turn the text of an arena.toml file into a validated `ArenaToml` value.
 *
 * The text is first read by the in-file subset parser and the resulting plain
 * object is then checked (and defaulted) by the `ArenaToml` Zod schema, whose
 * `parse` throws when the document does not match.
 */
export function parseArenaToml(content: string): ArenaToml {
  // Built-in subset parser: avoids an external TOML dependency for this format.
  return ArenaToml.parse(parseToml(content))
}
|
|
42
|
+
|
|
43
|
+
// ── Plan generation (pure function, dry-run visible) ───────────────────────
|
|
44
|
+
|
|
45
|
+
// One entry of an execution plan: a single run of a single side.
// `buildExecutionPlan` copies the side's fields verbatim and numbers the runs.
export interface ExecutionCell {
  side: string // side name
  player: string // player reference
  deck: string // deck path
  run: number // 1-indexed run number
  control: boolean // copied from the side's `control` flag
}
|
|
52
|
+
|
|
53
|
+
// Plain-data description of everything an arena run will execute.
export interface ExecutionPlan {
  task: string // taken from `arena.task`
  criteria: string[] // taken from `arena.criteria`
  cells: ExecutionCell[] // one cell per side per run
  total_runs: number // always equal to `cells.length`
}
|
|
59
|
+
|
|
60
|
+
/**
 * Expand a validated arena document into a flat list of runs.
 *
 * Cells are grouped by side in declaration order, and within a side the runs
 * are numbered 1..runs_per_side. No I/O is performed; the result is plain data.
 *
 * @param toml validated arena document
 * @returns the plan, with `total_runs` equal to the number of cells
 */
export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
  const { task, criteria, runs_per_side: runsPerSide } = toml.arena

  const cells = toml.side.flatMap((entry): ExecutionCell[] =>
    Array.from({ length: runsPerSide }, (_unused, index) => ({
      side: entry.name,
      player: entry.player,
      deck: entry.deck,
      run: index + 1,
      control: entry.control,
    })),
  )

  return { task, criteria, cells, total_runs: cells.length }
}
|
|
80
|
+
|
|
81
|
+
// ── Minimal TOML parser (handles the arena.toml subset without external dep) ──
|
|
82
|
+
|
|
83
|
+
/**
 * Parse the subset of TOML used by arena.toml into a plain object.
 *
 * Supported: `# comments`, `[table]`, `[[array-of-tables]]`, two-level
 * `[parent.child]` headers, and single-line `key = value` pairs whose value is
 * a quoted string, a boolean, a number, or a one-line array.
 *
 * Not supported: multi-line strings/arrays, inline tables, escape-sequence
 * decoding, and headers with more than two dotted parts (extra parts are
 * ignored).
 *
 * Quoted values always stay strings (`"123"` and `"true"` are not re-typed),
 * and `#` or `,` inside quotes is treated as ordinary text.
 *
 * @param text raw TOML source
 * @returns nested plain object; each `[[name]]` becomes an array under `name`
 */
function parseToml(text: string): Record<string, unknown> {
  const result: Record<string, unknown> = {}
  let currentTable: Record<string, unknown> = result
  const arrayTables: Map<string, Record<string, unknown>[]> = new Map()

  for (const rawLine of text.split('\n')) {
    const line = stripTomlComment(rawLine).trim()
    if (!line) continue

    // [[array]]: start a fresh element of the named array of tables
    const arrayMatch = line.match(/^\[\[(.+?)\]\]$/)
    if (arrayMatch) {
      const key = arrayMatch[1].trim() // e.g. "side"
      if (!arrayTables.has(key)) arrayTables.set(key, [])
      currentTable = {}
      arrayTables.get(key)!.push(currentTable)
      continue
    }

    // [section] or [parent.child]
    const sectionMatch = line.match(/^\[(.+?)\]$/)
    if (sectionMatch) {
      const key = sectionMatch[1].trim()
      if (key.includes('.')) {
        const [parent, child] = key.split('.').map(part => part.trim())
        const parentArr = arrayTables.get(parent)
        currentTable = {}
        if (parentArr && parentArr.length > 0) {
          // nested key like "side.env": attach to the most recent [[side]]
          parentArr[parentArr.length - 1][child] = currentTable
        } else {
          // Parent is a plain table: nest under it so the following keys do
          // not leak into whichever table happened to be open before.
          const existing = result[parent]
          const parentTable: Record<string, unknown> =
            typeof existing === 'object' && existing !== null && !Array.isArray(existing)
              ? (existing as Record<string, unknown>)
              : {}
          result[parent] = parentTable
          parentTable[child] = currentTable
        }
      } else {
        currentTable = {}
        result[key] = currentTable
      }
      continue
    }

    // key = value (lines without "=" are ignored)
    const eqIdx = line.indexOf('=')
    if (eqIdx === -1) continue
    const key = line.slice(0, eqIdx).trim()
    currentTable[key] = parseTomlValue(line.slice(eqIdx + 1).trim())
  }

  // Materialize array tables into result
  for (const [key, arr] of arrayTables) {
    result[key] = arr
  }

  return result
}

// Drops a trailing `# comment`, ignoring `#` that appears inside quotes.
function stripTomlComment(line: string): string {
  let quote = ''
  for (let i = 0; i < line.length; i++) {
    const ch = line[i]
    if (quote) {
      if (ch === '\\' && quote === '"') i++ // skip the escaped character
      else if (ch === quote) quote = ''
    } else if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === '#') {
      return line.slice(0, i)
    }
  }
  return line
}

// Returns the text between matching outer quotes, or undefined when the token is not quoted.
function unquoteTomlString(token: string): string | undefined {
  if (token.length < 2) return undefined
  const first = token[0]
  if ((first === '"' || first === "'") && token.endsWith(first)) {
    return token.slice(1, -1)
  }
  return undefined
}

// Splits the inside of a one-line array on commas that are outside quotes.
function splitTomlArrayItems(inner: string): string[] {
  const items: string[] = []
  let quote = ''
  let start = 0
  for (let i = 0; i < inner.length; i++) {
    const ch = inner[i]
    if (quote) {
      if (ch === '\\' && quote === '"') i++
      else if (ch === quote) quote = ''
    } else if (ch === '"' || ch === "'") {
      quote = ch
    } else if (ch === ',') {
      items.push(inner.slice(start, i).trim())
      start = i + 1
    }
  }
  const last = inner.slice(start).trim()
  if (last) items.push(last) // a trailing comma is legal TOML and adds no element
  return items
}

// Converts the right-hand side of `key = value` into a JS value.
function parseTomlValue(raw: string): unknown {
  // Quoted text is final: it must never be re-read as a boolean/number/array.
  const text = unquoteTomlString(raw)
  if (text !== undefined) return text

  if (raw.startsWith('[') && raw.endsWith(']')) {
    const inner = raw.slice(1, -1).trim()
    if (!inner) return []
    // Unquoted items are kept as their raw text, as before.
    return splitTomlArrayItems(inner).map(item => unquoteTomlString(item) ?? item)
  }

  if (raw === 'true') return true
  if (raw === 'false') return false
  if (/^-?\d+(\.\d+)?$/.test(raw)) return Number(raw)
  return raw
}
|
package/src/cli.ts
CHANGED
|
@@ -29,23 +29,40 @@ function printHelp(): void {
|
|
|
29
29
|
console.log(`🎭 lythoskill-arena — Skill comparison runner
|
|
30
30
|
|
|
31
31
|
Usage:
|
|
32
|
-
lythoskill-arena --task
|
|
33
|
-
lythoskill-arena --task "<
|
|
32
|
+
lythoskill-arena run --task <path> --players <A.toml,B.toml> --decks <A.toml,B.toml> --criteria <c1,c2,...> [--out <dir>]
|
|
33
|
+
lythoskill-arena scaffold --task "<description>" --skills <skill1,skill2,...>
|
|
34
|
+
lythoskill-arena scaffold --task "<description>" --decks <deck1,deck2,...>
|
|
34
35
|
lythoskill-arena viz <arena-dir>
|
|
35
36
|
|
|
37
|
+
Commands:
|
|
38
|
+
run Run arena programmatically (declarative arena.toml or CLI flags)
|
|
39
|
+
scaffold Create arena directory structure (legacy, manual subagent execution)
|
|
40
|
+
viz Visualize arena report (ASCII charts)
|
|
41
|
+
|
|
36
42
|
Options:
|
|
37
|
-
-t, --task <desc>
|
|
38
|
-
-s, --skills <list> Comma-separated skill names
|
|
43
|
+
-t, --task <path|desc> Task description or path to TASK-arena.md
|
|
44
|
+
-s, --skills <list> Comma-separated skill names (scaffold only)
|
|
39
45
|
--decks <list> Comma-separated deck paths
|
|
40
46
|
-c, --criteria <list> Evaluation criteria (default: syntax,context,logic,token)
|
|
41
|
-
--
|
|
42
|
-
|
|
47
|
+
--players <list> Comma-separated player.toml paths (CLI run only)
|
|
48
|
+
--config <path> Path to arena.toml (declarative mode, k8s-style)
|
|
49
|
+
--dry-run Print execution plan without running (with --config)
|
|
50
|
+
--control <skill> Control skill for comparison (scaffold only)
|
|
51
|
+
--out <dir> Output directory (run: defaults to runs/arena-<id>)
|
|
52
|
+
-d, --dir <dir> Output directory (scaffold: defaults to tmp)
|
|
43
53
|
-p, --project <dir> Project directory (default: .)
|
|
44
54
|
|
|
45
55
|
Examples:
|
|
46
|
-
|
|
47
|
-
lythoskill-arena
|
|
48
|
-
lythoskill-arena
|
|
56
|
+
# Declarative mode (k8s-style)
|
|
57
|
+
lythoskill-arena run --config ./arena.toml
|
|
58
|
+
lythoskill-arena run --config ./arena.toml --dry-run
|
|
59
|
+
|
|
60
|
+
# CLI-flag mode (backward compat)
|
|
61
|
+
lythoskill-arena run --task ./TASK-arena.md --players ./players/claude.toml --decks ./decks/run-01.toml,./decks/run-02.toml --criteria coverage,relevance
|
|
62
|
+
|
|
63
|
+
# Legacy scaffolding
|
|
64
|
+
lythoskill-arena scaffold --task "Refactor auth module" --skills skill-a,skill-b
|
|
65
|
+
lythoskill-arena viz runs/arena-20260504
|
|
49
66
|
`)
|
|
50
67
|
}
|
|
51
68
|
|
|
@@ -551,6 +568,67 @@ function runViz(argv: string[]) {
|
|
|
551
568
|
console.log(renderRadarChart(report))
|
|
552
569
|
}
|
|
553
570
|
|
|
571
|
+
// ── Run: programmatic arena execution ───────────────────────
|
|
572
|
+
|
|
573
|
+
/**
 * `run` subcommand: execute an arena either from a declarative arena.toml
 * (`--config`) or from individual CLI flags.
 *
 * Declarative mode reads and validates the config, then delegates to
 * `runArenaFromToml`; with `--dry-run` only the execution plan is printed.
 * Flag mode requires `--task` and `--decks` and delegates to `runArena` from
 * `./runner`.
 *
 * `--dry-run` is only honoured together with `--config`. Without a config the
 * command now exits with an error instead of silently performing a real run.
 *
 * @param argv arguments following the `run` subcommand
 */
async function runProgrammaticArena(argv: string[]) {
  const { options } = parseArgs(argv)
  // `config` is looked up through a loose view of the options, as before.
  const flags = options as Record<string, string | undefined>
  const configPath = flags.config
  const dryRun = argv.includes('--dry-run')

  // Success banner shared by both modes.
  const reportCompletion = (id: unknown, artifactsDir: unknown) => {
    console.log(`\n🎮 Arena complete: ${id}`)
    console.log(`📁 Artifacts: ${artifactsDir}`)
    console.log(`📊 Report: ${artifactsDir}/report.md`)
  }

  if (configPath) {
    // arena.toml declarative mode
    const { readFileSync } = await import('node:fs')
    const { parseArenaToml } = await import('./arena-toml')
    const { runArenaFromToml } = await import('./runner')

    const toml = parseArenaToml(readFileSync(configPath, 'utf-8'))
    // A task that looks like a path wins; otherwise --task may override the
    // inline task text from the config.
    const taskLooksLikePath = toml.arena.task.startsWith('/') || toml.arena.task.startsWith('./')
    const result = await runArenaFromToml({
      toml,
      taskPath: taskLooksLikePath ? toml.arena.task : flags.task ?? toml.arena.task,
      outDir: flags.out,
      dryRun,
    })

    if ('plan' in result) {
      // dry-run
      console.log(`\n📋 Dry-run: ${result.plan.total_runs} cells across ${result.plan.cells.length / Math.max(1, toml.arena.runs_per_side)} sides × ${toml.arena.runs_per_side} runs`)
      for (const cell of result.plan.cells) {
        console.log(`  ${cell.side}/run-${cell.run}: ${cell.player} × ${cell.deck}${cell.control ? ' [control]' : ''}`)
      }
      return
    }

    reportCompletion(result.manifest.id, result.artifactsDir)
    return
  }

  // Flag mode has no plan-only path, so refuse rather than run for real.
  if (dryRun) {
    console.error('❌ --dry-run is only supported together with --config <arena.toml>')
    process.exit(1)
  }

  // CLI-flag mode (backward compat)
  if (!options.task || !options.decks) {
    console.error('❌ --task <path> and --decks <list> are required for "run" (or use --config <arena.toml>)')
    process.exit(1)
  }

  const { runArena: runArenaProgrammatic } = await import('./runner')
  const toList = (csv: string) => csv.split(',').map(s => s.trim()).filter(Boolean)

  const result = await runArenaProgrammatic({
    taskPath: options.task,
    playerPaths: toList(options.players ?? 'players/claude-code.toml'),
    deckPaths: toList(options.decks),
    criteria: toList(options.criteria ?? 'syntax,context,logic,token'),
    outDir: options.out ?? `runs/arena-${timestamp()}`,
  })

  reportCompletion(result.manifest.id, result.artifactsDir)
}
|
|
631
|
+
|
|
554
632
|
// ── Main Entry ───────────────────────────────────────────────
|
|
555
633
|
|
|
556
634
|
if (import.meta.main) {
|
|
@@ -559,7 +637,14 @@ if (import.meta.main) {
|
|
|
559
637
|
|
|
560
638
|
if (cmd === 'viz') {
|
|
561
639
|
runViz(args.slice(1))
|
|
640
|
+
} else if (cmd === 'run') {
|
|
641
|
+
runProgrammaticArena(args.slice(1))
|
|
642
|
+
} else if (cmd === 'scaffold' || !cmd || args[0]?.startsWith('-')) {
|
|
643
|
+
// Legacy behavior: if no subcommand or starts with flags, treat as scaffold
|
|
644
|
+
runArena(cmd === 'scaffold' ? args.slice(1) : args)
|
|
562
645
|
} else {
|
|
563
|
-
|
|
646
|
+
console.error(`❌ Unknown command: ${cmd}`)
|
|
647
|
+
printHelp()
|
|
648
|
+
process.exit(1)
|
|
564
649
|
}
|
|
565
650
|
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { describe, test, expect } from 'bun:test'
|
|
2
|
+
import { computePareto } from './comparative-judge'
|
|
3
|
+
|
|
4
|
+
describe('computePareto', () => {
|
|
5
|
+
test('single participant is always non-dominated', () => {
|
|
6
|
+
const result = computePareto([
|
|
7
|
+
{ participant_id: 'run-01', scores: { a: 5, b: 3 } },
|
|
8
|
+
])
|
|
9
|
+
expect(result).toHaveLength(1)
|
|
10
|
+
expect(result[0].dominated).toBe(false)
|
|
11
|
+
expect(result[0].dominated_by).toEqual([])
|
|
12
|
+
})
|
|
13
|
+
|
|
14
|
+
test('clear dominance: run-01 dominates run-02 on all criteria', () => {
|
|
15
|
+
const result = computePareto([
|
|
16
|
+
{ participant_id: 'run-01', scores: { coverage: 5, relevance: 5 } },
|
|
17
|
+
{ participant_id: 'run-02', scores: { coverage: 3, relevance: 2 } },
|
|
18
|
+
])
|
|
19
|
+
expect(result[0].dominated).toBe(false)
|
|
20
|
+
expect(result[1].dominated).toBe(true)
|
|
21
|
+
expect(result[1].dominated_by).toEqual(['run-01'])
|
|
22
|
+
})
|
|
23
|
+
|
|
24
|
+
test('equal scores: no one dominates', () => {
|
|
25
|
+
const result = computePareto([
|
|
26
|
+
{ participant_id: 'run-01', scores: { a: 4, b: 4 } },
|
|
27
|
+
{ participant_id: 'run-02', scores: { a: 4, b: 4 } },
|
|
28
|
+
])
|
|
29
|
+
expect(result[0].dominated).toBe(false)
|
|
30
|
+
expect(result[1].dominated).toBe(false)
|
|
31
|
+
})
|
|
32
|
+
|
|
33
|
+
test('cross dominance: each wins on different criteria', () => {
|
|
34
|
+
const result = computePareto([
|
|
35
|
+
{ participant_id: 'run-01', scores: { speed: 5, accuracy: 2 } },
|
|
36
|
+
{ participant_id: 'run-02', scores: { speed: 2, accuracy: 5 } },
|
|
37
|
+
])
|
|
38
|
+
// Neither dominates: run-01 better on speed but worse on accuracy
|
|
39
|
+
expect(result[0].dominated).toBe(false)
|
|
40
|
+
expect(result[1].dominated).toBe(false)
|
|
41
|
+
})
|
|
42
|
+
|
|
43
|
+
test('multi-participant: transitive dominance chain', () => {
|
|
44
|
+
const result = computePareto([
|
|
45
|
+
{ participant_id: 'best', scores: { a: 5, b: 5, c: 5 } },
|
|
46
|
+
{ participant_id: 'mid', scores: { a: 4, b: 4, c: 4 } },
|
|
47
|
+
{ participant_id: 'worst', scores: { a: 2, b: 2, c: 2 } },
|
|
48
|
+
])
|
|
49
|
+
// best dominates both, mid dominates worst
|
|
50
|
+
expect(result[0].dominated).toBe(false) // best
|
|
51
|
+
expect(result[1].dominated).toBe(true) // mid (by best)
|
|
52
|
+
expect(result[1].dominated_by).toEqual(['best'])
|
|
53
|
+
expect(result[2].dominated).toBe(true) // worst (by both)
|
|
54
|
+
expect(result[2].dominated_by.sort()).toEqual(['best', 'mid'].sort())
|
|
55
|
+
})
|
|
56
|
+
|
|
57
|
+
test('Pareto frontier from playground BDD-research: run-01 dominates run-02', () => {
|
|
58
|
+
// From playground/arena-bdd-research/report.md:
|
|
59
|
+
// Run-01: coverage=5, relevance=5, actionability=5, depth=5
|
|
60
|
+
// Run-02: coverage=3, relevance=2, actionability=2, depth=1
|
|
61
|
+
const result = computePareto([
|
|
62
|
+
{ participant_id: 'run-01', scores: { coverage: 5, relevance: 5, actionability: 5, depth: 5 } },
|
|
63
|
+
{ participant_id: 'run-02', scores: { coverage: 3, relevance: 2, actionability: 2, depth: 1 } },
|
|
64
|
+
])
|
|
65
|
+
expect(result[0].dominated).toBe(false) // run-01: Pareto-optimal
|
|
66
|
+
expect(result[1].dominated).toBe(true) // run-02: dominated by run-01
|
|
67
|
+
expect(result[1].dominated_by).toEqual(['run-01'])
|
|
68
|
+
})
|
|
69
|
+
|
|
70
|
+
test('empty scores object', () => {
|
|
71
|
+
const result = computePareto([
|
|
72
|
+
{ participant_id: 'a', scores: {} },
|
|
73
|
+
{ participant_id: 'b', scores: {} },
|
|
74
|
+
])
|
|
75
|
+
expect(result).toHaveLength(2)
|
|
76
|
+
expect(result[0].dominated).toBe(false)
|
|
77
|
+
expect(result[1].dominated).toBe(false)
|
|
78
|
+
})
|
|
79
|
+
|
|
80
|
+
test('partial criteria overlap', () => {
|
|
81
|
+
const result = computePareto([
|
|
82
|
+
{ participant_id: 'run-01', scores: { a: 5, b: 3 } },
|
|
83
|
+
{ participant_id: 'run-02', scores: { a: 3, c: 5 } },
|
|
84
|
+
])
|
|
85
|
+
// run-01 has a=5 vs run-02 a=3 (a wins)
|
|
86
|
+
// run-02 has b=undefined vs run-01 b=3 → treated as 0. So run-01 >= run-02 on all shared crit, > on one.
|
|
87
|
+
// But c: run-01 has 0, run-02 has 5. So run-02 > run-01 on c.
|
|
88
|
+
// Cross-dominance → neither dominates
|
|
89
|
+
expect(result[0].dominated).toBe(false)
|
|
90
|
+
expect(result[1].dominated).toBe(false)
|
|
91
|
+
})
|
|
92
|
+
})
|