@lythos/skill-arena 0.11.2 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/package.json +5 -5
- package/src/arena-toml.test.ts +44 -46
- package/src/arena-toml.ts +12 -13
- package/src/cli.ts +238 -667
- package/src/runner.ts +152 -183
package/README.md
CHANGED
|
@@ -49,20 +49,20 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
|
|
|
49
49
|
```bash
|
|
50
50
|
bun add -d @lythos/skill-arena
|
|
51
51
|
# or use directly
|
|
52
|
-
bunx @lythos/skill-arena@0.11.2 <command>
|
|
52
|
+
bunx @lythos/skill-arena@0.13.0 <command>
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
## Quick Start
|
|
56
56
|
|
|
57
57
|
```bash
|
|
58
58
|
# Single: test a deck with one agent
|
|
59
|
-
bunx @lythos/skill-arena@0.11.2 single \
|
|
59
|
+
bunx @lythos/skill-arena@0.13.0 single \
|
|
60
60
|
--deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
|
|
61
61
|
--brief "Generate auth flow diagram"
|
|
62
62
|
|
|
63
63
|
# Vs: compare multiple decks side by side
|
|
64
64
|
curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
|
|
65
|
-
bunx @lythos/skill-arena@0.11.2 vs --config ./arena.toml
|
|
65
|
+
bunx @lythos/skill-arena@0.13.0 vs --config ./arena.toml
|
|
66
66
|
```
|
|
67
67
|
|
|
68
68
|
## Commands
|
|
@@ -71,23 +71,23 @@ bunx @lythos/skill-arena@0.11.2 vs --config ./arena.toml
|
|
|
71
71
|
|
|
72
72
|
```bash
|
|
73
73
|
# Print execution plan without running
|
|
74
|
-
bunx @lythos/skill-arena@0.11.2 vs --config arena.toml --dry-run
|
|
74
|
+
bunx @lythos/skill-arena@0.13.0 vs --config arena.toml --dry-run
|
|
75
75
|
|
|
76
76
|
# Execute with per-side runs_per_side and statistical aggregation
|
|
77
|
-
bunx @lythos/skill-arena@0.11.2 vs --config arena.toml
|
|
77
|
+
bunx @lythos/skill-arena@0.13.0 vs --config arena.toml
|
|
78
78
|
```
|
|
79
79
|
|
|
80
80
|
### Scaffold mode (legacy, manual execution)
|
|
81
81
|
|
|
82
82
|
```
|
|
83
|
-
bunx @lythos/skill-arena@0.11.2 scaffold --task "Generate auth flow diagram" \
|
|
83
|
+
bunx @lythos/skill-arena@0.13.0 scaffold --task "Generate auth flow diagram" \
|
|
84
84
|
--decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
|
|
85
85
|
```
|
|
86
86
|
|
|
87
87
|
### Viz
|
|
88
88
|
|
|
89
89
|
```bash
|
|
90
|
-
bunx @lythos/skill-arena@0.11.2 viz runs/arena-<id>/
|
|
90
|
+
bunx @lythos/skill-arena@0.13.0 viz runs/arena-<id>/
|
|
91
91
|
```
|
|
92
92
|
|
|
93
93
|
## Skill Documentation
|
|
@@ -101,7 +101,7 @@ The agent-visible **Skill** layer documentation is here:
|
|
|
101
101
|
Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
|
|
102
102
|
|
|
103
103
|
```
|
|
104
|
-
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.11.2 ...
|
|
104
|
+
Starter (this package) → npm publish → bunx @lythos/skill-arena@0.13.0 ...
|
|
105
105
|
Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
|
|
106
106
|
Output (skills/<name>/) → git commit → agent-visible skill
|
|
107
107
|
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lythos/skill-arena",
|
|
3
|
-
"version": "0.11.2",
|
|
3
|
+
"version": "0.13.0",
|
|
4
4
|
"description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai-agent",
|
|
@@ -42,13 +42,13 @@
|
|
|
42
42
|
"bun": ">=1.0.0"
|
|
43
43
|
},
|
|
44
44
|
"dependencies": {
|
|
45
|
-
"@lythos/cold-pool": "^0.
|
|
46
|
-
"@lythos/infra": "^0.
|
|
47
|
-
"@lythos/test-utils": "^0.
|
|
45
|
+
"@lythos/cold-pool": "^0.13.0",
|
|
46
|
+
"@lythos/infra": "^0.13.0",
|
|
47
|
+
"@lythos/test-utils": "^0.13.0",
|
|
48
48
|
"zod": "^3.24.0",
|
|
49
49
|
"zod-to-json-schema": "^3.25.2"
|
|
50
50
|
},
|
|
51
51
|
"optionalDependencies": {
|
|
52
|
-
"@lythos/agent-adapter-claude-sdk": "^0.
|
|
52
|
+
"@lythos/agent-adapter-claude-sdk": "^0.13.0"
|
|
53
53
|
}
|
|
54
54
|
}
|
package/src/arena-toml.test.ts
CHANGED
|
@@ -18,6 +18,22 @@ player = "claude-code"
|
|
|
18
18
|
deck = "./decks/b.toml"
|
|
19
19
|
`
|
|
20
20
|
|
|
21
|
+
const judgeToml = `
|
|
22
|
+
[arena]
|
|
23
|
+
task = "Test task"
|
|
24
|
+
judge = "Evaluate completeness and correctness. Return JSON."
|
|
25
|
+
|
|
26
|
+
[[side]]
|
|
27
|
+
name = "runner-a"
|
|
28
|
+
player = "claude-code"
|
|
29
|
+
deck = "./decks/a.toml"
|
|
30
|
+
|
|
31
|
+
[[side]]
|
|
32
|
+
name = "runner-b"
|
|
33
|
+
player = "claude-code"
|
|
34
|
+
deck = "./decks/b.toml"
|
|
35
|
+
`
|
|
36
|
+
|
|
21
37
|
const fullToml = `
|
|
22
38
|
[arena]
|
|
23
39
|
task = "Generate auth flow diagram"
|
|
@@ -46,19 +62,24 @@ pre_run = ["npm ci", "npm run build"]
|
|
|
46
62
|
working_dir = "/workspace"
|
|
47
63
|
`
|
|
48
64
|
|
|
49
|
-
// ── Schema + Parser ────────────────────────────────────────────────────────
|
|
50
|
-
|
|
51
65
|
describe('parseArenaToml', () => {
|
|
52
|
-
test('parses minimal two-side arena', () => {
|
|
66
|
+
test('parses minimal two-side arena with criteria', () => {
|
|
53
67
|
const result = parseArenaToml(minimalToml)
|
|
54
68
|
expect(result.arena.task).toBe('Test task')
|
|
55
69
|
expect(result.arena.criteria).toEqual(['a', 'b'])
|
|
56
|
-
expect(result.arena.runs_per_side).toBe(1)
|
|
70
|
+
expect(result.arena.runs_per_side).toBe(1)
|
|
57
71
|
expect(result.side).toHaveLength(2)
|
|
58
72
|
expect(result.side[0].name).toBe('runner-a')
|
|
59
73
|
expect(result.side[0].player).toBe('claude-code')
|
|
60
74
|
expect(result.side[0].deck).toBe('./decks/a.toml')
|
|
61
|
-
expect(result.side[0].control).toBe(false)
|
|
75
|
+
expect(result.side[0].control).toBe(false)
|
|
76
|
+
})
|
|
77
|
+
|
|
78
|
+
test('parses arena with judge field (preferred over criteria)', () => {
|
|
79
|
+
const result = parseArenaToml(judgeToml)
|
|
80
|
+
expect(result.arena.judge).toContain('Evaluate completeness')
|
|
81
|
+
expect(result.arena.criteria).toBeUndefined()
|
|
82
|
+
expect(result.side).toHaveLength(2)
|
|
62
83
|
})
|
|
63
84
|
|
|
64
85
|
test('parses full arena with runs_per_side and control', () => {
|
|
@@ -83,7 +104,17 @@ describe('parseArenaToml', () => {
|
|
|
83
104
|
expect(() => parseArenaToml(bad)).toThrow()
|
|
84
105
|
})
|
|
85
106
|
|
|
86
|
-
test('rejects
|
|
107
|
+
test('rejects neither judge nor criteria provided', () => {
|
|
108
|
+
const bad = `[arena]\ntask = "x"\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
109
|
+
expect(() => parseArenaToml(bad)).toThrow()
|
|
110
|
+
})
|
|
111
|
+
|
|
112
|
+
test('accepts judge without criteria (either is sufficient)', () => {
|
|
113
|
+
const toml = `[arena]\ntask = "x"\njudge = "Evaluate this."\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
114
|
+
expect(() => parseArenaToml(toml)).not.toThrow()
|
|
115
|
+
})
|
|
116
|
+
|
|
117
|
+
test('rejects empty criteria and no judge', () => {
|
|
87
118
|
const bad = `[arena]\ntask = "x"\ncriteria = []\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
|
|
88
119
|
expect(() => parseArenaToml(bad)).toThrow()
|
|
89
120
|
})
|
|
@@ -115,18 +146,20 @@ describe('parseArenaToml', () => {
|
|
|
115
146
|
})
|
|
116
147
|
})
|
|
117
148
|
|
|
118
|
-
// ── Execution Plan ─────────────────────────────────────────────────────────
|
|
119
|
-
|
|
120
149
|
describe('buildExecutionPlan', () => {
|
|
121
150
|
test('generates plan: 2 sides × 1 run = 2 cells', () => {
|
|
122
151
|
const toml = parseArenaToml(minimalToml)
|
|
123
152
|
const plan = buildExecutionPlan(toml)
|
|
124
153
|
expect(plan.task).toBe('Test task')
|
|
125
|
-
expect(plan.
|
|
154
|
+
expect(plan.judge).toBeNull()
|
|
126
155
|
expect(plan.cells).toHaveLength(2)
|
|
127
156
|
expect(plan.total_runs).toBe(2)
|
|
128
|
-
|
|
129
|
-
|
|
157
|
+
})
|
|
158
|
+
|
|
159
|
+
test('generates plan with judge field populated', () => {
|
|
160
|
+
const toml = parseArenaToml(judgeToml)
|
|
161
|
+
const plan = buildExecutionPlan(toml)
|
|
162
|
+
expect(plan.judge).toContain('Evaluate completeness')
|
|
130
163
|
})
|
|
131
164
|
|
|
132
165
|
test('generates plan: 3 sides × 3 runs = 9 cells', () => {
|
|
@@ -134,13 +167,6 @@ describe('buildExecutionPlan', () => {
|
|
|
134
167
|
const plan = buildExecutionPlan(toml)
|
|
135
168
|
expect(plan.cells).toHaveLength(9)
|
|
136
169
|
expect(plan.total_runs).toBe(9)
|
|
137
|
-
|
|
138
|
-
// Cells are ordered: side 0 run 1, side 0 run 2, side 0 run 3, side 1 run 1, ...
|
|
139
|
-
expect(plan.cells[0]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 1, control: false })
|
|
140
|
-
expect(plan.cells[1]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 2, control: false })
|
|
141
|
-
expect(plan.cells[2]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 3, control: false })
|
|
142
|
-
expect(plan.cells[3]).toEqual({ side: 'rich', player: 'expert-architect', deck: './decks/rich.toml', run: 1, control: false })
|
|
143
|
-
expect(plan.cells[8]).toEqual({ side: 'baseline', player: 'standard-coder', deck: './decks/baseline.toml', run: 3, control: true })
|
|
144
170
|
})
|
|
145
171
|
|
|
146
172
|
test('control flag preserved in plan cells', () => {
|
|
@@ -151,37 +177,9 @@ describe('buildExecutionPlan', () => {
|
|
|
151
177
|
expect(baselineCells.every(c => c.control)).toBe(true)
|
|
152
178
|
})
|
|
153
179
|
|
|
154
|
-
test('dry-run output format matches expected log', () => {
|
|
155
|
-
const toml = parseArenaToml(minimalToml)
|
|
156
|
-
const plan = buildExecutionPlan(toml)
|
|
157
|
-
|
|
158
|
-
// Simulate what --dry-run would log
|
|
159
|
-
const logs: string[] = []
|
|
160
|
-
for (const line of formatPlanOutput(plan)) {
|
|
161
|
-
logs.push(line)
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
expect(logs.some(l => l.includes('2 cells'))).toBe(true)
|
|
165
|
-
expect(logs.some(l => l.includes('runner-a'))).toBe(true)
|
|
166
|
-
expect(logs.some(l => l.includes('runner-b'))).toBe(true)
|
|
167
|
-
expect(logs.some(l => l.includes('claude-code'))).toBe(true)
|
|
168
|
-
expect(logs.every(l => !l.includes('control'))).toBe(true) // no control flags in minimal
|
|
169
|
-
})
|
|
170
|
-
|
|
171
|
-
test('dry-run output shows control flag for control sides', () => {
|
|
172
|
-
const toml = parseArenaToml(fullToml)
|
|
173
|
-
const plan = buildExecutionPlan(toml)
|
|
174
|
-
const lines = formatPlanOutput(plan)
|
|
175
|
-
const baselineLines = lines.filter(l => l.includes('baseline'))
|
|
176
|
-
// All baseline cells should have [control] flag
|
|
177
|
-
expect(baselineLines.every(l => l.includes('[control]'))).toBe(true)
|
|
178
|
-
})
|
|
179
|
-
|
|
180
180
|
test('dry-run: plan is pure data, no side effects', () => {
|
|
181
|
-
// The entire plan generation is a pure function — dry-run is just printing it
|
|
182
181
|
const toml = parseArenaToml(fullToml)
|
|
183
182
|
const plan = buildExecutionPlan(toml)
|
|
184
|
-
// Verify plan is self-describing for a --dry-run output
|
|
185
183
|
expect(plan.total_runs).toBeGreaterThan(0)
|
|
186
184
|
expect(plan.cells.every(c => typeof c.side === 'string')).toBe(true)
|
|
187
185
|
expect(plan.cells.every(c => typeof c.player === 'string')).toBe(true)
|
package/src/arena-toml.ts
CHANGED
|
@@ -23,14 +23,21 @@ export type Side = z.infer<typeof Side>
|
|
|
23
23
|
|
|
24
24
|
export const ArenaToml = z.object({
|
|
25
25
|
arena: z.object({
|
|
26
|
-
task: z.string(), // task description or path to TASK
|
|
27
|
-
|
|
26
|
+
task: z.string(), // task description or path to TASK.agent.md
|
|
27
|
+
// judge: path to judge.md file (preferred) or inline natural-language criteria text.
|
|
28
|
+
// When present, readFileSync + pass as JudgeInput.criteria directly — no parsing.
|
|
29
|
+
// If absent, fall back to criteria string[] (legacy, each string becomes a bullet).
|
|
30
|
+
judge: z.string().optional().describe('Path to judge.md (natural-language criteria for the judge LLM) or inline criteria text. No parsing — passed directly as JudgeInput.criteria.'),
|
|
31
|
+
criteria: z.array(z.string()).optional().describe('Legacy string criteria. Each becomes a bullet in generated judge prompt. Use judge for full natural-language criteria.'),
|
|
28
32
|
runs_per_side: z.number().int().positive().default(1),
|
|
29
33
|
max_participants: z.number().int().min(2).max(5).default(5),
|
|
30
34
|
model: z.string().optional(), // e.g. "claude-sonnet-4-6"
|
|
31
35
|
endpoint: z.string().optional(), // e.g. "api.anthropic.com"
|
|
32
36
|
notes: z.string().optional(), // freeform reproducibility notes
|
|
33
|
-
})
|
|
37
|
+
}).refine(
|
|
38
|
+
data => !!(data.judge || (data.criteria && data.criteria.length > 0)),
|
|
39
|
+
{ message: 'At least one of arena.judge or arena.criteria must be provided' }
|
|
40
|
+
),
|
|
34
41
|
side: z.array(Side).min(2).max(5),
|
|
35
42
|
})
|
|
36
43
|
export type ArenaToml = z.infer<typeof ArenaToml>
|
|
@@ -38,7 +45,6 @@ export type ArenaToml = z.infer<typeof ArenaToml>
|
|
|
38
45
|
// ── Parser ─────────────────────────────────────────────────────────────────
|
|
39
46
|
|
|
40
47
|
export function parseArenaToml(content: string): ArenaToml {
|
|
41
|
-
// Simple inline TOML parser for arena.toml (no external dep needed for this subset)
|
|
42
48
|
const parsed = parseToml(content)
|
|
43
49
|
return ArenaToml.parse(parsed)
|
|
44
50
|
}
|
|
@@ -55,7 +61,7 @@ export interface ExecutionCell {
|
|
|
55
61
|
|
|
56
62
|
export interface ExecutionPlan {
|
|
57
63
|
task: string
|
|
58
|
-
|
|
64
|
+
judge: string | null // resolved judge text (from judge.md or inline)
|
|
59
65
|
cells: ExecutionCell[]
|
|
60
66
|
total_runs: number
|
|
61
67
|
}
|
|
@@ -75,7 +81,7 @@ export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
|
|
|
75
81
|
}
|
|
76
82
|
return {
|
|
77
83
|
task: toml.arena.task,
|
|
78
|
-
|
|
84
|
+
judge: toml.arena.judge ?? null,
|
|
79
85
|
cells,
|
|
80
86
|
total_runs: cells.length,
|
|
81
87
|
}
|
|
@@ -108,7 +114,6 @@ function parseToml(text: string): Record<string, unknown> {
|
|
|
108
114
|
const sectionMatch = line.match(/^\[(.+?)\]$/)
|
|
109
115
|
if (sectionMatch) {
|
|
110
116
|
const key = sectionMatch[1]
|
|
111
|
-
// nested key like "side.env"
|
|
112
117
|
if (key.includes('.')) {
|
|
113
118
|
const [parent, child] = key.split('.')
|
|
114
119
|
const parentArr = arrayTables.get(parent)
|
|
@@ -130,13 +135,8 @@ function parseToml(text: string): Record<string, unknown> {
|
|
|
130
135
|
const key = line.slice(0, eqIdx).trim()
|
|
131
136
|
let value = line.slice(eqIdx + 1).trim()
|
|
132
137
|
|
|
133
|
-
// String value
|
|
134
138
|
if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
|
|
135
139
|
value = value.slice(1, -1)
|
|
136
|
-
} else if (value === 'true') {
|
|
137
|
-
value = 'true'
|
|
138
|
-
} else if (value === 'false') {
|
|
139
|
-
value = 'false'
|
|
140
140
|
}
|
|
141
141
|
|
|
142
142
|
// Array value: ["a", "b"]
|
|
@@ -166,7 +166,6 @@ function parseToml(text: string): Record<string, unknown> {
|
|
|
166
166
|
}
|
|
167
167
|
}
|
|
168
168
|
|
|
169
|
-
// Materialize array tables into result
|
|
170
169
|
for (const [key, arr] of arrayTables) {
|
|
171
170
|
result[key] = arr
|
|
172
171
|
}
|