@lythos/skill-arena 0.11.2 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -49,20 +49,20 @@ Note: Claude `-p` mode has known issues with web tools in Bun.spawn (deferred to
49
49
  ```bash
50
50
  bun add -d @lythos/skill-arena
51
51
  # or use directly
52
- bunx @lythos/skill-arena@0.11.2 <command>
52
+ bunx @lythos/skill-arena@0.12.0 <command>
53
53
  ```
54
54
 
55
55
  ## Quick Start
56
56
 
57
57
  ```bash
58
58
  # Single: test a deck with one agent
59
- bunx @lythos/skill-arena@0.11.2 single \
59
+ bunx @lythos/skill-arena@0.12.0 single \
60
60
  --deck https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml \
61
61
  --brief "Generate auth flow diagram"
62
62
 
63
63
  # Vs: compare multiple decks side by side
64
64
  curl -fsSL https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/arena/research-compare/arena.toml > arena.toml
65
- bunx @lythos/skill-arena@0.11.2 vs --config ./arena.toml
65
+ bunx @lythos/skill-arena@0.12.0 vs --config ./arena.toml
66
66
  ```
67
67
 
68
68
  ## Commands
@@ -71,23 +71,23 @@ bunx @lythos/skill-arena@0.11.2 vs --config ./arena.toml
71
71
 
72
72
  ```bash
73
73
  # Print execution plan without running
74
- bunx @lythos/skill-arena@0.11.2 vs --config arena.toml --dry-run
74
+ bunx @lythos/skill-arena@0.12.0 vs --config arena.toml --dry-run
75
75
 
76
76
  # Execute with per-side runs_per_side and statistical aggregation
77
- bunx @lythos/skill-arena@0.11.2 vs --config arena.toml
77
+ bunx @lythos/skill-arena@0.12.0 vs --config arena.toml
78
78
  ```
79
79
 
80
80
  ### Scaffold mode (legacy, manual execution)
81
81
 
82
82
  ```
83
- bunx @lythos/skill-arena@0.11.2 scaffold --task "Generate auth flow diagram" \
83
+ bunx @lythos/skill-arena@0.12.0 scaffold --task "Generate auth flow diagram" \
84
84
  --decks https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/scout.toml,https://raw.githubusercontent.com/lythos-labs/lythoskill/main/examples/decks/documents.toml
85
85
  ```
86
86
 
87
87
  ### Viz
88
88
 
89
89
  ```bash
90
- bunx @lythos/skill-arena@0.11.2 viz runs/arena-<id>/
90
+ bunx @lythos/skill-arena@0.12.0 viz runs/arena-<id>/
91
91
  ```
92
92
 
93
93
  ## Skill Documentation
@@ -101,7 +101,7 @@ The agent-visible **Skill** layer documentation is here:
101
101
  Part of the [lythoskill](https://github.com/lythos-labs/lythoskill) ecosystem — the thin-skill pattern separates heavy logic (this npm package) from lightweight agent instructions (SKILL.md).
102
102
 
103
103
  ```
104
- Starter (this package) → npm publish → bunx @lythos/skill-arena@0.11.2 ...
104
+ Starter (this package) → npm publish → bunx @lythos/skill-arena@0.12.0 ...
105
105
  Skill (packages/<name>/skill/) → build → SKILL.md + thin scripts
106
106
  Output (skills/<name>/) → git commit → agent-visible skill
107
107
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lythos/skill-arena",
3
- "version": "0.11.2",
3
+ "version": "0.12.0",
4
4
  "description": "Skill Arena — benchmark skill effectiveness with controlled-variable comparison",
5
5
  "keywords": [
6
6
  "ai-agent",
@@ -42,13 +42,13 @@
42
42
  "bun": ">=1.0.0"
43
43
  },
44
44
  "dependencies": {
45
- "@lythos/cold-pool": "^0.11.2",
46
- "@lythos/infra": "^0.11.2",
47
- "@lythos/test-utils": "^0.11.2",
45
+ "@lythos/cold-pool": "^0.12.0",
46
+ "@lythos/infra": "^0.12.0",
47
+ "@lythos/test-utils": "^0.12.0",
48
48
  "zod": "^3.24.0",
49
49
  "zod-to-json-schema": "^3.25.2"
50
50
  },
51
51
  "optionalDependencies": {
52
- "@lythos/agent-adapter-claude-sdk": "^0.11.2"
52
+ "@lythos/agent-adapter-claude-sdk": "^0.12.0"
53
53
  }
54
54
  }
@@ -18,6 +18,22 @@ player = "claude-code"
18
18
  deck = "./decks/b.toml"
19
19
  `
20
20
 
21
+ const judgeToml = `
22
+ [arena]
23
+ task = "Test task"
24
+ judge = "Evaluate completeness and correctness. Return JSON."
25
+
26
+ [[side]]
27
+ name = "runner-a"
28
+ player = "claude-code"
29
+ deck = "./decks/a.toml"
30
+
31
+ [[side]]
32
+ name = "runner-b"
33
+ player = "claude-code"
34
+ deck = "./decks/b.toml"
35
+ `
36
+
21
37
  const fullToml = `
22
38
  [arena]
23
39
  task = "Generate auth flow diagram"
@@ -46,19 +62,24 @@ pre_run = ["npm ci", "npm run build"]
46
62
  working_dir = "/workspace"
47
63
  `
48
64
 
49
- // ── Schema + Parser ────────────────────────────────────────────────────────
50
-
51
65
  describe('parseArenaToml', () => {
52
- test('parses minimal two-side arena', () => {
66
+ test('parses minimal two-side arena with criteria', () => {
53
67
  const result = parseArenaToml(minimalToml)
54
68
  expect(result.arena.task).toBe('Test task')
55
69
  expect(result.arena.criteria).toEqual(['a', 'b'])
56
- expect(result.arena.runs_per_side).toBe(1) // default
70
+ expect(result.arena.runs_per_side).toBe(1)
57
71
  expect(result.side).toHaveLength(2)
58
72
  expect(result.side[0].name).toBe('runner-a')
59
73
  expect(result.side[0].player).toBe('claude-code')
60
74
  expect(result.side[0].deck).toBe('./decks/a.toml')
61
- expect(result.side[0].control).toBe(false) // default
75
+ expect(result.side[0].control).toBe(false)
76
+ })
77
+
78
+ test('parses arena with judge field (preferred over criteria)', () => {
79
+ const result = parseArenaToml(judgeToml)
80
+ expect(result.arena.judge).toContain('Evaluate completeness')
81
+ expect(result.arena.criteria).toBeUndefined()
82
+ expect(result.side).toHaveLength(2)
62
83
  })
63
84
 
64
85
  test('parses full arena with runs_per_side and control', () => {
@@ -83,7 +104,17 @@ describe('parseArenaToml', () => {
83
104
  expect(() => parseArenaToml(bad)).toThrow()
84
105
  })
85
106
 
86
- test('rejects empty criteria', () => {
107
+ test('rejects neither judge nor criteria provided', () => {
108
+ const bad = `[arena]\ntask = "x"\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
109
+ expect(() => parseArenaToml(bad)).toThrow()
110
+ })
111
+
112
+ test('accepts judge without criteria (either is sufficient)', () => {
113
+ const toml = `[arena]\ntask = "x"\njudge = "Evaluate this."\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
114
+ expect(() => parseArenaToml(toml)).not.toThrow()
115
+ })
116
+
117
+ test('rejects empty criteria and no judge', () => {
87
118
  const bad = `[arena]\ntask = "x"\ncriteria = []\n\n[[side]]\nname = "a"\nplayer = "c"\ndeck = "a.toml"\n\n[[side]]\nname = "b"\nplayer = "c"\ndeck = "b.toml"`
88
119
  expect(() => parseArenaToml(bad)).toThrow()
89
120
  })
@@ -115,18 +146,20 @@ describe('parseArenaToml', () => {
115
146
  })
116
147
  })
117
148
 
118
- // ── Execution Plan ─────────────────────────────────────────────────────────
119
-
120
149
  describe('buildExecutionPlan', () => {
121
150
  test('generates plan: 2 sides × 1 run = 2 cells', () => {
122
151
  const toml = parseArenaToml(minimalToml)
123
152
  const plan = buildExecutionPlan(toml)
124
153
  expect(plan.task).toBe('Test task')
125
- expect(plan.criteria).toEqual(['a', 'b'])
154
+ expect(plan.judge).toBeNull()
126
155
  expect(plan.cells).toHaveLength(2)
127
156
  expect(plan.total_runs).toBe(2)
128
- expect(plan.cells[0]).toEqual({ side: 'runner-a', player: 'claude-code', deck: './decks/a.toml', run: 1, control: false })
129
- expect(plan.cells[1]).toEqual({ side: 'runner-b', player: 'claude-code', deck: './decks/b.toml', run: 1, control: false })
157
+ })
158
+
159
+ test('generates plan with judge field populated', () => {
160
+ const toml = parseArenaToml(judgeToml)
161
+ const plan = buildExecutionPlan(toml)
162
+ expect(plan.judge).toContain('Evaluate completeness')
130
163
  })
131
164
 
132
165
  test('generates plan: 3 sides × 3 runs = 9 cells', () => {
@@ -134,13 +167,6 @@ describe('buildExecutionPlan', () => {
134
167
  const plan = buildExecutionPlan(toml)
135
168
  expect(plan.cells).toHaveLength(9)
136
169
  expect(plan.total_runs).toBe(9)
137
-
138
- // Cells are ordered: side 0 run 1, side 0 run 2, side 0 run 3, side 1 run 1, ...
139
- expect(plan.cells[0]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 1, control: false })
140
- expect(plan.cells[1]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 2, control: false })
141
- expect(plan.cells[2]).toEqual({ side: 'minimal', player: 'standard-coder', deck: './decks/minimal.toml', run: 3, control: false })
142
- expect(plan.cells[3]).toEqual({ side: 'rich', player: 'expert-architect', deck: './decks/rich.toml', run: 1, control: false })
143
- expect(plan.cells[8]).toEqual({ side: 'baseline', player: 'standard-coder', deck: './decks/baseline.toml', run: 3, control: true })
144
170
  })
145
171
 
146
172
  test('control flag preserved in plan cells', () => {
@@ -151,37 +177,9 @@ describe('buildExecutionPlan', () => {
151
177
  expect(baselineCells.every(c => c.control)).toBe(true)
152
178
  })
153
179
 
154
- test('dry-run output format matches expected log', () => {
155
- const toml = parseArenaToml(minimalToml)
156
- const plan = buildExecutionPlan(toml)
157
-
158
- // Simulate what --dry-run would log
159
- const logs: string[] = []
160
- for (const line of formatPlanOutput(plan)) {
161
- logs.push(line)
162
- }
163
-
164
- expect(logs.some(l => l.includes('2 cells'))).toBe(true)
165
- expect(logs.some(l => l.includes('runner-a'))).toBe(true)
166
- expect(logs.some(l => l.includes('runner-b'))).toBe(true)
167
- expect(logs.some(l => l.includes('claude-code'))).toBe(true)
168
- expect(logs.every(l => !l.includes('control'))).toBe(true) // no control flags in minimal
169
- })
170
-
171
- test('dry-run output shows control flag for control sides', () => {
172
- const toml = parseArenaToml(fullToml)
173
- const plan = buildExecutionPlan(toml)
174
- const lines = formatPlanOutput(plan)
175
- const baselineLines = lines.filter(l => l.includes('baseline'))
176
- // All baseline cells should have [control] flag
177
- expect(baselineLines.every(l => l.includes('[control]'))).toBe(true)
178
- })
179
-
180
180
  test('dry-run: plan is pure data, no side effects', () => {
181
- // The entire plan generation is a pure function — dry-run is just printing it
182
181
  const toml = parseArenaToml(fullToml)
183
182
  const plan = buildExecutionPlan(toml)
184
- // Verify plan is self-describing for a --dry-run output
185
183
  expect(plan.total_runs).toBeGreaterThan(0)
186
184
  expect(plan.cells.every(c => typeof c.side === 'string')).toBe(true)
187
185
  expect(plan.cells.every(c => typeof c.player === 'string')).toBe(true)
package/src/arena-toml.ts CHANGED
@@ -23,14 +23,21 @@ export type Side = z.infer<typeof Side>
23
23
 
24
24
  export const ArenaToml = z.object({
25
25
  arena: z.object({
26
- task: z.string(), // task description or path to TASK-arena.md
27
- criteria: z.array(z.string()).min(1),
26
+ task: z.string(), // task description or path to TASK.agent.md
27
+ // judge: path to judge.md file (preferred) or inline natural-language criteria text.
28
+ // When present, readFileSync + pass as JudgeInput.criteria directly — no parsing.
29
+ // If absent, fall back to criteria string[] (legacy, each string becomes a bullet).
30
+ judge: z.string().optional().describe('Path to judge.md (natural-language criteria for the judge LLM) or inline criteria text. No parsing — passed directly as JudgeInput.criteria.'),
31
+ criteria: z.array(z.string()).optional().describe('Legacy string criteria. Each becomes a bullet in generated judge prompt. Use judge for full natural-language criteria.'),
28
32
  runs_per_side: z.number().int().positive().default(1),
29
33
  max_participants: z.number().int().min(2).max(5).default(5),
30
34
  model: z.string().optional(), // e.g. "claude-sonnet-4-6"
31
35
  endpoint: z.string().optional(), // e.g. "api.anthropic.com"
32
36
  notes: z.string().optional(), // freeform reproducibility notes
33
- }),
37
+ }).refine(
38
+ data => !!(data.judge || (data.criteria && data.criteria.length > 0)),
39
+ { message: 'At least one of arena.judge or arena.criteria must be provided' }
40
+ ),
34
41
  side: z.array(Side).min(2).max(5),
35
42
  })
36
43
  export type ArenaToml = z.infer<typeof ArenaToml>
@@ -38,7 +45,6 @@ export type ArenaToml = z.infer<typeof ArenaToml>
38
45
  // ── Parser ─────────────────────────────────────────────────────────────────
39
46
 
40
47
  export function parseArenaToml(content: string): ArenaToml {
41
- // Simple inline TOML parser for arena.toml (no external dep needed for this subset)
42
48
  const parsed = parseToml(content)
43
49
  return ArenaToml.parse(parsed)
44
50
  }
@@ -55,7 +61,7 @@ export interface ExecutionCell {
55
61
 
56
62
  export interface ExecutionPlan {
57
63
  task: string
58
- criteria: string[]
64
+ judge: string | null // resolved judge text (from judge.md or inline)
59
65
  cells: ExecutionCell[]
60
66
  total_runs: number
61
67
  }
@@ -75,7 +81,7 @@ export function buildExecutionPlan(toml: ArenaToml): ExecutionPlan {
75
81
  }
76
82
  return {
77
83
  task: toml.arena.task,
78
- criteria: toml.arena.criteria,
84
+ judge: toml.arena.judge ?? null,
79
85
  cells,
80
86
  total_runs: cells.length,
81
87
  }
@@ -108,7 +114,6 @@ function parseToml(text: string): Record<string, unknown> {
108
114
  const sectionMatch = line.match(/^\[(.+?)\]$/)
109
115
  if (sectionMatch) {
110
116
  const key = sectionMatch[1]
111
- // nested key like "side.env"
112
117
  if (key.includes('.')) {
113
118
  const [parent, child] = key.split('.')
114
119
  const parentArr = arrayTables.get(parent)
@@ -130,13 +135,8 @@ function parseToml(text: string): Record<string, unknown> {
130
135
  const key = line.slice(0, eqIdx).trim()
131
136
  let value = line.slice(eqIdx + 1).trim()
132
137
 
133
- // String value
134
138
  if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) {
135
139
  value = value.slice(1, -1)
136
- } else if (value === 'true') {
137
- value = 'true'
138
- } else if (value === 'false') {
139
- value = 'false'
140
140
  }
141
141
 
142
142
  // Array value: ["a", "b"]
@@ -166,7 +166,6 @@ function parseToml(text: string): Record<string, unknown> {
166
166
  }
167
167
  }
168
168
 
169
- // Materialize array tables into result
170
169
  for (const [key, arr] of arrayTables) {
171
170
  result[key] = arr
172
171
  }